#GENETICS IN R!
install.packages("poppr")
library ("poppr")


#WE will learn about the basic R packages to analyse molecular data
#Will plot some diversity indexes
#Will in addition learn some tricks to customize plots, and handle data in R

#First let's load a package for better plots
library("ggplot2")   	#better graphics

#WE ALSO NEED GENETIC PACKAGES
install.packages("adegraphics")
install.packages("adegenet", dependencies=TRUE)
library ("adegenet")   	#main genetic package

library("adegraphics")   	#graphic outputs for adegenet


#then as usual
#check if you are in the folder you want to be
getwd()   	#execute this to check

#if your working directory is not where your data is at
#This I cannot do for you, because it depends on your computer, set working directory
setwd("G:/evoleco20/prak4")		#edit this, use always "/" not "\"


##############################
####   INPUT FILE CHECK   ####
##############################

#Learn the structure of a Structure input file (a widely used format and program)
#Open the input file to check how many SNPs, samples, etc.
#arguments are
	#file name between quotation marks "lizards_SNP_input.str"
	#separator: "\t"
	#there is no headers (column names), so headers is FALSE
dataset = read.table("lizards_SNP_input.str", sep = '\t', header =FALSE)

#check it
str(dataset)
#If too long (too many SNPs), we can reduce it to see only the first 10 columns
#add to str() the argument list.len
#try with 10 variables: list.len=10
str(dataset, list.len=10)


#we need to know how many loci and how many samples we have, check the individual labels in column 1
head(dataset)
# loci are in the columns and samples in the rows
# actually we need to check how many samples and how many loci we have
#check the dimensions: rows x columns
dim(dataset)


#how many samples we have?
	#we have two rows of alleles per sample (they are diploid), so samples = rows/2 = 240/2 = 120
#how many loci we have?
	#we have 102 columns, but the first two are individual ID and population, so 100 are loci


#################################
####   INPUT FILE 1 IMPORT   ####
#################################

#Adegenet an many genetic packages for R don't use dataframes as input files, they use files that handle the allelic frequencies
#Most of them use genind format, we will need to transform our data to this format
#we need the command read.structure() in order to open the input file (the one that ends in .str)
#arguments: 
	#file name between quotation marks "lizards_SNP_input.str"
	#we don't have a row with the SNPs (genetical marker) names, so we will set row.marknames to 0 (or FALSE)
	#do we have one row per individual or two? set onerowperind to either TRUE or FALSE
	#number of individuals? n.ind=
	#number of loci? n.loc=
	#which column has the individual labels? col.lab=
	#which columns have the population names? col.pop=


#input file, open from Genepop format (a widely used format for genetics software)
rawgenind <- read.structure ("lizards_SNP_input.str", row.marknames=0, onerowperind=FALSE, n.ind=120, n.loc=100, col.lab=1, col.pop=2, NA.char = "5", ask=FALSE)

#check the new file
rawgenind


#See information that could be useful for plots or analysis
#also to check that all was imported properly

#number of individuals
nInd(rawgenind)

#names of the individuals
indNames(rawgenind)

#number of loci
nLoc(rawgenind)

#number of alleles per loci
alleles(rawgenind)

#ploidy of your samples
ploidy(rawgenind)

#population codes
pop(rawgenind)


#################################
####       PLOTS             ####
#################################


# PLOT SOME OF THE FILE INFO
# FIRST WE NEED TO CHOOSE SOME COLOURS


# you have a full list of colours in R: http://www.stat.columbia.edu/~tzheng/files/Rcolor.pdf
#have fun
mypalette <- c("black", "red1", "goldenrod1", "purple3", "darkorange3", "darkgreen", "blue", "magenta", "orange", "chartreuse3")


#Let's plot how many samples there is of each population
#we need to count how many times each pop name appears
#as we saw we can get the populations from all samples with pop()
#table() will make a table by grouping the frequency (how many times each population appears)


pop(rawgenind)
popnames <- table(pop(rawgenind))
popnames #grouped as a table


#we will use the barplot() command.
#you need to introduce the folowing arguments separated by commas (",")
	#object you want to plot: popnames
	# colour source: col=mypalette
	#you can name the x axis "Populations", use the argument for x axis labels: xlab=""
	#you can name the y axis "Sample size", use the argument for y axis labels: ylab=""


barplot(popnames, col=mypalette, xlab="Population", ylab="N")


# Now let's play around with the plot


#Default Y axis is too short, if we feel perfectionist we can fix it
#we will need a new argument for y axis limits: ylim =
#then we add a range of values, from min to max, remember we use "c" to concatenate values: c(min,max)
#instead of "min " and "max" set it from 0 to 20


#set the ylim and execute
barplot(popnames, col=mypalette, xlab="Population", ylab="Sample size", ylim =c(0,20))

#now we need to fix the population names
#we can make the font smaller with cex.names=
#default size is 1, anything bellow that will reduce the size
#try to add the argument cex.names (use a comma to separate arguments) and set it to 0.7
barplot(popnames, col=mypalette, xlab="Population", ylab="Sample size", cex.names = 0.7, ylim = c(0,20))


###########################
###  DIVERSITY INDEXES  ###
###########################


####Now let's analyse our data

# We can get some basic statistics and info from our file with "summary" (same than with ANOVA)
sum_rawgenind <- summary(rawgenind)
sum_rawgenind


# ALLELIC RICHNESS

#Check the Number of alleles per population

#you will need to plot this object:
sum_rawgenind$pop.n.all

barplot(sum_rawgenind$pop.n.all, col=mypalette, xlab="Population", ylab="Number of alleles")

#Why we are using number of alleles and not Allelic Richness (Ar)?

# Allelic richness is related to the total alleles
# the number of alleles is highly influenced by the number of samples. 
# compare with the previous plot (number of samples per populations)
# and do the correlation in excel


#try to print it to a file using "cat()" and capture.output()
capture.output(sum_rawgenind$pop.n.all) # use this as an object to print with cat() (first argument for cat)
#the rest of cat() arguments should be:
#second argument, how to separate lines of the matrix: sep=
#separate line with "new line" symbol; "\n"
#third argument, give the file to print a name: file="number_of_alleles.txt"
cat(capture.output(sum_rawgenind$pop.n.all), sep="\n", file="number_of_alleles.txt") 


#if the printed file looks good you should be able to copy paste it on excel
#it will all be pasted in the same column, but you should be able to separate it
#select column with the data, click on the "Data" tab in excel
#Text to columns > Fixed width > Next
#delimit the columns and you should have it ready, otherwise just type the values yourself on excel


#   	PRIVATE ALLELES
#######################

#Plot the ones calculated by Adegenet
pa <- private_alleles(rawgenind, report = "data.frame", form=locus~.,count.alleles = FALSE)
pa
ggplot(pa) + geom_tile(aes(x = population, y = locus, fill = count))
#most of the private alleles are in mainland, why could be that?


#   	HETEROZYGOSITY
######################

#Heterozygosity is an important information to check
#Is given by all the different packages we used
#Including the basic Adegenet summary
#Expected heterozygosity (under HW) is usefull as a way to compared the real (observed) with the ideal (expected) in all loci
plot(sum_rawgenind$Hexp, sum_rawgenind$Hobs, pch=1, cex=1, col="red1", xlim=c(0,0.6), ylim=c(0,0.6), xlab="Expected Heterozygosity", ylab="Observed Heterozygosity")
#add a line to see if the propotion is balanced, higher or lower
abline(0,1,lty=2)


#what does this mean?
#HWe violation (linkage, inbreeding, gene flow, selection, )?

#Are they significantly different?

#first check normality to see if we can use ANOVA
shapiro.test(sum_rawgenind$Hexp)
shapiro.test(sum_rawgenind$Hobs)

#well in any case those are proportions (number of heterozygous / total number) so there are more adequate tests
#  Bartlett test of homogeneity of variances
bartlett.test(list(sum_rawgenind$Hexp, sum_rawgenind$Hobs))
# Chi^2 test of independence
chisq.test(sum_rawgenind$Hexp, sum_rawgenind$Hobs, simulate.p.value=TRUE, B=99)

#yes, they are different

#another approximation to an ideal expected heterozygosity is the within population gene diversity (Hs)
# Let's plot Hs, is equivalent but NOT EQUAL to expected heterozygosity by pop
#gendind objects are tricky, we can not treat it as a table, and just plot the data inside
#Usually to calculate population-wise indexes we will use a genpop object (equivalent indeed to our input file format).

#Transform to genpop format used by lots of classical population genetics packages
rawgenpop <- genind2genpop(rawgenind)
rawgenpop
popNames(rawgenpop)
#compute Hs
GeneDiversityHet <- Hs(rawgenpop)
# plot this object with the Hs
barplot(GeneDiversityHet, col=mypalette, xlab="Population", ylab="Hs: Within population gene diversity (~He)")


#WE NEED THIS INFORMATION FOR OUR EXCEL FILE
#try to print it to a file using "cat()" and capture.output()
capture.output(GeneDiversityHet) # use this as an object to print with cat() (first argument)
#the rest of cat() argumetns should be:
#second argument: sep=
#separate line with "new line" symbol; "\n"
#third argument, name of the file to print: file=""
cat()# complete the command with the three arguments and save a file with the data

cat(capture.output(GeneDiversityHet), sep="\n", file="geneDiversityHet.txt") 


###########################################################
####    GENETIC DISTANCES and POPULATION DIFERENCES    ####
###########################################################


###   	F STATISTICS: Fis 

#Imbreeding differences per population?
#First we need to build the 95% probability confidence intervals
#boot pp fis will perform population bootstrap iterations of population Fis to calculate them
Fis <- boot.ppfis(rawgenind, quant=c(0.025,0.975),diploid=TRUE,dig=4)
Fis
str(Fis)
? boot.ppfis
?? boot.ppfis

install.packages("hierfstat")
library("hierfstat")

#There is a problem
head(Fis)
#we have the confidence intervals (lower (ll) and higher(hl)) the columns and the populations in the rows
# but we want to plot the histogram with populations in X axis (columns)
#we need to swap rows with columns, we can do this with transpose function: t()

#the information we are interested in is in the $fis.ci slot
#check
Fis$fis.ci

#transpose
FisCI <- t(Fis$fis.ci)

#great, but ANOTHER`PROBLEM: R did not take the population names
#there are just numbers
FisCI
#we actually want are the population names that we got in the input file
#we saved it as "dataset", extract the populations from dataset
#they are in the second column [,2]
dataset[,2]

#but we don't need them all, just one name from each population
#we can use unique() to extract the one unique name from that list in the same order
#use unique() to get the names only once, use unique with the second column of dataset

properpops <- unique(dataset[,2])
length(properpops)

#now we need were to input the population names
FisCI

#those will be the columm names. we can access them with the command: colnames()
#now we will replace them  with the proper names
colnames(FisCI) <- properpops

#check if it worked
FisCI


#Now we can FINALLY plot
boxplot(FisCI, boxwex=0.05, border=0, col=mypalette)


####################################
#####   	Global Fstat	   #####
####################################

#now let's calculate the genetic distances between each pair of populations

####PAIR WISE FST
#pairFst <- fstat(rawgenind, fstonly=TRUE)
#pairFst <- genet.dist(rawgenind,method="Fst")
pairFst <- genet.dist(rawgenind, method="Fst")

pairFst

# Is it high? 
# 0 = panmixia
# 1 = no historical relationship
#Fst only makes sense comparatively


# again we need to give the populations names
#lets check where should we add them
str(pairFst)

#is not a dataframe (AGAIN), this time is a "dist" object
#we need a new package in order to edit names of a "dist" object
#install.packages("usedist")
library("usedist")

#now we add them with dist_setNames()
#we need two arguments (separated per commas)
# the object with the Fsts: pairFst
# the object with proper population names we saved: properpops

FstsMatrix <- dist_setNames(pairFst,properpops)		#edit this with the two arguments
FstsMatrix

#THIS IS THE INFORMATION WE NEED FOR OUR EXCEL FILE
#try to print it to a file using "cat()" and capture.output()
capture.output(FstsMatrix) # use this as an object to print with cat() (first argument)
#the rest of cat() argumetns should be:
#second argument: sep=
#separate line with "new line" symbol; "\n"
#third argument, name of the file to print: file=""
cat()# complete the command with the three arguments and save a file with the data

cat(capture.output(FstsMatrix), sep="\n", file="FstsMatrix.txt") 


#Done
#Lets see it graphically
#a matrix of pairwise genetic distances in which the higher the value the darker the colour
table.image(FstsMatrix)

#lame... Let's give it more resolution
table.image(FstsMatrix, nclass=16)

#Which one is more different?

#To appreciate even better the genetic distances between each pair build a tree
library("ape") #phylogenetic trees


psic.tree <- nj(FstsMatrix)	   #NETWORK JOINING TREE
#plot
plot(psic.tree, type="unr", tip.col=mypalette, font=2, main="Genetic distances (pair-wise Fst) tree-plot")
#add a bar to indicate the relationship between the length and the Fst value
add.scale.bar()
#we can also add the Fst values if we wish to have the information, although it may look messy
annot <- round(psic.tree$edge.length,2)
edgelabels(annot[annot>0], which(annot>0), frame="n")

#now we should compare to the map and geographical information (EXCEL)