#GENETICS IN R! install.packages("poppr") library ("poppr") #WE will learn about the basic R packages to analyse molecular data #Will plot some diversity indexes #Will in addition learn some tricks to customize plots, and handle data in R #First let's load a package for better plots library("ggplot2") #better graphics #WE ALSO NEED GENETIC PACKAGES install.packages("adegraphics") install.packages("adegenet", dependencies=TRUE) library ("adegenet") #main genetic package library("adegraphics") #graphic outputs for adegenet #then as usual #check if you are in the folder you want to be getwd() #execute this to check #if your working directory is not where your data is at #This I cannot do for you, because it depends on your computer, set working directory setwd("G:/evoleco20/prak4") #edit this, use always "/" not "\" ############################## #### INPUT FILE CHECK #### ############################## #Learn the structure of a Structure input file (a widely used format and program) #Open the input file to check how many SNPs, samples, etc. #arguments are #file name between quotation marks "lizards_SNP_input.str" #separator: "\t" #there is no headers (column names), so headers is FALSE dataset = read.table("lizards_SNP_input.str", sep = '\t', header =FALSE) #check it str(dataset) #If too long (too many SNPs), we can reduce it to see only the first 10 columns #add to str() the argument list.len #try with 10 variables: list.len=10 str(dataset, list.len=10) #we need to know how many loci and how many samples we have, check the individual labels in column 1 head(dataset) # loci are in the columns and samples in the rows # actually we need to check how many samples and how many loci we have #check the dimensions: rows x columns dim(dataset) #how many samples we have? #we have two rows of alleles per sample (they are diploid), so samples = rows/2 = 240/2 = 120 #how many loci we have? #we have 102 columns, but the first two are individual ID and population, so 100 are loci ################################# #### INPUT FILE 1 IMPORT #### ################################# #Adegenet an many genetic packages for R don't use dataframes as input files, they use files that handle the allelic frequencies #Most of them use genind format, we will need to transform our data to this format #we need the command read.structure() in order to open the input file (the one that ends in .str) #arguments: #file name between quotation marks "lizards_SNP_input.str" #we don't have a row with the SNPs (genetical marker) names, so we will set row.marknames to 0 (or FALSE) #do we have one row per individual or two? set onerowperind to either TRUE or FALSE #number of individuals? n.ind= #number of loci? n.loc= #which column has the individual labels? col.lab= #which columns have the population names? col.pop= #input file, open from Genepop format (a widely used format for genetics software) rawgenind <- read.structure ("lizards_SNP_input.str", row.marknames=0, onerowperind=FALSE, n.ind=120, n.loc=100, col.lab=1, col.pop=2, NA.char = "5", ask=FALSE) #check the new file rawgenind #See information that could be useful for plots or analysis #also to check that all was imported properly #number of individuals nInd(rawgenind) #names of the individuals indNames(rawgenind) #number of loci nLoc(rawgenind) #number of alleles per loci alleles(rawgenind) #ploidy of your samples ploidy(rawgenind) #population codes pop(rawgenind) ################################# #### PLOTS #### ################################# # PLOT SOME OF THE FILE INFO # FIRST WE NEED TO CHOOSE SOME COLOURS # you have a full list of colours in R: http://www.stat.columbia.edu/~tzheng/files/Rcolor.pdf #have fun mypalette <- c("black", "red1", "goldenrod1", "purple3", "darkorange3", "darkgreen", "blue", "magenta", "orange", "chartreuse3") #Let's plot how many samples there is of each population #we need to count how many times each pop name appears #as we saw we can get the populations from all samples with pop() #table() will make a table by grouping the frequency (how many times each population appears) pop(rawgenind) popnames <- table(pop(rawgenind)) popnames #grouped as a table #we will use the barplot() command. #you need to introduce the folowing arguments separated by commas (",") #object you want to plot: popnames # colour source: col=mypalette #you can name the x axis "Populations", use the argument for x axis labels: xlab="" #you can name the y axis "Sample size", use the argument for y axis labels: ylab="" barplot(popnames, col=mypalette, xlab="Population", ylab="N") # Now let's play around with the plot #Default Y axis is too short, if we feel perfectionist we can fix it #we will need a new argument for y axis limits: ylim = #then we add a range of values, from min to max, remember we use "c" to concatenate values: c(min,max) #instead of "min " and "max" set it from 0 to 20 #set the ylim and execute barplot(popnames, col=mypalette, xlab="Population", ylab="Sample size", ylim =c(0,20)) #now we need to fix the population names #we can make the font smaller with cex.names= #default size is 1, anything bellow that will reduce the size #try to add the argument cex.names (use a comma to separate arguments) and set it to 0.7 barplot(popnames, col=mypalette, xlab="Population", ylab="Sample size", cex.names = 0.7, ylim = c(0,20)) ########################### ### DIVERSITY INDEXES ### ########################### ####Now let's analyse our data # We can get some basic statistics and info from our file with "summary" (same than with ANOVA) sum_rawgenind <- summary(rawgenind) sum_rawgenind # ALLELIC RICHNESS #Check the Number of alleles per population #you will need to plot this object: sum_rawgenind$pop.n.all barplot(sum_rawgenind$pop.n.all, col=mypalette, xlab="Population", ylab="Number of alleles") #Why we are using number of alleles and not Allelic Richness (Ar)? # Allelic richness is related to the total alleles # the number of alleles is highly influenced by the number of samples. # compare with the previous plot (number of samples per populations) # and do the correlation in excel #try to print it to a file using "cat()" and capture.output() capture.output(sum_rawgenind$pop.n.all) # use this as an object to print with cat() (first argument for cat) #the rest of cat() arguments should be: #second argument, how to separate lines of the matrix: sep= #separate line with "new line" symbol; "\n" #third argument, give the file to print a name: file="number_of_alleles.txt" cat(capture.output(sum_rawgenind$pop.n.all), sep="\n", file="number_of_alleles.txt") #if the printed file looks good you should be able to copy paste it on excel #it will all be pasted in the same column, but you should be able to separate it #select column with the data, click on the "Data" tab in excel #Text to columns > Fixed width > Next #delimit the columns and you should have it ready, otherwise just type the values yourself on excel # PRIVATE ALLELES ####################### #Plot the ones calculated by Adegenet pa <- private_alleles(rawgenind, report = "data.frame", form=locus~.,count.alleles = FALSE) pa ggplot(pa) + geom_tile(aes(x = population, y = locus, fill = count)) #most of the private alleles are in mainland, why could be that? # HETEROZYGOSITY ###################### #Heterozygosity is an important information to check #Is given by all the different packages we used #Including the basic Adegenet summary #Expected heterozygosity (under HW) is usefull as a way to compared the real (observed) with the ideal (expected) in all loci plot(sum_rawgenind$Hexp, sum_rawgenind$Hobs, pch=1, cex=1, col="red1", xlim=c(0,0.6), ylim=c(0,0.6), xlab="Expected Heterozygosity", ylab="Observed Heterozygosity") #add a line to see if the propotion is balanced, higher or lower abline(0,1,lty=2) #what does this mean? #HWe violation (linkage, inbreeding, gene flow, selection, )? #Are they significantly different? #first check normality to see if we can use ANOVA shapiro.test(sum_rawgenind$Hexp) shapiro.test(sum_rawgenind$Hobs) #well in any case those are proportions (number of heterozygous / total number) so there are more adequate tests # Bartlett test of homogeneity of variances bartlett.test(list(sum_rawgenind$Hexp, sum_rawgenind$Hobs)) # Chi^2 test of independence chisq.test(sum_rawgenind$Hexp, sum_rawgenind$Hobs, simulate.p.value=TRUE, B=99) #yes, they are different #another approximation to an ideal expected heterozygosity is the within population gene diversity (Hs) # Let's plot Hs, is equivalent but NOT EQUAL to expected heterozygosity by pop #gendind objects are tricky, we can not treat it as a table, and just plot the data inside #Usually to calculate population-wise indexes we will use a genpop object (equivalent indeed to our input file format). #Transform to genpop format used by lots of classical population genetics packages rawgenpop <- genind2genpop(rawgenind) rawgenpop popNames(rawgenpop) #compute Hs GeneDiversityHet <- Hs(rawgenpop) # plot this object with the Hs barplot(GeneDiversityHet, col=mypalette, xlab="Population", ylab="Hs: Within population gene diversity (~He)") #WE NEED THIS INFORMATION FOR OUR EXCEL FILE #try to print it to a file using "cat()" and capture.output() capture.output(GeneDiversityHet) # use this as an object to print with cat() (first argument) #the rest of cat() argumetns should be: #second argument: sep= #separate line with "new line" symbol; "\n" #third argument, name of the file to print: file="" cat()# complete the command with the three arguments and save a file with the data cat(capture.output(GeneDiversityHet), sep="\n", file="geneDiversityHet.txt") ########################################################### #### GENETIC DISTANCES and POPULATION DIFERENCES #### ########################################################### ### F STATISTICS: Fis #Imbreeding differences per population? #First we need to build the 95% probability confidence intervals #boot pp fis will perform population bootstrap iterations of population Fis to calculate them Fis <- boot.ppfis(rawgenind, quant=c(0.025,0.975),diploid=TRUE,dig=4) Fis str(Fis) ? boot.ppfis ?? boot.ppfis install.packages("hierfstat") library("hierfstat") #There is a problem head(Fis) #we have the confidence intervals (lower (ll) and higher(hl)) the columns and the populations in the rows # but we want to plot the histogram with populations in X axis (columns) #we need to swap rows with columns, we can do this with transpose function: t() #the information we are interested in is in the $fis.ci slot #check Fis$fis.ci #transpose FisCI <- t(Fis$fis.ci) #great, but ANOTHER`PROBLEM: R did not take the population names #there are just numbers FisCI #we actually want are the population names that we got in the input file #we saved it as "dataset", extract the populations from dataset #they are in the second column [,2] dataset[,2] #but we don't need them all, just one name from each population #we can use unique() to extract the one unique name from that list in the same order #use unique() to get the names only once, use unique with the second column of dataset properpops <- unique(dataset[,2]) length(properpops) #now we need were to input the population names FisCI #those will be the columm names. we can access them with the command: colnames() #now we will replace them with the proper names colnames(FisCI) <- properpops #check if it worked FisCI #Now we can FINALLY plot boxplot(FisCI, boxwex=0.05, border=0, col=mypalette) #################################### ##### Global Fstat ##### #################################### #now let's calculate the genetic distances between each pair of populations ####PAIR WISE FST #pairFst <- fstat(rawgenind, fstonly=TRUE) #pairFst <- genet.dist(rawgenind,method="Fst") pairFst <- genet.dist(rawgenind, method="Fst") pairFst # Is it high? # 0 = panmixia # 1 = no historical relationship #Fst only makes sense comparatively # again we need to give the populations names #lets check where should we add them str(pairFst) #is not a dataframe (AGAIN), this time is a "dist" object #we need a new package in order to edit names of a "dist" object #install.packages("usedist") library("usedist") #now we add them with dist_setNames() #we need two arguments (separated per commas) # the object with the Fsts: pairFst # the object with proper population names we saved: properpops FstsMatrix <- dist_setNames(pairFst,properpops) #edit this with the two arguments FstsMatrix #THIS IS THE INFORMATION WE NEED FOR OUR EXCEL FILE #try to print it to a file using "cat()" and capture.output() capture.output(FstsMatrix) # use this as an object to print with cat() (first argument) #the rest of cat() argumetns should be: #second argument: sep= #separate line with "new line" symbol; "\n" #third argument, name of the file to print: file="" cat()# complete the command with the three arguments and save a file with the data cat(capture.output(FstsMatrix), sep="\n", file="FstsMatrix.txt") #Done #Lets see it graphically #a matrix of pairwise genetic distances in which the higher the value the darker the colour table.image(FstsMatrix) #lame... Let's give it more resolution table.image(FstsMatrix, nclass=16) #Which one is more different? #To appreciate even better the genetic distances between each pair build a tree library("ape") #phylogenetic trees psic.tree <- nj(FstsMatrix) #NETWORK JOINING TREE #plot plot(psic.tree, type="unr", tip.col=mypalette, font=2, main="Genetic distances (pair-wise Fst) tree-plot") #add a bar to indicate the relationship between the length and the Fst value add.scale.bar() #we can also add the Fst values if we wish to have the information, although it may look messy annot <- round(psic.tree$edge.length,2) edgelabels(annot[annot>0], which(annot>0), frame="n") #now we should compare to the map and geographical information (EXCEL)