#read in each library used for this analysislibrary(vcfR)
***** *** vcfR *** *****
This is vcfR 1.14.0
browseVignettes('vcfR') # Documentation
citation('vcfR') # Citation
***** ***** ***** *****
library(SNPfiltR)
This is SNPfiltR v.1.0.2
Detailed usage information is available at: devonderaad.github.io/SNPfiltR/
If you use SNPfiltR in your published work, please cite the following papers:
DeRaad, D.A. (2022), SNPfiltR: an R package for interactive and reproducible SNP filtering. Molecular Ecology Resources, 22, 2443-2453. http://doi.org/10.1111/1755-0998.13618
Knaus, Brian J., and Niklaus J. Grunwald. 2017. VCFR: a package to manipulate and visualize variant call format data in R. Molecular Ecology Resources, 17.1:44-53. http://doi.org/10.1111/1755-0998.12549
Registered S3 method overwritten by 'pegas':
method from
print.amova ade4
Attaching package: 'pegas'
The following object is masked from 'package:ape':
mst
The following object is masked from 'package:ade4':
amova
The following objects are masked from 'package:vcfR':
getINFO, write.vcf
0.2 read in vcf and trim it
#read in the vcf file containing all filtered SNPsvcfR <-read.vcfR("~/Desktop/cali.zosterops.rad/filtered.snps.vcf.gz")
Scanning file to determine attributes.
File attributes:
meta lines: 14
header_line: 15
variant count: 15704
column count: 133
Meta line 14 read in.
All meta lines processed.
gt matrix initialized.
Character matrix gt created.
Character matrix gt rows: 15704
Character matrix gt cols: 133
skip: 0
nrows: 15704
row_num: 0
Processed variant 1000
Processed variant 2000
Processed variant 3000
Processed variant 4000
Processed variant 5000
Processed variant 6000
Processed variant 7000
Processed variant 8000
Processed variant 9000
Processed variant 10000
Processed variant 11000
Processed variant 12000
Processed variant 13000
Processed variant 14000
Processed variant 15000
Processed variant: 15704
All variants processed
#check out the detailsvcfR
***** Object of Class vcfR *****
124 samples
239 CHROMs
15,704 variants
Object size: 166.5 Mb
5.314 percent missing data
***** ***** *****
#read in sample info csvsample.info<-read.csv("~/Desktop/cali.zosterops.rad/zosterops.trimmed.RAD.sampling.csv")#trim sampling info dataframe to retain only the samples that passed all filtering protocolssample.info<-sample.info[sample.info$ID %in%colnames(vcfR@gt),]#check how many samples per species are retainedtable(sample.info$Species)
#make sure sample info DF order matches order of samples in vcf (this should return all true if the dataframe sample order matches the order these samples occur in the vcf)sample.info$ID ==colnames(vcfR@gt)[-1]
#fix row namesrownames(sample.info)<-c(1:nrow(sample.info))#remove the two putative hybrid samples from the vcf for this analysisv.sub<-vcfR[,c(1:46,48:51,53:125)]#subset sample info dataframesample.info<-sample.info[c(1:45,47:50,52:124),]
0.3 convert vcfR to genlight and calculate fst
#convert vcfR to genlight objectgen<-vcfR2genlight(v.sub)#assign samples to groups based on species identitygen@pop<-as.factor(sample.info$Species)#calculate pairwise Fst using the stampp packagedi.heat<-stamppFst(gen)#extract the pairwise matrixm<-di.heat$Fsts#fill in upper triangle of the matrixm[upper.tri(m)] <-t(m)[upper.tri(m)]#melt to tidy format for ggplottingheat <- reshape::melt(m)
Warning in type.convert.default(X[[i]], ...): 'as.is' should be specified by
the caller; using TRUE
Warning in type.convert.default(X[[i]], ...): 'as.is' should be specified by
the caller; using TRUE
0.4 plot results
#plot as heatmap with exact values labeling each cellggplot(data = heat, aes(x=X1, y=X2, fill=value)) +geom_tile()+geom_text(data=heat,aes(label=round(value, 2)))+theme_minimal()+scale_fill_gradient2(low ="white", high ="red", space ="Lab", name="Fst") +theme(axis.text.x =element_text(angle =45, hjust =1),axis.text.y =element_text(angle =45, hjust =1))
0.5 try splitting up the three clades of japonicus to compare the intra to inter species FST’s
#duplicate species columnspec.assignments<-sample.info$Species#assign northern Philippines cladespec.assignments[c(50:54,59:64,87:93)]<-"north.phil"#assign southern Philippines cladespec.assignments[c(55:58,65:71,81:86,98,113,114)]<-"south.phil"#assign samples to the three groups shown abovegen@pop<-as.factor(spec.assignments)#calculate pairwise Fst using the stampp packagedi.heat<-stamppFst(gen)#extract the pairwise matrixm<-di.heat$Fsts#fill in upper triangle of the matrixm[upper.tri(m)] <-t(m)[upper.tri(m)]#melt to tidy format for ggplottingheat <- reshape::melt(m)
Warning in type.convert.default(X[[i]], ...): 'as.is' should be specified by
the caller; using TRUE
Warning in type.convert.default(X[[i]], ...): 'as.is' should be specified by
the caller; using TRUE
#plot as heatmap with exact values labeling each cellggplot(data = heat, aes(x=X1, y=X2, fill=value)) +geom_tile()+geom_text(data=heat,aes(label=round(value, 2)))+theme_minimal()+scale_fill_gradient2(low ="white", high ="red", space ="Lab", name="Fst") +theme(axis.text.x =element_text(angle =45, hjust =1),axis.text.y =element_text(angle =45, hjust =1))
#identify the number of fixed differences between pops#convert vcf to genotype matrixmat<-extract.gt(v.sub)#duplicate matrixconv.mat<-mat#convert genotypes to binary format (i.e., 0 = homozygous ref, 1 = het, 2 = homozygous alt)conv.mat[conv.mat =="0/0"]<-0conv.mat[conv.mat =="0/1"]<-1conv.mat[conv.mat =="1/1"]<-2#convert matrix to data.frameconv.mat<-as.data.frame(conv.mat)#convert to data.frame to numericfor (i in1:ncol(conv.mat)){ conv.mat[,i]<-as.numeric(as.character(conv.mat[,i]))}#compare colnames of the matrix to your popmap to verify you're subsetting correctlycolnames(conv.mat) == sample.info$ID #should be all true
#make vector to fill with number of pairwise fixed diffsf<-c()#this generic for loop will calculate the number of fixed diffs between each of your designated popsfor (i in1:nrow(heat)){#calc af of pop1 and pop2 pop1.af<-(rowSums(conv.mat[,spec.assignments == heat$X1[i]], na.rm=T)/(rowSums(is.na(conv.mat[,spec.assignments == heat$X1[i]]) ==FALSE)))/2 pop2.af<-(rowSums(conv.mat[,spec.assignments == heat$X2[i]], na.rm=T)/(rowSums(is.na(conv.mat[,spec.assignments == heat$X2[i]]) ==FALSE)))/2#store number of fixed differences f[i]<-sum(is.na(abs(pop1.af - pop2.af)) ==FALSE&abs(pop1.af - pop2.af) ==1) #find fixed SNPs and add to vector}#make sure this worked correctlyf
#add number of fixed diffs to your existing dfheat$fixed<-f### this code will get you the vector needed to combine FST values and fixed differences into a single vector split by #define n as the number of taxa used in your pairwise Fst comparisonn<-8#here 3i<-1#always begin incrementer (i) at 1x<-c() #always begin with an empty vector#while loop that will make the appropriate vector and store it in the variable 'x'while (i < n){#the first set of numbers is simply 2:nif(i ==1){ x<-c(2:n) i=i+1 }#the second set of numbers is (2+n+1):(2*n) which we add to the existing vectorif(i ==2){ x<-c(x,(2+n+1):(2*n)) i=i+1 }if(n ==3){break} #handle the edge case where n=3 and the code proceeds to the next step even though it is in violation of the outside while loop, because it tests all internal statements before looping back to the top to test the while loop condition#we then add (2+((i-1)*(n+1))):(i*n) to the vector, where i=3, incrememnt i by 1, and continue adding this vector to the growing vector until i = n-1if(i >2){ x<-c(x,(2+((i-1)*(n+1))):(i*n)) i=i+1 }}#order your Fst and fixed difference values correctly in a single mixed vector to plot the Fst values above and # of fixed differences below the diagonal in the heatmap, using the vector you just created (named 'x')heat$mixed<-heat$valueheat$mixed[x]<-heat$fixed[x]#plot with labelsggplot(data = heat, aes(x=X1, y=X2, fill=value)) +geom_tile()+geom_text(data=heat,aes(label=round(mixed, 2)), size=4)+theme_minimal()+scale_fill_gradient2(low ="white", high ="red", space ="Lab", name="Fst") +theme(axis.text.x =element_text(angle =45, vjust=.9, hjust = .9, size=12),axis.text.y =element_text(angle =45, hjust =1, size=12),axis.title.x =element_blank(), axis.title.y =element_blank())