Zosterops pariwise FST table

0.1 Load libraries

#read in each library used for this analysis
library(vcfR)


   *****       ***   vcfR   ***       *****
   This is vcfR 1.14.0 
     browseVignettes('vcfR') # Documentation
     citation('vcfR') # Citation
   *****       *****      *****       *****

library(SNPfiltR)

This is SNPfiltR v.1.0.2

Detailed usage information is available at: devonderaad.github.io/SNPfiltR/ 

If you use SNPfiltR in your published work, please cite the following papers: 

DeRaad, D.A. (2022), SNPfiltR: an R package for interactive and reproducible SNP filtering. Molecular Ecology Resources, 22, 2443-2453. http://doi.org/10.1111/1755-0998.13618 

Knaus, Brian J., and Niklaus J. Grunwald. 2017. VCFR: a package to manipulate and visualize variant call format data in R. Molecular Ecology Resources, 17.1:44-53. http://doi.org/10.1111/1755-0998.12549

library(ggplot2)
library(adegenet)

Loading required package: ade4


   /// adegenet 2.1.10 is loaded ////////////

   > overview: '?adegenet'
   > tutorials/doc/questions: 'adegenetWeb()' 
   > bug reports/feature requests: adegenetIssues()

library(StAMPP)

Loading required package: pegas

Loading required package: ape

Registered S3 method overwritten by 'pegas':
  method      from
  print.amova ade4


Attaching package: 'pegas'

The following object is masked from 'package:ape':

    mst

The following object is masked from 'package:ade4':

    amova

The following objects are masked from 'package:vcfR':

    getINFO, write.vcf

0.2 read in vcf and trim it

#read in the vcf file containing all filtered SNPs
vcfR <- read.vcfR("~/Desktop/cali.zosterops.rad/filtered.snps.vcf.gz")

Scanning file to determine attributes.
File attributes:
  meta lines: 14
  header_line: 15
  variant count: 15704
  column count: 133

Meta line 14 read in.
All meta lines processed.
gt matrix initialized.
Character matrix gt created.
  Character matrix gt rows: 15704
  Character matrix gt cols: 133
  skip: 0
  nrows: 15704
  row_num: 0

Processed variant 1000
Processed variant 2000
Processed variant 3000
Processed variant 4000
Processed variant 5000
Processed variant 6000
Processed variant 7000
Processed variant 8000
Processed variant 9000
Processed variant 10000
Processed variant 11000
Processed variant 12000
Processed variant 13000
Processed variant 14000
Processed variant 15000
Processed variant: 15704
All variants processed

#check out the details
vcfR

***** Object of Class vcfR *****
124 samples
239 CHROMs
15,704 variants
Object size: 166.5 Mb
5.314 percent missing data
*****        *****         *****

#read in sample info csv
sample.info<-read.csv("~/Desktop/cali.zosterops.rad/zosterops.trimmed.RAD.sampling.csv")
#trim sampling info dataframe to retain only the samples that passed all filtering protocols
sample.info<-sample.info[sample.info$ID %in% colnames(vcfR@gt),]
#check how many samples per species are retained
table(sample.info$Species)


erythropleurus       everetti      japonicus       nigrorum    palpebrosus 
             6              3             83              4              4 
       simplex 
            24

#see sample order
colnames(vcfR@gt)

  [1] "FORMAT"        "ZJlo012"       "ZJlo015"       "ZJlo016"      
  [5] "ZJlo017"       "ZJlo021"       "ZJlo022"       "ZJlo024"      
  [9] "ZJlo031"       "ZJlo045"       "ZJlo046"       "ZJlo050"      
 [13] "ZJlo052"       "ZJlo055"       "ZJja001"       "ZJja002"      
 [17] "ZJja003"       "ZJja004"       "ZJja005"       "ZJja009"      
 [21] "ZJja010"       "ZJja012"       "ZJja013"       "ZJja014"      
 [25] "ZJja016"       "ZJja030"       "ZJal003"       "Zjal004"      
 [29] "ZJal005"       "ZJal010"       "ZJal011"       "ZJal012"      
 [33] "ZJal020"       "ZJst007"       "ZJst009"       "ZJst010"      
 [37] "ZJst012"       "ZJst014"       "ZJst015"       "ZJin001"      
 [41] "ZJin003"       "ZJsi002"       "ZJsi017"       "ZJsi032"      
 [45] "ZJsi023"       "ZJsi024"       "ZJsi027"       "ZJsi028"      
 [49] "ZJsi029"       "ZJsi031"       "ZPxx001"       "ZPxx002"      
 [53] "ZMxx002"       "ZMxx003"       "ZMxx004"       "ZMxx005"      
 [57] "ZMxx006"       "ZMOxx001"      "ZMOxx002"      "ZMOxx003"     
 [61] "ZMOxx005"      "ZMOha001"      "ZMOwh002"      "ZMOwh003"     
 [65] "ZMOwh004"      "ZMOwh005"      "ZMOwh006"      "ZMOvu002"     
 [69] "ZMOvu003"      "ZMOvu004"      "ZMOvu005"      "ZMOmo001"     
 [73] "ZMOpa001"      "ZMOpa002"      "ZNni001"       "ZERxx002"     
 [77] "ZERxx004"      "ZERxx005"      "Zsim23588"     "Zsim28142"    
 [81] "Zsim30897"     "Zpal23498"     "Zpal23522"     "Zmon20893"    
 [85] "Zmon20892"     "Zmon20902"     "Zmon20909"     "Zsim31166"    
 [89] "Zsim31159"     "Zmey17876"     "Zmey17877"     "Zmey17852"    
 [93] "Zmey17922"     "Zmey17920"     "Zmey17925"     "Zmey17923"    
 [97] "Zery28090"     "Zery28091"     "Zery28088"     "Zni10863"     
[101] "Zni14341"      "Zni19650"      "Zni17984"      "ZjaDOT-10981" 
[105] "ZjaDOT-5235"   "Z_LA_122866_2" "Z_LA_122577_2" "Z_LA_122188_2"
[109] "Zsim13809"     "Zsim13773"     "Zsim6797"      "Zsim10336"    
[113] "Zsim11362"     "Zsim11102"     "Zsim11220"     "Zmon20899"    
[117] "Zmon28375"     "Zev13949"      "Zev31650"      "Zev28451"     
[121] "Z_HI_BRY431"   "Z_HI_NAN290"   "Z_HI_WAI087"   "Z_HI_WAI078"  
[125] "Z_HI_SOL783"

#make sure sample info DF order matches order of samples in vcf (this should return all true if the dataframe sample order matches the order these samples occur in the vcf)
sample.info$ID == colnames(vcfR@gt)[-1]

  [1] TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE
 [16] TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE
 [31] TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE
 [46] TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE
 [61] TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE
 [76] TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE
 [91] TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE
[106] TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE
[121] TRUE TRUE TRUE TRUE

#fix row names
rownames(sample.info)<-c(1:nrow(sample.info))

#remove the two putative hybrid samples from the vcf for this analysis
v.sub<-vcfR[,c(1:46,48:51,53:125)]

#subset sample info dataframe
sample.info<-sample.info[c(1:45,47:50,52:124),]

0.3 convert vcfR to genlight and calculate fst

#convert vcfR to genlight object
gen<-vcfR2genlight(v.sub)

#assign samples to groups based on species identity
gen@pop<-as.factor(sample.info$Species)
#calculate pairwise Fst using the stampp package
di.heat<-stamppFst(gen)
#extract the pairwise matrix
m<-di.heat$Fsts
#fill in upper triangle of the matrix
m[upper.tri(m)] <- t(m)[upper.tri(m)]

#melt to tidy format for ggplotting
heat <- reshape::melt(m)

Warning in type.convert.default(X[[i]], ...): 'as.is' should be specified by
the caller; using TRUE

Warning in type.convert.default(X[[i]], ...): 'as.is' should be specified by
the caller; using TRUE

0.4 plot results

#plot as heatmap with exact values labeling each cell
ggplot(data = heat, aes(x=X1, y=X2, fill=value)) + 
  geom_tile()+
  geom_text(data=heat,aes(label=round(value, 2)))+
  theme_minimal()+
  scale_fill_gradient2(low = "white", high = "red", space = "Lab", name="Fst") +
  theme(axis.text.x = element_text(angle = 45, hjust = 1),
        axis.text.y = element_text(angle = 45, hjust = 1))

Warning: Removed 6 rows containing missing values (`geom_text()`).

#save the plot
#ggsave("~/Desktop/cali.zosterops.rad/subset.heatmap.pdf", width = 4.5, height = 3, units= "in")

0.5 try splitting up the three clades of japonicus to compare the intra to inter species FST’s

#duplicate species column
spec.assignments<-sample.info$Species
#assign northern Philippines clade
spec.assignments[c(50:54,59:64,87:93)]<-"north.phil"
#assign southern Philippines clade
spec.assignments[c(55:58,65:71,81:86,98,113,114)]<-"south.phil"
#assign samples to the three groups shown above
gen@pop<-as.factor(spec.assignments)
#calculate pairwise Fst using the stampp package
di.heat<-stamppFst(gen)
#extract the pairwise matrix
m<-di.heat$Fsts
#fill in upper triangle of the matrix
m[upper.tri(m)] <- t(m)[upper.tri(m)]

#melt to tidy format for ggplotting
heat <- reshape::melt(m)

Warning in type.convert.default(X[[i]], ...): 'as.is' should be specified by
the caller; using TRUE

Warning in type.convert.default(X[[i]], ...): 'as.is' should be specified by
the caller; using TRUE

#plot as heatmap with exact values labeling each cell
ggplot(data = heat, aes(x=X1, y=X2, fill=value)) + 
  geom_tile()+
  geom_text(data=heat,aes(label=round(value, 2)))+
  theme_minimal()+
  scale_fill_gradient2(low = "white", high = "red", space = "Lab", name="Fst") +
  theme(axis.text.x = element_text(angle = 45, hjust = 1),
        axis.text.y = element_text(angle = 45, hjust = 1))

Warning: Removed 8 rows containing missing values (`geom_text()`).

#save plot
#ggsave("~/Desktop/cali.zosterops.rad/fst.heatmap.pdf", width = 5, height = 3, units= "in")

0.6 add in fixed differences

#identify the number of fixed differences between pops
#convert vcf to genotype matrix
mat<-extract.gt(v.sub)
#duplicate matrix
conv.mat<-mat
#convert genotypes to binary format (i.e., 0 = homozygous ref, 1 = het, 2 = homozygous alt)
conv.mat[conv.mat == "0/0"]<-0
conv.mat[conv.mat == "0/1"]<-1
conv.mat[conv.mat == "1/1"]<-2
#convert matrix to data.frame
conv.mat<-as.data.frame(conv.mat)
#convert to data.frame to numeric
for (i in 1:ncol(conv.mat)){
  conv.mat[,i]<-as.numeric(as.character(conv.mat[,i]))
}

#compare colnames of the matrix to your popmap to verify you're subsetting correctly
colnames(conv.mat) == sample.info$ID #should be all true

  [1] TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE
 [16] TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE
 [31] TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE
 [46] TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE
 [61] TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE
 [76] TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE
 [91] TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE
[106] TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE
[121] TRUE TRUE

#make vector to fill with number of pairwise fixed diffs
f<-c()

#this generic for loop will calculate the number of fixed diffs between each of your designated pops
for (i in 1:nrow(heat)){
  #calc af of pop1 and pop2
  pop1.af<-(rowSums(conv.mat[,spec.assignments == heat$X1[i]], na.rm=T)/(rowSums(is.na(conv.mat[,spec.assignments == heat$X1[i]]) == FALSE)))/2
  pop2.af<-(rowSums(conv.mat[,spec.assignments == heat$X2[i]], na.rm=T)/(rowSums(is.na(conv.mat[,spec.assignments == heat$X2[i]]) == FALSE)))/2
  #store number of fixed differences
  f[i]<-sum(is.na(abs(pop1.af - pop2.af)) == FALSE & abs(pop1.af - pop2.af) == 1) #find fixed SNPs and add to vector
}

#make sure this worked correctly
f

 [1]    0  608    4   22 1123  575 1481 1281  608    0  497  487 1044  671 1396
[16] 1167    4  497    0    0  914  440 1276 1076   22  487    0    0  891  426
[31] 1249 1037 1123 1044  914  891    0  740 1163  119  575  671  440  426  740
[46]    0 1021  884 1481 1396 1276 1249 1163 1021    0 1332 1281 1167 1076 1037
[61]  119  884 1332    0

#add number of fixed diffs to your existing df
heat$fixed<-f

### this code will get you the vector needed to combine FST values and fixed differences into a single vector split by 
#define n as the number of taxa used in your pairwise Fst comparison
n<-8 #here 3
i<-1 #always begin incrementer (i) at 1
x<-c() #always begin with an empty vector
#while loop that will make the appropriate vector and store it in the variable 'x'
while (i < n){
  #the first set of numbers is simply 2:n
  if(i == 1){
    x<-c(2:n)
    i=i+1
  }
  #the second set of numbers is (2+n+1):(2*n) which we add to the existing vector
  if(i == 2){
    x<-c(x,(2+n+1):(2*n))
    i=i+1
  }
  
    if(n == 3){break} #handle the edge case where n=3 and the code proceeds to the next step even though it is in violation of the outside while loop, because it tests all internal statements before looping back to the top to test the while loop condition
  
  #we then add (2+((i-1)*(n+1))):(i*n) to the vector, where i=3, incrememnt i by 1, and continue adding this vector to the growing vector until i = n-1
  if(i > 2){
    x<-c(x,(2+((i-1)*(n+1))):(i*n))
    i=i+1
  }
}

#order your Fst and fixed difference values correctly in a single mixed vector to plot the Fst values above and # of fixed differences below the diagonal in the heatmap, using the vector you just created (named 'x')
heat$mixed<-heat$value
heat$mixed[x]<-heat$fixed[x]

#plot with labels
ggplot(data = heat, aes(x=X1, y=X2, fill=value)) + 
  geom_tile()+
  geom_text(data=heat,aes(label=round(mixed, 2)), size=4)+
  theme_minimal()+
  scale_fill_gradient2(low = "white", high = "red", space = "Lab", name="Fst") +
  theme(axis.text.x = element_text(angle = 45, vjust=.9, hjust = .9, size=12),
        axis.text.y = element_text(angle = 45, hjust = 1, size=12),
        axis.title.x = element_blank(), axis.title.y = element_blank())

Warning: Removed 8 rows containing missing values (`geom_text()`).

#ggsave plot
#ggsave("~/Desktop/cali.zosterops.rad/fst.heatmap.pdf", width = 5, height = 4, units= "in")