calc fst and fixed differences

read in your filtered vcf file

library(vcfR)

## 
##    *****       ***   vcfR   ***       *****
##    This is vcfR 1.12.0 
##      browseVignettes('vcfR') # Documentation
##      citation('vcfR') # Citation
##    *****       *****      *****       *****

library(adegenet)

## Loading required package: ade4

## 
##    /// adegenet 2.1.5 is loaded ////////////
## 
##    > overview: '?adegenet'
##    > tutorials/doc/questions: 'adegenetWeb()' 
##    > bug reports/feature requests: adegenetIssues()

library(StAMPP)

## Loading required package: pegas

## Loading required package: ape

## Registered S3 method overwritten by 'pegas':
##   method      from
##   print.amova ade4

## 
## Attaching package: 'pegas'

## The following object is masked from 'package:ape':
## 
##     mst

## The following object is masked from 'package:ade4':
## 
##     amova

## The following objects are masked from 'package:vcfR':
## 
##     getINFO, write.vcf

library(ggplot2)
#read vcf
v<-read.vcfR("~/Desktop/phil.dicaeum/filtered.85.vcf")

## Scanning file to determine attributes.
## File attributes:
##   meta lines: 14
##   header_line: 15
##   variant count: 18499
##   column count: 69
## 
Meta line 14 read in.
## All meta lines processed.
## gt matrix initialized.
## Character matrix gt created.
##   Character matrix gt rows: 18499
##   Character matrix gt cols: 69
##   skip: 0
##   nrows: 18499
##   row_num: 0
## 
Processed variant 1000
Processed variant 2000
Processed variant 3000
Processed variant 4000
Processed variant 5000
Processed variant 6000
Processed variant 7000
Processed variant 8000
Processed variant 9000
Processed variant 10000
Processed variant 11000
Processed variant 12000
Processed variant 13000
Processed variant 14000
Processed variant 15000
Processed variant 16000
Processed variant 17000
Processed variant 18000
Processed variant: 18499
## All variants processed

#read in your sample info file
pops<-read.csv("~/Desktop/phil.dicaeum/dicaeum.sampling.csv")
#it should look roughly like this:
head(pops)

##                   ID Tissue    Species      Taxa           State
## 1 D_hypoleucum_18070  18070 hypoleucum     Luzon Camarines Norte
## 2 D_hypoleucum_18191  18191 hypoleucum Zamboanga                
## 3 D_hypoleucum_14061  14061 hypoleucum  Mindanao         Dinagat
## 4 D_hypoleucum_14037  14037 hypoleucum  Mindanao         Dinagat
## 5 D_hypoleucum_14079  14079 hypoleucum  Mindanao         Dinagat
## 6 D_hypoleucum_14065  14065 hypoleucum  Mindanao         Dinagat
##                                                                 Specificlocality
## 1              Luzon Island; Municipality Labo; Barangay Tulay na Lupa, Mt. Labo
## 2       Mindanao Island, City of Zamboanga; Brgy. Pasonanca, Pasonanca Nat. Park
## 3 Dinagat Island, Muncipality of Loreto, Brgy. Santiago, Sitio Cambinliw, Sudlon
## 4 Dinagat Island, Muncipality of Loreto, Brgy. Santiago, Sitio Cambinliw, Sudlon
## 5 Dinagat Island, Muncipality of Loreto, Brgy. Santiago, Sitio Cambinliw, Sudlon
## 6   Dinagat Island, Municipality Loreto, Brgy. Santiago, Sitio Cambinliu, Sudlon
##    Latitude Longitude Altitude
## 1 14.039000  122.7860      580
## 2  7.018483  122.0273      750
## 3 10.343680  125.6181      200
## 4 10.343680  125.6181      200
## 5 10.343680  125.6181      200
## 6 10.343680  125.6181      190

#retain only samples that passed all filtering protocols (assumes that the 'ID' column is identical to sample names in the vcf)
pops<-pops[pops$ID %in% colnames(v@gt),]
#reorder the sample info file to match the order of samples in the vcf
pops<-pops[order(match(pops$ID,colnames(v@gt)[-1])),]

I have already performed a principal components analysis, which separated the three groups perfectly based on geography

#see PCA below
knitr::include_graphics("/Users/devder/Desktop/phil.dicaeum/dicaeum.pca.png")

#now I want to know the pairwise Fst between these three visible groups (only 3 total comparisons with three taxa)
#start by removing the outgroup

#remove two nigrilore samples from the vcf
v.sub<-v[,colnames(v@gt) != "D_nigrilore_KU28413" & colnames(v@gt) != "D_nigrilore_KU28414"]
#remove the two samples from the sample info file
pops.sub<-pops[pops$ID != "D_nigrilore_KU28413" & pops$ID != "D_nigrilore_KU28414",]

#convert vcfR to genlight
gen<-vcfR2genlight(v.sub)

#assign samples to the three groups shown above
gen@pop<-as.factor(pops.sub$Taxa)
#calculate pairwise Fst using the stampp package
di.heat<-stamppFst(gen)
#extract the pairwise matrix
m<-di.heat$Fsts
#fill in upper triangle of the matrix
m[upper.tri(m)] <- t(m)[upper.tri(m)]

#melt to tidy format for ggplotting
heat <- reshape::melt(m)

## Warning in type.convert.default(X[[i]], ...): 'as.is' should be specified by the
## caller; using TRUE

## Warning in type.convert.default(X[[i]], ...): 'as.is' should be specified by the
## caller; using TRUE

#plot as heatmap with exact values labeling each cell
ggplot(data = heat, aes(x=X1, y=X2, fill=value)) + 
  geom_tile()+
  geom_text(data=heat,aes(label=round(value, 2)))+
  theme_minimal()+
  scale_fill_gradient2(low = "white", high = "red", space = "Lab", name="Fst") +
  theme(axis.text.x = element_text(angle = 45, hjust = 1),
        axis.text.y = element_text(angle = 45, hjust = 1))

## Warning: Removed 3 rows containing missing values (geom_text).

add in pairwise fixed differences

#identify the number of fixed differences between pops
#convert vcf to genotype matrix
mat<-extract.gt(v.sub)
conv.mat<-mat
conv.mat[conv.mat == "0/0"]<-0
conv.mat[conv.mat == "0/1"]<-1
conv.mat[conv.mat == "1/1"]<-2
conv.mat<-as.data.frame(conv.mat)
#convert to matrix numeric
for (i in 1:ncol(conv.mat)){
  conv.mat[,i]<-as.numeric(as.character(conv.mat[,i]))
}

#compare colnames of the matrix to your popmap to verify you're subsetting correctly
colnames(conv.mat) == pops.sub$ID #should be all true

##  [1] TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE
## [16] TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE
## [31] TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE
## [46] TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE

#make vector to fill with number of pairwise fixed diffs
f<-c()

#this generic for loop will calculate the number of fixed diffs between each of your designated pops
for (i in 1:nrow(heat)){
  #calc af of pop1 and pop2
  pop1.af<-(rowSums(conv.mat[,pops.sub$Taxa == heat$X1[i]], na.rm=T)/(rowSums(is.na(conv.mat[,pops.sub$Taxa == heat$X1[i]]) == FALSE)))/2
  pop2.af<-(rowSums(conv.mat[,pops.sub$Taxa == heat$X2[i]], na.rm=T)/(rowSums(is.na(conv.mat[,pops.sub$Taxa == heat$X2[i]]) == FALSE)))/2
  #store number of fixed differences
  f[i]<-sum(is.na(abs(pop1.af - pop2.af)) == FALSE & abs(pop1.af - pop2.af) == 1) #find fixed SNPs and add to vector
}

#make sure this worked correctly
f

## [1]  0 50  5 50  0  9  5  9  0

#add number of fixed diffs to your existing df
heat$fixed<-f

### this code will get you the vector needed to combine FST values and fixed differences into a single vector split by 
#define n as the number of taxa used in your pairwise Fst comparison
n<-3 #here 3
i<-1 #always begin incrementer (i) at 1
x<-c() #always begin with an empty vector
#while loop that will make the appropriate vector and store it in the variable 'x'
while (i < n){
  #the first set of numbers is simply 2:n
  if(i == 1){
    x<-c(2:n)
    i=i+1
  }
  #the second set of numbers is (2+n+1):(2*n) which we add to the existing vector
  if(i == 2){
    x<-c(x,(2+n+1):(2*n))
    i=i+1
  }
  
    if(n == 3){break} #handle the edge case where n=3 and the code proceeds to the next step even though it is in violation of the outside while loop, because it tests all internal statements before looping back to the top to test the while loop condition
  
  #we then add (2+((i-1)*(n+1))):(i*n) to the vector, where i=3, incrememnt i by 1, and continue adding this vector to the growing vector until i = n-1
  if(i > 2){
    x<-c(x,(2+((i-1)*(n+1))):(i*n))
    i=i+1
  }
}

#order your Fst and fixed difference values correctly in a single mixed vector to plot the Fst values above and # of fixed differences below the diagonal in the heatmap, using the vector you just created (named 'x')
heat$mixed<-heat$value
heat$mixed[x]<-heat$fixed[x]

#plot with labels
ggplot(data = heat, aes(x=X1, y=X2, fill=value)) + 
  geom_tile()+
  geom_text(data=heat,aes(label=round(mixed, 2)), size=4)+
  theme_minimal()+
  scale_fill_gradient2(low = "white", high = "red", space = "Lab", name="Fst") +
  theme(axis.text.x = element_text(angle = 45, vjust=.9, hjust = .9, size=12),
        axis.text.y = element_text(angle = 45, hjust = 1, size=12),
        axis.title.x = element_blank(), axis.title.y = element_blank())

## Warning: Removed 3 rows containing missing values (geom_text).

I’m going to put these values on the PCA using photoshop to make a single cohesive figure

#see final product
knitr::include_graphics("/Users/devder/Desktop/phil.dicaeum/pca.copy.png")

calc fst and fixed differences

Devon DeRaad

2022-11-30

read in your filtered vcf file

I have already performed a principal components analysis, which separated the three groups perfectly based on geography

add in pairwise fixed differences

I’m going to put these values on the PCA using photoshop to make a single cohesive figure