read in your filtered vcf file
library(vcfR)
##
## ***** *** vcfR *** *****
## This is vcfR 1.12.0
## browseVignettes('vcfR') # Documentation
## citation('vcfR') # Citation
## ***** ***** ***** *****
library(adegenet)
## Loading required package: ade4
##
## /// adegenet 2.1.5 is loaded ////////////
##
## > overview: '?adegenet'
## > tutorials/doc/questions: 'adegenetWeb()'
## > bug reports/feature requests: adegenetIssues()
library(StAMPP)
## Loading required package: pegas
## Loading required package: ape
## Registered S3 method overwritten by 'pegas':
## method from
## print.amova ade4
##
## Attaching package: 'pegas'
## The following object is masked from 'package:ape':
##
## mst
## The following object is masked from 'package:ade4':
##
## amova
## The following objects are masked from 'package:vcfR':
##
## getINFO, write.vcf
library(ggplot2)
#read vcf
v<-read.vcfR("~/Desktop/phil.dicaeum/filtered.85.vcf")
## Scanning file to determine attributes.
## File attributes:
## meta lines: 14
## header_line: 15
## variant count: 18499
## column count: 69
##
Meta line 14 read in.
## All meta lines processed.
## gt matrix initialized.
## Character matrix gt created.
## Character matrix gt rows: 18499
## Character matrix gt cols: 69
## skip: 0
## nrows: 18499
## row_num: 0
##
Processed variant 1000
Processed variant 2000
Processed variant 3000
Processed variant 4000
Processed variant 5000
Processed variant 6000
Processed variant 7000
Processed variant 8000
Processed variant 9000
Processed variant 10000
Processed variant 11000
Processed variant 12000
Processed variant 13000
Processed variant 14000
Processed variant 15000
Processed variant 16000
Processed variant 17000
Processed variant 18000
Processed variant: 18499
## All variants processed
#read in your sample info file
pops<-read.csv("~/Desktop/phil.dicaeum/dicaeum.sampling.csv")
#it should look roughly like this:
head(pops)
## ID Tissue Species Taxa State
## 1 D_hypoleucum_18070 18070 hypoleucum Luzon Camarines Norte
## 2 D_hypoleucum_18191 18191 hypoleucum Zamboanga
## 3 D_hypoleucum_14061 14061 hypoleucum Mindanao Dinagat
## 4 D_hypoleucum_14037 14037 hypoleucum Mindanao Dinagat
## 5 D_hypoleucum_14079 14079 hypoleucum Mindanao Dinagat
## 6 D_hypoleucum_14065 14065 hypoleucum Mindanao Dinagat
## Specificlocality
## 1 Luzon Island; Municipality Labo; Barangay Tulay na Lupa, Mt. Labo
## 2 Mindanao Island, City of Zamboanga; Brgy. Pasonanca, Pasonanca Nat. Park
## 3 Dinagat Island, Muncipality of Loreto, Brgy. Santiago, Sitio Cambinliw, Sudlon
## 4 Dinagat Island, Muncipality of Loreto, Brgy. Santiago, Sitio Cambinliw, Sudlon
## 5 Dinagat Island, Muncipality of Loreto, Brgy. Santiago, Sitio Cambinliw, Sudlon
## 6 Dinagat Island, Municipality Loreto, Brgy. Santiago, Sitio Cambinliu, Sudlon
## Latitude Longitude Altitude
## 1 14.039000 122.7860 580
## 2 7.018483 122.0273 750
## 3 10.343680 125.6181 200
## 4 10.343680 125.6181 200
## 5 10.343680 125.6181 200
## 6 10.343680 125.6181 190
#retain only samples that passed all filtering protocols (assumes that the 'ID' column is identical to sample names in the vcf)
pops<-pops[pops$ID %in% colnames(v@gt),]
#reorder the sample info file to match the order of samples in the vcf
pops<-pops[order(match(pops$ID,colnames(v@gt)[-1])),]
add in pairwise fixed differences
#identify the number of fixed differences between pops
#convert vcf to genotype matrix
mat<-extract.gt(v.sub)
conv.mat<-mat
conv.mat[conv.mat == "0/0"]<-0
conv.mat[conv.mat == "0/1"]<-1
conv.mat[conv.mat == "1/1"]<-2
conv.mat<-as.data.frame(conv.mat)
#convert to matrix numeric
for (i in 1:ncol(conv.mat)){
conv.mat[,i]<-as.numeric(as.character(conv.mat[,i]))
}
#compare colnames of the matrix to your popmap to verify you're subsetting correctly
colnames(conv.mat) == pops.sub$ID #should be all true
## [1] TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE
## [16] TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE
## [31] TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE
## [46] TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE
#make vector to fill with number of pairwise fixed diffs
f<-c()
#this generic for loop will calculate the number of fixed diffs between each of your designated pops
for (i in 1:nrow(heat)){
#calc af of pop1 and pop2
pop1.af<-(rowSums(conv.mat[,pops.sub$Taxa == heat$X1[i]], na.rm=T)/(rowSums(is.na(conv.mat[,pops.sub$Taxa == heat$X1[i]]) == FALSE)))/2
pop2.af<-(rowSums(conv.mat[,pops.sub$Taxa == heat$X2[i]], na.rm=T)/(rowSums(is.na(conv.mat[,pops.sub$Taxa == heat$X2[i]]) == FALSE)))/2
#store number of fixed differences
f[i]<-sum(is.na(abs(pop1.af - pop2.af)) == FALSE & abs(pop1.af - pop2.af) == 1) #find fixed SNPs and add to vector
}
#make sure this worked correctly
f
## [1] 0 50 5 50 0 9 5 9 0
#add number of fixed diffs to your existing df
heat$fixed<-f
### this code will get you the vector needed to combine FST values and fixed differences into a single vector split by
#define n as the number of taxa used in your pairwise Fst comparison
n<-3 #here 3
i<-1 #always begin incrementer (i) at 1
x<-c() #always begin with an empty vector
#while loop that will make the appropriate vector and store it in the variable 'x'
while (i < n){
#the first set of numbers is simply 2:n
if(i == 1){
x<-c(2:n)
i=i+1
}
#the second set of numbers is (2+n+1):(2*n) which we add to the existing vector
if(i == 2){
x<-c(x,(2+n+1):(2*n))
i=i+1
}
if(n == 3){break} #handle the edge case where n=3 and the code proceeds to the next step even though it is in violation of the outside while loop, because it tests all internal statements before looping back to the top to test the while loop condition
#we then add (2+((i-1)*(n+1))):(i*n) to the vector, where i=3, incrememnt i by 1, and continue adding this vector to the growing vector until i = n-1
if(i > 2){
x<-c(x,(2+((i-1)*(n+1))):(i*n))
i=i+1
}
}
#order your Fst and fixed difference values correctly in a single mixed vector to plot the Fst values above and # of fixed differences below the diagonal in the heatmap, using the vector you just created (named 'x')
heat$mixed<-heat$value
heat$mixed[x]<-heat$fixed[x]
#plot with labels
ggplot(data = heat, aes(x=X1, y=X2, fill=value)) +
geom_tile()+
geom_text(data=heat,aes(label=round(mixed, 2)), size=4)+
theme_minimal()+
scale_fill_gradient2(low = "white", high = "red", space = "Lab", name="Fst") +
theme(axis.text.x = element_text(angle = 45, vjust=.9, hjust = .9, size=12),
axis.text.y = element_text(angle = 45, hjust = 1, size=12),
axis.title.x = element_blank(), axis.title.y = element_blank())
## Warning: Removed 3 rows containing missing values (geom_text).
