Get input sample assignments
library(vcfR)
##
## ***** *** vcfR *** *****
## This is vcfR 1.12.0
## browseVignettes('vcfR') # Documentation
## citation('vcfR') # Citation
## ***** ***** ***** *****
#we will also read in the vcf to R:
v<-read.vcfR("~/Desktop/phil.dicaeum/filtered.85.vcf")
## Scanning file to determine attributes.
## File attributes:
## meta lines: 14
## header_line: 15
## variant count: 18499
## column count: 69
##
Meta line 14 read in.
## All meta lines processed.
## gt matrix initialized.
## Character matrix gt created.
## Character matrix gt rows: 18499
## Character matrix gt cols: 69
## skip: 0
## nrows: 18499
## row_num: 0
##
Processed variant 1000
Processed variant 2000
Processed variant 3000
Processed variant 4000
Processed variant 5000
Processed variant 6000
Processed variant 7000
Processed variant 8000
Processed variant 9000
Processed variant 10000
Processed variant 11000
Processed variant 12000
Processed variant 13000
Processed variant 14000
Processed variant 15000
Processed variant 16000
Processed variant 17000
Processed variant 18000
Processed variant: 18499
## All variants processed
#also read in your sample info file
pops<-read.csv("~/Desktop/phil.dicaeum/dicaeum.sampling.csv")
#it should look roughly like this:
head(pops)
## ID Tissue Species Taxa State
## 1 D_hypoleucum_18070 18070 hypoleucum Luzon Camarines Norte
## 2 D_hypoleucum_18191 18191 hypoleucum Zamboanga
## 3 D_hypoleucum_14061 14061 hypoleucum Mindanao Dinagat
## 4 D_hypoleucum_14037 14037 hypoleucum Mindanao Dinagat
## 5 D_hypoleucum_14079 14079 hypoleucum Mindanao Dinagat
## 6 D_hypoleucum_14065 14065 hypoleucum Mindanao Dinagat
## Specificlocality
## 1 Luzon Island; Municipality Labo; Barangay Tulay na Lupa, Mt. Labo
## 2 Mindanao Island, City of Zamboanga; Brgy. Pasonanca, Pasonanca Nat. Park
## 3 Dinagat Island, Muncipality of Loreto, Brgy. Santiago, Sitio Cambinliw, Sudlon
## 4 Dinagat Island, Muncipality of Loreto, Brgy. Santiago, Sitio Cambinliw, Sudlon
## 5 Dinagat Island, Muncipality of Loreto, Brgy. Santiago, Sitio Cambinliw, Sudlon
## 6 Dinagat Island, Municipality Loreto, Brgy. Santiago, Sitio Cambinliu, Sudlon
## Latitude Longitude Altitude
## 1 14.039000 122.7860 580
## 2 7.018483 122.0273 750
## 3 10.343680 125.6181 200
## 4 10.343680 125.6181 200
## 5 10.343680 125.6181 200
## 6 10.343680 125.6181 190
#retain only samples that passed all filtering protocols (assumes that the 'ID' column is identical to sample names in the vcf)
pops<-pops[pops$ID %in% colnames(v@gt),]
#now we can make a popmap file for Dsuite
popmap<-pops[,c(1,4)]
#print popmap to screen
popmap
## ID Taxa
## 1 D_hypoleucum_18070 Luzon
## 2 D_hypoleucum_18191 Zamboanga
## 4 D_hypoleucum_14037 Mindanao
## 5 D_hypoleucum_14079 Mindanao
## 6 D_hypoleucum_14065 Mindanao
## 9 D_hypoleucum_14120 Mindanao
## 12 D_hypoleucum_18159 Zamboanga
## 13 D_hypoleucum_14075 Mindanao
## 14 D_hypoleucum_20218 Luzon
## 16 D_hypoleucum_19177 Zamboanga
## 17 D_hypoleucum_19178 Zamboanga
## 19 D_hypoleucum_20213 Luzon
## 20 D_hypoleucum_FMNH454949 Luzon
## 21 D_hypoleucum_25637 Luzon
## 22 D_hypoleucum_1271 Mindanao
## 23 D_hypoleucum_1273 Mindanao
## 24 D_hypoleucum_1275 Mindanao
## 25 D_hypoleucum_25921 Luzon
## 26 D_hypoleucum_3274 Mindanao
## 27 D_hypoleucum_26984 Luzon
## 28 D_hypoleucum_3208 Mindanao
## 29 D_hypoleucum_3158 Mindanao
## 30 D_hypoleucum_2253 Mindanao
## 31 D_hypoleucum_3095 Mindanao
## 32 D_hypoleucum_462070 Luzon
## 33 D_hypoleucum_25675 Luzon
## 35 D_hypoleucum_454950 Luzon
## 36 D_hypoleucum_25880 Luzon
## 39 D_hypoleucum_3314 Mindanao
## 40 D_hypoleucum_357608 Mindanao
## 41 D_hypoleucum_17969 Luzon
## 42 D_hypoleucum_357615 Mindanao
## 43 D_hypoleucum_357612 Mindanao
## 44 D_hypoleucum_19638 Luzon
## 45 D_hypoleucum_25868 Luzon
## 49 D_hypoleucum_25672 Luzon
## 50 D_hypoleucum_26975 Luzon
## 51 D_hypoleucum_2229 Mindanao
## 52 D_hypoleucum_2067 Mindanao
## 54 D_hypoleucum_28329 Mindanao
## 55 D_hypoleucum_28361 Mindanao
## 56 D_hypoleucum_28376 Mindanao
## 58 D_hypoleucum_2208 Mindanao
## 60 D_hypoleucum_28294 Mindanao
## 61 D_hypoleucum_1956 Mindanao
## 66 D_hypoleucum_25670 Luzon
## 68 D_hypoleucum_20921 Mindanao
## 69 D_hypoleucum_20193 Luzon
## 70 D_hypoleucum_28416 Mindanao
## 71 D_hypoleucum_18193 Zamboanga
## 73 D_hypoleucum_27450 Mindanao
## 74 D_hypoleucum_19046 Mindanao
## 75 D_hypoleucum_19136 Mindanao
## 77 D_hypoleucum_28676 Mindanao
## 78 D_hypoleucum_29945 Zamboanga
## 79 D_hypoleucum_31636 Mindanao
## 80 D_hypoleucum_29951 Zamboanga
## 81 D_hypoleucum_31644 Mindanao
## 127 D_nigrilore_KU28413 nigrilore
## 128 D_nigrilore_KU28414 nigrilore
#we are going to manually change "nigrilore" to "Outgroup"
popmap[popmap$Taxa == "nigrilore",]<-"Outgroup"
#write out this table and copy it into your cluster working directory next to your vcf (make sure to remove column and row names)
#write.table(popmap, file="~/Downloads/dicaeum.pops.txt", sep="\t", quote=F, row.names = F, col.names=F)
Visualize your results
#make a 'plot order' text file. Mine just looks like this:
Zamboanga
Mindanao
Luzon
Outgroup
##copy in ruby scripts. I already have them downloaded in another directory.
cp /home/d669d153/work/todi.subset.2022/dsuite/plot_d.rb .
cp /home/d669d153/work/todi.subset.2022/dsuite/plot_f4ratio.rb .
#If you don't, you can download them with:
wget https://raw.githubusercontent.com/millanek/tutorials/master/analysis_of_introgression_with_snp_data/src/plot_d.rb
wget https://raw.githubusercontent.com/millanek/tutorials/master/analysis_of_introgression_with_snp_data/src/plot_f4ratio.rb
##plot heatmap of D
#the text file is an input file you generated by running Dtrios, that will always end with '_BBAA.txt'
ruby plot_d.rb pops_all.samples_BBAA.txt plot_order.txt 0.7 all.samples_BBAA_D.svg
##plot heatmap of f4
ruby plot_f4ratio.rb pops_all.samples_BBAA.txt plot_order.txt 0.2 all.samples_BBAA_f4ratio.svg
##run fbranch on the guidetree, with the Dtrios output ending in 'tree.txt' (guidetree must be rooted on #outgroup)
/home/d669d153/work/Dsuite/Build/Dsuite Fbranch todi.nwk.tre pops_all.samples_tree.txt > unlinked_Fbranch.txt
#activate python
module load python
#plot the fbranch heatmap with guidetree above
/home/d669d153/work/Dsuite/utils/dtools.py all.samples_Fbranch.txt nwk.tre
check out results
#heatmap of D
knitr::include_graphics("/Users/devder/Desktop/phil.dicaeum/dsuite/unlinked_BBAA_D.svg")

#heatmap of F4
knitr::include_graphics("/Users/devder/Desktop/phil.dicaeum/dsuite/unlinked_BBAA_f4ratio.svg")

#plot of fbranch statistic
knitr::include_graphics("/Users/devder/Desktop/phil.dicaeum/dsuite/fbranch.svg")

Now we are going to repeat the above exercise with samples from
Zamboanga removed
#remove the five samples from Mt. busa from the vcf
v.nobusa<-v[,colnames(v@gt)!="D_hypoleucum_2253" & colnames(v@gt)!="D_hypoleucum_2229" & colnames(v@gt)!="D_hypoleucum_2067" & colnames(v@gt)!="D_hypoleucum_2208" & colnames(v@gt)!="D_hypoleucum_1956"]
#remove SNPs that have become invariant because of sample removal
library(SNPfiltR)
## This is SNPfiltR v.1.0.0
##
## Detailed usage information is available at: devonderaad.github.io/SNPfiltR/
##
## If you use SNPfiltR in your published work, please cite the following papers:
##
## DeRaad, D.A. (2022), SNPfiltR: an R package for interactive and reproducible SNP filtering. Molecular Ecology Resources, 00, 1-15. http://doi.org/10.1111/1755-0998.13618
##
## Knaus, Brian J., and Niklaus J. Grunwald. 2017. VCFR: a package to manipulate and visualize variant call format data in R. Molecular Ecology Resources, 17.1:44-53. http://doi.org/10.1111/1755-0998.12549
##
## Attaching package: 'SNPfiltR'
## The following object is masked _by_ '.GlobalEnv':
##
## popmap
v.nobusa<-min_mac(v.nobusa, min.mac = 1)
## 5.01% of SNPs fell below a minor allele count of 1 and were removed from the VCF

#check vcf
v.nobusa
## ***** Object of Class vcfR *****
## 55 samples
## 2586 CHROMs
## 17,573 variants
## Object size: 80.6 Mb
## 11.05 percent missing data
## ***** ***** *****
#write out
#vcfR::write.vcf(v.nobusa, "~/Desktop/phil.dicaeum/vcf.nomtbusa.vcf.gz")
#update the popmap
popmap<-popmap[c(popmap$ID!="D_hypoleucum_2253" & popmap$ID!="D_hypoleucum_2229" & popmap$ID!="D_hypoleucum_2067" & popmap$ID!="D_hypoleucum_2208" & popmap$ID!="D_hypoleucum_1956"),]
#write out
#write.table(popmap, file="~/Downloads/dicaeum.nomtbusa.pops.txt", sep="\t", quote=F, row.names = F, col.names=F)
#copy both of these to a subfolder eg:
#scp -r /Users/devder/Desktop/phil.dicaeum/vcf.nomtbusa.vcf d669d153@hpc.crc.ku.edu:/home/d669d153/work/phil.dicaeum/dsuite/nomtbusa/
#no need to update the tree, as the tips haven't changed, we've just removed a few samples from one of the tips
#just copy the tree file into your new subdirectory for this run
Visualize your results
#use the same 'plot order' text file
##copy in ruby scripts. I already have them downloaded in another directory.
cp /home/d669d153/work/todi.subset.2022/dsuite/plot_d.rb .
cp /home/d669d153/work/todi.subset.2022/dsuite/plot_f4ratio.rb .
##plot heatmap of D
#the text file is an input file you generated by running Dtrios, that will always end with '_BBAA.txt'
ruby plot_d.rb pops_no.mtbusa_BBAA.txt plot_order.txt 0.7 no.mtbusa_BBAA_D.svg
##plot heatmap of f4
ruby plot_f4ratio.rb pops_no.mtbusa_BBAA.txt plot_order.txt 0.2 no.mtbusa_BBAA_f4ratio.svg
##run fbranch on the guidetree, with the Dtrios output ending in 'tree.txt' (guidetree must be rooted on #outgroup)
/home/d669d153/work/Dsuite/Build/Dsuite Fbranch todi.nwk.tre pops_no.mtbusa_tree.txt > unlinked_Fbranch.txt
#activate python
module load python
#plot the fbranch heatmap with guidetree above
/home/d669d153/work/Dsuite/utils/dtools.py no.mtbusa_Fbranch.txt nwk.tre
check out results
#heatmap of D
knitr::include_graphics("/Users/devder/Desktop/phil.dicaeum/dsuite/nomtbusa/unlinked_BBAA_D.svg")

#heatmap of F4
knitr::include_graphics("/Users/devder/Desktop/phil.dicaeum/dsuite/nomtbusa/unlinked_BBAA_f4ratio.svg")

#plot of fbranch statistic
knitr::include_graphics("/Users/devder/Desktop/phil.dicaeum/dsuite/nomtbusa/fbranch.svg")
