using your filtered vcf file to generate a phylip with seuqnece data
for all filtered loci (including invariant sites), for only the samples
that passed all filtering protocols
library(vcfR)
##
## ***** *** vcfR *** *****
## This is vcfR 1.12.0
## browseVignettes('vcfR') # Documentation
## citation('vcfR') # Citation
## ***** ***** ***** *****
#read in vcf with filtered, unlinked loci that we want to use for gene tree estimation
vcfR<-read.vcfR("~/Desktop/phil.dicaeum/filtered.85.unlinked.vcf")
## Scanning file to determine attributes.
## File attributes:
## meta lines: 14
## header_line: 15
## variant count: 2590
## column count: 69
##
Meta line 14 read in.
## All meta lines processed.
## gt matrix initialized.
## Character matrix gt created.
## Character matrix gt rows: 2590
## Character matrix gt cols: 69
## skip: 0
## nrows: 2590
## row_num: 0
##
Processed variant 1000
Processed variant 2000
Processed variant: 2590
## All variants processed
#check out the metadata
head(vcfR@fix)
## CHROM POS ID REF ALT QUAL FILTER INFO
## [1,] "1" "24" NA "G" "T" NA "PASS" "NS=51;AF=0.010"
## [2,] "7" "10" NA "G" "T" NA "PASS" "NS=58;AF=0.009"
## [3,] "8" "14" NA "C" "T" NA "PASS" "NS=56;AF=0.027"
## [4,] "10" "7" NA "C" "T" NA "PASS" "NS=56;AF=0.018"
## [5,] "11" "21" NA "A" "G" NA "PASS" "NS=55;AF=0.009"
## [6,] "12" "57" NA "G" "A" NA "PASS" "NS=54;AF=0.009"
#we want to isolate the third column, which contains the name of each locus
head(vcfR@fix[,1])
## [1] "1" "7" "8" "10" "11" "12"
#get a list of just locus names to use as whitelist for stacks
whitelist<-sub(":.*", "", vcfR@fix[,1])
#make sure each locus is unique
length(unique(whitelist)) == length(whitelist)
## [1] TRUE
#make sure the locus names look right
whitelist[1:5]
## [1] "1" "7" "8" "10" "11"
#write out whitelist for stacks
#write.table(whitelist, file = "~/Desktop/phil.dicaeum/2590.whitelist.txt", quote = F, row.names = F, col.names = F)
#generate popmap including only the samples in this filtered vcf file, assigning each sample to a unique pop so that we keep all tips distinct
#but phylip format limits names to 10 characters or less! So don't forget to make population names less than 10 characters
#here I use this code to do that:
colnames(vcfR@gt)[-1]
## [1] "D_hypoleucum_18070" "D_hypoleucum_18191"
## [3] "D_hypoleucum_14037" "D_hypoleucum_14079"
## [5] "D_hypoleucum_14065" "D_hypoleucum_14120"
## [7] "D_hypoleucum_18159" "D_hypoleucum_14075"
## [9] "D_hypoleucum_20218" "D_hypoleucum_19177"
## [11] "D_hypoleucum_19178" "D_hypoleucum_20213"
## [13] "D_hypoleucum_FMNH454949" "D_hypoleucum_25637"
## [15] "D_hypoleucum_1271" "D_hypoleucum_1273"
## [17] "D_hypoleucum_1275" "D_hypoleucum_25921"
## [19] "D_hypoleucum_3274" "D_hypoleucum_26984"
## [21] "D_hypoleucum_3208" "D_hypoleucum_3158"
## [23] "D_hypoleucum_2253" "D_hypoleucum_3095"
## [25] "D_hypoleucum_462070" "D_hypoleucum_25675"
## [27] "D_hypoleucum_454950" "D_hypoleucum_25880"
## [29] "D_hypoleucum_3314" "D_hypoleucum_357608"
## [31] "D_hypoleucum_357615" "D_hypoleucum_357612"
## [33] "D_hypoleucum_19638" "D_hypoleucum_25868"
## [35] "D_hypoleucum_17969" "D_hypoleucum_25672"
## [37] "D_hypoleucum_26975" "D_hypoleucum_2229"
## [39] "D_hypoleucum_2067" "D_hypoleucum_28329"
## [41] "D_hypoleucum_28361" "D_hypoleucum_28376"
## [43] "D_hypoleucum_2208" "D_hypoleucum_28294"
## [45] "D_hypoleucum_1956" "D_hypoleucum_25670"
## [47] "D_hypoleucum_20921" "D_hypoleucum_20193"
## [49] "D_hypoleucum_28416" "D_hypoleucum_18193"
## [51] "D_hypoleucum_27450" "D_hypoleucum_19046"
## [53] "D_hypoleucum_19136" "D_hypoleucum_28676"
## [55] "D_nigrilore_KU28413" "D_nigrilore_KU28414"
## [57] "D_hypoleucum_29945" "D_hypoleucum_31636"
## [59] "D_hypoleucum_29951" "D_hypoleucum_31644"
gsub("D_hypoleucum_","hyp",colnames(vcfR@gt)[-1])
## [1] "hyp18070" "hyp18191" "hyp14037"
## [4] "hyp14079" "hyp14065" "hyp14120"
## [7] "hyp18159" "hyp14075" "hyp20218"
## [10] "hyp19177" "hyp19178" "hyp20213"
## [13] "hypFMNH454949" "hyp25637" "hyp1271"
## [16] "hyp1273" "hyp1275" "hyp25921"
## [19] "hyp3274" "hyp26984" "hyp3208"
## [22] "hyp3158" "hyp2253" "hyp3095"
## [25] "hyp462070" "hyp25675" "hyp454950"
## [28] "hyp25880" "hyp3314" "hyp357608"
## [31] "hyp357615" "hyp357612" "hyp19638"
## [34] "hyp25868" "hyp17969" "hyp25672"
## [37] "hyp26975" "hyp2229" "hyp2067"
## [40] "hyp28329" "hyp28361" "hyp28376"
## [43] "hyp2208" "hyp28294" "hyp1956"
## [46] "hyp25670" "hyp20921" "hyp20193"
## [49] "hyp28416" "hyp18193" "hyp27450"
## [52] "hyp19046" "hyp19136" "hyp28676"
## [55] "D_nigrilore_KU28413" "D_nigrilore_KU28414" "hyp29945"
## [58] "hyp31636" "hyp29951" "hyp31644"
#make popmap
p<-gsub("D_hypoleucum_","hyp",colnames(vcfR@gt)[-1])
p<-gsub("D_nigrilore_","ni",p)
df<-data.frame(ind=colnames(vcfR@gt)[-1],pop=p)
#write out popmap for stacks
#write.table(df, file = "~/Desktop/phil.dicaeum/iqtree.popmap.txt", quote = F, row.names = F, col.names = F, sep = "\t")
copy both of those files you just wrote to you local disk into your
project directory on the cluster
visualize your tree using the figtree GUI
#start by copying in the entire directory where you ran iqtree to your local machine, e.g.,
#scp -r d669d153@hpc.crc.ku.edu:/home/d669d153/work/phil.dicaeum/iqtree /Users/devder/Desktop/phil.dicaeum/
#open the tree in figtree
#unrooted tree looks like this:
knitr::include_graphics("/Users/devder/Desktop/phil.dicaeum/unrooted.iqtree.png")

#After manually rooting on the outgroup, color coding internal branches based on bootstrap support values, adding key bootstrap values directly, and labeling clades
knitr::include_graphics("/Users/devder/Desktop/phil.dicaeum/iqtree/iqtree.png")
