iqtree

using your filtered vcf file to generate a phylip with seuqnece data for all filtered loci (including invariant sites), for only the samples that passed all filtering protocols

library(vcfR)

## 
##    *****       ***   vcfR   ***       *****
##    This is vcfR 1.12.0 
##      browseVignettes('vcfR') # Documentation
##      citation('vcfR') # Citation
##    *****       *****      *****       *****

#read in vcf with filtered, unlinked loci that we want to use for gene tree estimation
vcfR<-read.vcfR("~/Desktop/phil.dicaeum/filtered.85.unlinked.vcf")

## Scanning file to determine attributes.
## File attributes:
##   meta lines: 14
##   header_line: 15
##   variant count: 2590
##   column count: 69
## 
Meta line 14 read in.
## All meta lines processed.
## gt matrix initialized.
## Character matrix gt created.
##   Character matrix gt rows: 2590
##   Character matrix gt cols: 69
##   skip: 0
##   nrows: 2590
##   row_num: 0
## 
Processed variant 1000
Processed variant 2000
Processed variant: 2590
## All variants processed

#check out the metadata
head(vcfR@fix)

##      CHROM POS  ID REF ALT QUAL FILTER INFO            
## [1,] "1"   "24" NA "G" "T" NA   "PASS" "NS=51;AF=0.010"
## [2,] "7"   "10" NA "G" "T" NA   "PASS" "NS=58;AF=0.009"
## [3,] "8"   "14" NA "C" "T" NA   "PASS" "NS=56;AF=0.027"
## [4,] "10"  "7"  NA "C" "T" NA   "PASS" "NS=56;AF=0.018"
## [5,] "11"  "21" NA "A" "G" NA   "PASS" "NS=55;AF=0.009"
## [6,] "12"  "57" NA "G" "A" NA   "PASS" "NS=54;AF=0.009"

#we want to isolate the third column, which contains the name of each locus
head(vcfR@fix[,1])

## [1] "1"  "7"  "8"  "10" "11" "12"

#get a list of just locus names to use as whitelist for stacks
whitelist<-sub(":.*", "", vcfR@fix[,1])
#make sure each locus is unique
length(unique(whitelist)) == length(whitelist)

## [1] TRUE

#make sure the locus names look right
whitelist[1:5]

## [1] "1"  "7"  "8"  "10" "11"

#write out whitelist for stacks
#write.table(whitelist, file = "~/Desktop/phil.dicaeum/2590.whitelist.txt", quote = F, row.names = F, col.names = F)

#generate popmap including only the samples in this filtered vcf file, assigning each sample to a unique pop so that we keep all tips distinct
#but phylip format limits names to 10 characters or less! So don't forget to make population names less than 10 characters
#here I use this code to do that:
colnames(vcfR@gt)[-1]

##  [1] "D_hypoleucum_18070"      "D_hypoleucum_18191"     
##  [3] "D_hypoleucum_14037"      "D_hypoleucum_14079"     
##  [5] "D_hypoleucum_14065"      "D_hypoleucum_14120"     
##  [7] "D_hypoleucum_18159"      "D_hypoleucum_14075"     
##  [9] "D_hypoleucum_20218"      "D_hypoleucum_19177"     
## [11] "D_hypoleucum_19178"      "D_hypoleucum_20213"     
## [13] "D_hypoleucum_FMNH454949" "D_hypoleucum_25637"     
## [15] "D_hypoleucum_1271"       "D_hypoleucum_1273"      
## [17] "D_hypoleucum_1275"       "D_hypoleucum_25921"     
## [19] "D_hypoleucum_3274"       "D_hypoleucum_26984"     
## [21] "D_hypoleucum_3208"       "D_hypoleucum_3158"      
## [23] "D_hypoleucum_2253"       "D_hypoleucum_3095"      
## [25] "D_hypoleucum_462070"     "D_hypoleucum_25675"     
## [27] "D_hypoleucum_454950"     "D_hypoleucum_25880"     
## [29] "D_hypoleucum_3314"       "D_hypoleucum_357608"    
## [31] "D_hypoleucum_357615"     "D_hypoleucum_357612"    
## [33] "D_hypoleucum_19638"      "D_hypoleucum_25868"     
## [35] "D_hypoleucum_17969"      "D_hypoleucum_25672"     
## [37] "D_hypoleucum_26975"      "D_hypoleucum_2229"      
## [39] "D_hypoleucum_2067"       "D_hypoleucum_28329"     
## [41] "D_hypoleucum_28361"      "D_hypoleucum_28376"     
## [43] "D_hypoleucum_2208"       "D_hypoleucum_28294"     
## [45] "D_hypoleucum_1956"       "D_hypoleucum_25670"     
## [47] "D_hypoleucum_20921"      "D_hypoleucum_20193"     
## [49] "D_hypoleucum_28416"      "D_hypoleucum_18193"     
## [51] "D_hypoleucum_27450"      "D_hypoleucum_19046"     
## [53] "D_hypoleucum_19136"      "D_hypoleucum_28676"     
## [55] "D_nigrilore_KU28413"     "D_nigrilore_KU28414"    
## [57] "D_hypoleucum_29945"      "D_hypoleucum_31636"     
## [59] "D_hypoleucum_29951"      "D_hypoleucum_31644"

gsub("D_hypoleucum_","hyp",colnames(vcfR@gt)[-1])

##  [1] "hyp18070"            "hyp18191"            "hyp14037"           
##  [4] "hyp14079"            "hyp14065"            "hyp14120"           
##  [7] "hyp18159"            "hyp14075"            "hyp20218"           
## [10] "hyp19177"            "hyp19178"            "hyp20213"           
## [13] "hypFMNH454949"       "hyp25637"            "hyp1271"            
## [16] "hyp1273"             "hyp1275"             "hyp25921"           
## [19] "hyp3274"             "hyp26984"            "hyp3208"            
## [22] "hyp3158"             "hyp2253"             "hyp3095"            
## [25] "hyp462070"           "hyp25675"            "hyp454950"          
## [28] "hyp25880"            "hyp3314"             "hyp357608"          
## [31] "hyp357615"           "hyp357612"           "hyp19638"           
## [34] "hyp25868"            "hyp17969"            "hyp25672"           
## [37] "hyp26975"            "hyp2229"             "hyp2067"            
## [40] "hyp28329"            "hyp28361"            "hyp28376"           
## [43] "hyp2208"             "hyp28294"            "hyp1956"            
## [46] "hyp25670"            "hyp20921"            "hyp20193"           
## [49] "hyp28416"            "hyp18193"            "hyp27450"           
## [52] "hyp19046"            "hyp19136"            "hyp28676"           
## [55] "D_nigrilore_KU28413" "D_nigrilore_KU28414" "hyp29945"           
## [58] "hyp31636"            "hyp29951"            "hyp31644"

#make popmap
p<-gsub("D_hypoleucum_","hyp",colnames(vcfR@gt)[-1])
p<-gsub("D_nigrilore_","ni",p)
df<-data.frame(ind=colnames(vcfR@gt)[-1],pop=p)

#write out popmap for stacks
#write.table(df, file = "~/Desktop/phil.dicaeum/iqtree.popmap.txt", quote = F, row.names = F, col.names = F, sep = "\t")

copy both of those files you just wrote to you local disk into your project directory on the cluster

use this bash code to generate the input phylip file you need to use for treebuilding

#run this bash code in a terminal window in the directory where you want the output to be (should be fast enough that you don't need to submit a job), specifying the path to where you ran your optimized Stacks iteration to the -P flag
#(whitelist includes the loci you want to keep and popmap includes the samples you want to keep based on filtering)
#--phylip-var-all flag indicates to output the phylip including invariant sites (best for phylogenetic reconstruction)
/home/d669d153/work/stacks-2.41/populations -P /home/d669d153/work/phil.dicaeum/stacks_n8 -O . -M iqtree.popmap.txt --whitelist 2590.whitelist.txt --phylip-var-all

#annoyingly, Stacks adds a line at the end of the file that says something like '# Stacks v2.41;  Phylip interleaved; December 02, 2022'
#this is not standard for a phylip file, and will cause an error if you try to use this file to build a tree. Luckily, you can use the following sed one-liner below to easily remove this trailing line and write the cleaned output to a new file
sed '/^#/ d' populations.all.phylip > pops.phy

now simply submit this job to determine the optimal model of sequence evolution and generate a concatenated maximum likelihood tree for your dataset using the optimized model and calculating bootstrap support for internal branches

#!/bin/sh
#
#SBATCH --job-name=iqtree               # Job Name
#SBATCH --nodes=1             # 40 nodes
#SBATCH --ntasks-per-node=15               # 40 CPU allocation per Task
#SBATCH --partition=sixhour         # Name of the Slurm partition used
#SBATCH --chdir=/home/d669d153/work/phil.dicaeum/iqtree  # Set working d$
#SBATCH --mem-per-cpu=2gb            # memory requested
#SBATCH --time=360

#-s specifies the input sequence data
#-m MFP specifies to perform model testing and use the best model of sequence evolution
#-bb specifies performing 1000 ultrafast bootstraps to assess support
#-nt AUTO allows the program to use the optimal number of threads (15 specified here)
/home/d669d153/work/iqtree-2.2.0-Linux/bin/iqtree2 -s pops.phy -m MFP -bb 1000 -nt AUTO

this should only take a couple of hours to finish

visualize your tree using the figtree GUI

#start by copying in the entire directory where you ran iqtree to your local machine, e.g.,
#scp -r d669d153@hpc.crc.ku.edu:/home/d669d153/work/phil.dicaeum/iqtree /Users/devder/Desktop/phil.dicaeum/
#open the tree in figtree
#unrooted tree looks like this:
knitr::include_graphics("/Users/devder/Desktop/phil.dicaeum/unrooted.iqtree.png")

#After manually rooting on the outgroup, color coding internal branches based on bootstrap support values, adding key bootstrap values directly, and labeling clades
knitr::include_graphics("/Users/devder/Desktop/phil.dicaeum/iqtree/iqtree.png")

iqtree

Devon DeRaad

2022-12-02

using your filtered vcf file to generate a phylip with seuqnece data for all filtered loci (including invariant sites), for only the samples that passed all filtering protocols

copy both of those files you just wrote to you local disk into your project directory on the cluster

use this bash code to generate the input phylip file you need to use for treebuilding

now simply submit this job to determine the optimal model of sequence evolution and generate a concatenated maximum likelihood tree for your dataset using the optimized model and calculating bootstrap support for internal branches

this should only take a couple of hours to finish

visualize your tree using the figtree GUI