run.ASTRAL-III

Much of this approach is conceptually based off of the excellent tutorial from Michael Matschiner (https://github.com/mmatschiner/tutorials/blob/master/ml_species_tree_inference/README.md#astral). I highly suggest checking this tutorial out, especially if this is your first time building species trees.

#workflow describing the process of generating individual gene trees when you did your genotype filtering in R
#start by going back to gstacks to get your whole locus sequences (same as iqtree)
library(ape)
library(vcfR)

## 
##    *****       ***   vcfR   ***       *****
##    This is vcfR 1.12.0 
##      browseVignettes('vcfR') # Documentation
##      citation('vcfR') # Citation
##    *****       *****      *****       *****

#read in vcf with filtered, unlinked loci that we want to use for gene tree estimation
vcfR<-read.vcfR("~/Desktop/phil.dicaeum/filtered.85.unlinked.vcf")

## Scanning file to determine attributes.
## File attributes:
##   meta lines: 14
##   header_line: 15
##   variant count: 2590
##   column count: 69
## 
Meta line 14 read in.
## All meta lines processed.
## gt matrix initialized.
## Character matrix gt created.
##   Character matrix gt rows: 2590
##   Character matrix gt cols: 69
##   skip: 0
##   nrows: 2590
##   row_num: 0
## 
Processed variant 1000
Processed variant 2000
Processed variant: 2590
## All variants processed

#check out the metadata
head(vcfR@fix)

##      CHROM POS  ID REF ALT QUAL FILTER INFO            
## [1,] "1"   "24" NA "G" "T" NA   "PASS" "NS=51;AF=0.010"
## [2,] "7"   "10" NA "G" "T" NA   "PASS" "NS=58;AF=0.009"
## [3,] "8"   "14" NA "C" "T" NA   "PASS" "NS=56;AF=0.027"
## [4,] "10"  "7"  NA "C" "T" NA   "PASS" "NS=56;AF=0.018"
## [5,] "11"  "21" NA "A" "G" NA   "PASS" "NS=55;AF=0.009"
## [6,] "12"  "57" NA "G" "A" NA   "PASS" "NS=54;AF=0.009"

#we want to isolate the third column, which contains the name of each locus
head(vcfR@fix[,1])

## [1] "1"  "7"  "8"  "10" "11" "12"

#get a list of just locus names to use as whitelist for stacks
whitelist<-sub(":.*", "", vcfR@fix[,1])
#make sure each locus is unique
length(unique(whitelist)) == length(whitelist)

## [1] TRUE

#write out whitelist for stacks
#write.table(whitelist, file = "~/Desktop/phil.dicaeum/2590.whitelist.txt", quote = F, row.names = F, col.names = F)

#generate popmap including only the samples in this filtered vcf file, assigning each sample to a unique pop so that we keep all tips distinct
#but phylip format limits names to 10 characters or less! So don't forget to make population names less than 10 characters
#here I use this code to do that:
colnames(vcfR@gt)[-1]

##  [1] "D_hypoleucum_18070"      "D_hypoleucum_18191"     
##  [3] "D_hypoleucum_14037"      "D_hypoleucum_14079"     
##  [5] "D_hypoleucum_14065"      "D_hypoleucum_14120"     
##  [7] "D_hypoleucum_18159"      "D_hypoleucum_14075"     
##  [9] "D_hypoleucum_20218"      "D_hypoleucum_19177"     
## [11] "D_hypoleucum_19178"      "D_hypoleucum_20213"     
## [13] "D_hypoleucum_FMNH454949" "D_hypoleucum_25637"     
## [15] "D_hypoleucum_1271"       "D_hypoleucum_1273"      
## [17] "D_hypoleucum_1275"       "D_hypoleucum_25921"     
## [19] "D_hypoleucum_3274"       "D_hypoleucum_26984"     
## [21] "D_hypoleucum_3208"       "D_hypoleucum_3158"      
## [23] "D_hypoleucum_2253"       "D_hypoleucum_3095"      
## [25] "D_hypoleucum_462070"     "D_hypoleucum_25675"     
## [27] "D_hypoleucum_454950"     "D_hypoleucum_25880"     
## [29] "D_hypoleucum_3314"       "D_hypoleucum_357608"    
## [31] "D_hypoleucum_357615"     "D_hypoleucum_357612"    
## [33] "D_hypoleucum_19638"      "D_hypoleucum_25868"     
## [35] "D_hypoleucum_17969"      "D_hypoleucum_25672"     
## [37] "D_hypoleucum_26975"      "D_hypoleucum_2229"      
## [39] "D_hypoleucum_2067"       "D_hypoleucum_28329"     
## [41] "D_hypoleucum_28361"      "D_hypoleucum_28376"     
## [43] "D_hypoleucum_2208"       "D_hypoleucum_28294"     
## [45] "D_hypoleucum_1956"       "D_hypoleucum_25670"     
## [47] "D_hypoleucum_20921"      "D_hypoleucum_20193"     
## [49] "D_hypoleucum_28416"      "D_hypoleucum_18193"     
## [51] "D_hypoleucum_27450"      "D_hypoleucum_19046"     
## [53] "D_hypoleucum_19136"      "D_hypoleucum_28676"     
## [55] "D_nigrilore_KU28413"     "D_nigrilore_KU28414"    
## [57] "D_hypoleucum_29945"      "D_hypoleucum_31636"     
## [59] "D_hypoleucum_29951"      "D_hypoleucum_31644"

gsub("D_hypoleucum_","hyp",colnames(vcfR@gt)[-1])

##  [1] "hyp18070"            "hyp18191"            "hyp14037"           
##  [4] "hyp14079"            "hyp14065"            "hyp14120"           
##  [7] "hyp18159"            "hyp14075"            "hyp20218"           
## [10] "hyp19177"            "hyp19178"            "hyp20213"           
## [13] "hypFMNH454949"       "hyp25637"            "hyp1271"            
## [16] "hyp1273"             "hyp1275"             "hyp25921"           
## [19] "hyp3274"             "hyp26984"            "hyp3208"            
## [22] "hyp3158"             "hyp2253"             "hyp3095"            
## [25] "hyp462070"           "hyp25675"            "hyp454950"          
## [28] "hyp25880"            "hyp3314"             "hyp357608"          
## [31] "hyp357615"           "hyp357612"           "hyp19638"           
## [34] "hyp25868"            "hyp17969"            "hyp25672"           
## [37] "hyp26975"            "hyp2229"             "hyp2067"            
## [40] "hyp28329"            "hyp28361"            "hyp28376"           
## [43] "hyp2208"             "hyp28294"            "hyp1956"            
## [46] "hyp25670"            "hyp20921"            "hyp20193"           
## [49] "hyp28416"            "hyp18193"            "hyp27450"           
## [52] "hyp19046"            "hyp19136"            "hyp28676"           
## [55] "D_nigrilore_KU28413" "D_nigrilore_KU28414" "hyp29945"           
## [58] "hyp31636"            "hyp29951"            "hyp31644"

#make popmap
p<-gsub("D_hypoleucum_","hyp",colnames(vcfR@gt)[-1])
p<-gsub("D_nigrilore_","ni",p)
df<-data.frame(ind=colnames(vcfR@gt)[-1],pop=p)

#write out popmap for stacks
#write.table(df, file = "~/Desktop/phil.dicaeum/iqtree.popmap.txt", quote = F, row.names = F, col.names = F, sep = "\t")

copy both of those files you just wrote to you local disk into your project directory on the cluster

Run this bash code on the cluster where you performed your Stacks assembly

#run this bash code in a terminal window in the directory where you want the output to be (should be fast enough that you don't need to submit a job), specifying the path to where you ran your optimized Stacks iteration to the -P flag
#(whitelist includes the loci you want to keep and popmap includes the samples you want to keep based on filtering)
#--phylip-var-all flag indicates to output the phylip including invariant sites (best for phylogenetic reconstruction)
/home/d669d153/work/stacks-2.41/populations -P /home/d669d153/work/phil.dicaeum/stacks_n8 -O . -M iqtree.popmap.txt --whitelist 2590.whitelist.txt --phylip-var-all

#annoyingly, Stacks adds a line at the end of the file that says something like '# Stacks v2.41;  Phylip interleaved; December 02, 2022'
#this is not standard for a phylip file, and will cause an error if you try to use this file to build a tree. Luckily, you can use the following sed one-liner below to easily remove this trailing line and write the cleaned output to a new file
sed '/^#/ d' populations.all.phylip > pops.phy

now copy the resulting phylip and partition files into your local working directory

Read the resulting phylip into R

#read in the phylip with all loci concatenated
dna<-read.dna(file = "~/Desktop/phil.dicaeum/iqtree/pops.phy")
as.list(dna)

## 60 DNA sequences in binary format stored in a list.
## 
## All sequences of same length: 247982 
## 
## Labels:
## hyp18070
## hyp18191
## hyp14037
## hyp14079
## hyp14065
## hyp14120
## ...
## 
## More than 10 million bases: not printing base composition.
## (Total: 14.88 Mb)

dim(dna)

## [1]     60 247982

head(dna)

## 6 DNA sequences in binary format stored in a matrix.
## 
## All sequences of same length: 247982 
## 
## Labels:
## hyp18070
## hyp18191
## hyp14037
## hyp14079
## hyp14065
## hyp14120
## 
## Base composition:
##     a     c     g     t 
## 0.309 0.178 0.192 0.321 
## (Total: 1.49 Mb)

#read in the partition file generated by Stacks
parts<-read.table("~/Desktop/phil.dicaeum/iqtree/populations.all.partitions.phylip", sep = ",")
head(parts)

##    V1          V2
## 1 DNA     p1=1-96
## 2 DNA   p2=97-196
## 3 DNA  p3=197-293
## 4 DNA  p4=294-388
## 5 DNA  p5=389-487
## 6 DNA  p6=488-582

#we can see that splitting based on the comma delimiter turns this text file into a two column dataframe
#we will use some regular expressions to clean up this dataframe so that we can use it to split our phylip into individual locus alignments
parts$parts<-sub("=.*","",parts$V2)
parts$begin<-sub(".*=","",parts$V2)
parts$begin<-sub("-.*","",parts$begin)
parts$end<-sub(".*=","",parts$V2)
parts$end<-sub(".*-","",parts$end)
head(parts)

##    V1          V2 parts begin end
## 1 DNA     p1=1-96    p1     1  96
## 2 DNA   p2=97-196    p2    97 196
## 3 DNA  p3=197-293    p3   197 293
## 4 DNA  p4=294-388    p4   294 388
## 5 DNA  p5=389-487    p5   389 487
## 6 DNA  p6=488-582    p6   488 582

#now we can see that we have separate columns describing the name of the partion (parts), the first base for that partition (begin) and the last base for that partition (end)

#uncomment and run this the first time you follow this tutorial. Once the fasta files are created, you can comment this out, as it only needs to be run once
#write a for loop that pulls out one partition at a time from the phylip and writes it to a local folder as a distinct fasta file
#for (i in 1:nrow(parts)){
#  locus<- dna[,c(parts$begin[i]:parts$end[i])]
#  #you will need to change this path to make it point to an empty directory where you want these fastas dumped
#  write.FASTA(x = locus, file=paste0("~/Desktop/phil.dicaeum/astral/fastas/locus.",i,".fasta"))
#  print(i)
#}

#make sure those files were created in the directory we want them in
list.files(path = "~/Desktop/phil.dicaeum/astral/fastas")[1:5] #print first 5 files in this directory

## [1] "locus.1.fasta"    "locus.10.fasta"   "locus.100.fasta"  "locus.1000.fasta"
## [5] "locus.1001.fasta"

convert the fasta alignments you just wrote out into nexus format in a local terminal window

#following steps come from: https://github.com/mmatschiner/tutorials/blob/master/ml_species_tree_inference/README.md#astral
#To remove sequences that contain only missing information from all alignments, and at the same time translate all alignments into Nexus format,
#use the Python script 'convert.py' (https://github.com/mmatschiner/tutorials/blob/master/ml_species_tree_inference/src/convert.py) in a for loop with the following commands: (run in terminal, one level above the directory you created to hold your fasta files in the previous chunk)

#necessary python script can easily be pulled down from github into your working directory by running:
#wget https://raw.githubusercontent.com/mmatschiner/tutorials/master/ml_species_tree_inference/src/convert.py

#make directory to hold nexus files
mkdir nex

#execute python script in a loop
for i in fastas/*.fasta
do
gene_id=`basename ${i%.fasta}`
python3 convert.py ${i} nex/${gene_id}.nex -f nexus -m 0.9
done

#run this in the nexus directory to make sure that missing data is correctly recognized as being encoded by 'N' rather than '?'
cd nex
for i in *
  do
sed -i '' 's/ missing=?/ missing=N/' $i
done

#make sure a random input nexus alignment looks good
cat locus.55.nex

Now copy this entire ‘nex’ directory onto the cluster, so that each nexus alignment can be read into IQ-TREE. Then, use IQ-TREE to generate a maximum-likelihood gene tree for each alignment on the cluster.

start IQ-TREE in a loop so that it analyzes one gene alignment after the other, use the following script: (run on the cluster, 2725 gene trees takes ~8 hours to generate on my laptop)

#!/bin/sh
#
#SBATCH --job-name=dicaeum.gene.trees               # Job Name
#SBATCH --nodes=1             # 40 nodes
#SBATCH --ntasks-per-node=1               # 1 CPU allocation per Task
#SBATCH --partition=bi            # Name of the Slurm partition used
#SBATCH --chdir=/home/d669d153/work/phil.dicaeum/astral     # Set working d$
#SBATCH --mem-per-cpu=1gb            # memory requested
#SBATCH --time=10000

#To be able to later use bootstrapping in ASTRAL, we will also generate sets of trees from bootstrapped gene alignments in the same IQ-TREE analysis, by using the option -B 
#we do not specify a substition model with the -m option and therefore allow IQ-TREE to automatically select the best-fitting model.
#we will now ensure that the bootstrap trees are actually written to a file, and not just summarized in the output, by specifying the option --wbt.
for i in /home/d669d153/work/phil.dicaeum/astral/nex/*.nex
do
/home/d669d153/work/iqtree-2.2.0-Linux/bin/iqtree2 -s ${i} -bb 1000 -wbt
done

alternatively, you can try using an array to infer all gene trees simultaneously. This creates a lot of jobs and can create a lot of output files so be careful of setting your array to the exact number of loci you have, and watch where you’re executing this script, and your exact paths, if you choose to try it.

#!/bin/sh
#
#SBATCH --job-name=dicaeum.gene.trees               # Job Name
#SBATCH --nodes=1             # 40 nodes
#SBATCH --ntasks-per-node=1               # 1 CPU allocation per Task
#SBATCH --partition=sixhour            # Name of the Slurm partition used
#SBATCH --chdir=/home/d669d153/work/phil.dicaeum/astral     # Set working d$
#SBATCH --mem-per-cpu=1gb            # memory requested
#SBATCH --array=1-2590
#SBATCH --time=60

#before submitting the entire array, test that this works as you hope:
#/home/d669d153/work/iqtree-2.2.0-Linux/bin/iqtree2 -s /home/d669d153/work/phil.dicaeum/astral/nex/1.nex -bb 1000 -wbt

#To be able to later use bootstrapping in ASTRAL, we will also generate sets of trees from bootstrapped gene alignments in the same IQ-TREE analysis, by using the option -B 
#we do not specify a substition model with the -m option and therefore allow IQ-TREE to automatically select the best-fitting model.
#we will now ensure that the bootstrap trees are actually written to a file, and not just summarized in the output, by specifying the option --wbt.
/home/d669d153/work/iqtree-2.2.0-Linux/bin/iqtree2 -s /home/d669d153/work/phil.dicaeum/astral/nex/locus.$SLURM_ARRAY_TASK_ID.nex -bb 1000 -wbt

clean up output directory

#check that a tree was generated for every locus, this should return the exact # of loci you input
ls nex/*.treefile | wc -l

#if you used an array, you will likely need to remove the overwhelming number of slurm output files
rm slurm-*

#The IQ-TREE analyses will have generated a number of files for each gene, containing run info, a distance matrix, starting trees, and so on.
#The only output files required for our further analyses are those ending in .treefile (the maximum-likelihood gene trees with branch lengths) and .ufboot (the set of bootstrap trees without branch lengths). To clean up the directory and keep only the important files
#use the following commands: (at this point, you should be one level above the 'nex' directory, i.e., at the same level your iqtree script was executed at; for me '/home/d669d153/work/phil.dicaeum/astral')
rm nex/*.bionj
rm nex/*.ckp.gz
rm nex/*.contree
rm nex/*.iqtree
rm nex/*.log
rm nex/*.mldist
rm nex/*.model.gz
rm nex/*.splits.nex
rm nex/*.uniqueseq.phy

#Next, have a look at one of the files containing the maximum-likelihood trees, e.g. with the following command:
less nex/locus.1.nex.treefile

#Since ASTRAL will require as input a single file containing all gene trees, combine all files with maximum-likelihood trees into a single file named ml_best.trees, using the following command:
cat nex/*.treefile > ml_best.trees
#To further clean up the directory, you could then also remove all files that contain the maximum-likelihood trees for single genes, using
#rm nex/*.treefile

#We will first use the set of bootstrapped trees to estimate node support on the species tree.
#To do so, ASTRAL requires as input a single file with the names of all the files containing bootstrapped trees. We can generate such a file with the following command:
ls nex/*.ufboot > ml_boot.txt

#we also need a namemap in order to assign tips to species in astral
#easiest to just do by hand unfortunately as the required format is very un-table-like
#example of a correctly formatted ASTRAL namemap: (https://github.com/DevonDeRaad/aph.rad/blob/master/astral/namemap.7spec.txt)
#save as namemap.txt

execute ASTRAL in the same directory using this job

#!/bin/sh
#
#SBATCH --job-name=dicaeum.astral               # Job Name
#SBATCH --nodes=1             # 40 nodes
#SBATCH --ntasks-per-node=1               # 1 CPU allocation per Task
#SBATCH --partition=bi            # Name of the Slurm partition used
#SBATCH --chdir=/home/d669d153/work/phil.dicaeum/astral     # Set working d$
#SBATCH --mem-per-cpu=5gb            # memory requested
#SBATCH --time=1000

#We can then run ASTRAL with two input files: The file containing the maximum-likelihood trees for each gene (ml_best.trees), and the file containing the names of all files with bootstrapped trees (ml_boot.txt).
#The first of these is to be specified with ASTRAL's option -i, and the second should be given with option -b.
#In addition, we'll use option -o to set the name of the output file to species_boot.trees:
module load java
java -jar /home/d669d153/work/Astral/astral.5.7.7.jar -i ml_best.trees -a namemap.txt -b ml_boot.txt -o species_boot.trees

#Have a look at the output file species_boot.trees using a text editor (or again the command less). You'll see that it contains 102 lines. 
#The first 100 of these lines represent species trees in Newick format estimated for each the first 100 bootstrapped trees of each gene.
#On line 101 is a consensus tree for these 100 trees. Finally, the last line contains the species tree estimated from the maximum-likelihood gene trees,
#annotated with node support based on the bootstrap trees. Thus, the last line contains the species tree that we'll use for interpretation.
#However, before visualizing the species tree in FigTree, first conduct the second ASTRAL analysis based on the maximum-likelihood trees alone, to generate a species tree with posterior probability values for each internal branch.
#Do so using the following command:
java -jar /home/d669d153/work/Astral/astral.5.7.7.jar -i ml_best.trees -a namemap.txt -o species_pp.tre
#Also calculate quartet frequencies (by specifying the '-t 16' flag) for each internal branch on the species tree with pp values (here 'species_pp.tre')
java -jar /home/d669d153/work/Astral/astral.5.7.7.jar -q species_pp.tre -i ml_best.trees -t 16 -a namemap.txt -o species_pp_quartets.tre
#Since we are interested only in the last of the trees from file species_boot.trees as well as the tree from file species_pp.tre,
#we'll generate a new file named species.trees that contains both of these two trees using the following commands:
tail -n 1 species_boot.trees > species.trees
cat species_pp.tre >> species.trees
#move quartet info to distinct file
mv freqQuad.csv freqQuad.no.contraction.csv

#now try contracting branches with extremely low support using newick utilities and see if that makes a difference
/panfs/pfs.local/work/bi/bin/newick_utils/bin/nw_ed  ml_best.trees 'i & b<=10' o > ml_best_BS10.trees
#re-run astral and export PP tree
java -jar /home/d669d153/work/Astral/astral.5.7.7.jar -i ml_best_BS10.trees -a namemap.txt -o species_pp_BS10.tre
#calculate quartet frequencies on this branch-support contracted tree
java -jar /home/d669d153/work/Astral/astral.5.7.7.jar -q species_pp_BS10.tre -i ml_best_BS10.trees -t 16 -a namemap.txt -o species_pp_BS10_quartets.tre
#move quartet info to distinct file
mv freqQuad.csv freqQuad.BS10.csv

visualize astral output tree without contracted branches in figtree

knitr::include_graphics("/Users/devder/Desktop/astral.tree.png")

knitr::include_graphics("/Users/devder/Desktop/astral.tree.with.quartet frequencies.png")

visualize astral output tree with low support branches contracted

knitr::include_graphics("/Users/devder/Desktop/phil.dicaeum/astral/species.tree.bs10.png")

to assess the effect of including the apparently introgressed samples from Mt. Busa, I will try pruning them from the alignment and seeing if that affects the resulting species tree and quartet frequencies

#for each nexus alignment, remove each line beginning with one of the Mt. Busa sample names using sed
for i in /home/d669d153/work/phil.dicaeum/astral/mtbusa.removed/nex/*.nex
do
sed -i '/^ *hyp1956/d' ${i}
sed -i '/^ *hyp2208/d' ${i}
sed -i '/^ *hyp2067/d' ${i}
sed -i '/^ *hyp2229/d' ${i}
sed -i '/^ *hyp2253/d' ${i}
done

#in each nexus file change the number of taxa from 60 to 55
for i in /home/d669d153/work/phil.dicaeum/astral/mtbusa.removed/nex/*.nex
do
sed -i -e 's/ntax=60/ntax=55/g' ${i}
done

now create a new set of gene trees from these pruned alignments with the following script

#!/bin/sh
#
#SBATCH --job-name=dicaeum.gene.trees               # Job Name
#SBATCH --nodes=1             # 40 nodes
#SBATCH --ntasks-per-node=1               # 1 CPU allocation per Task
#SBATCH --partition=sixhour            # Name of the Slurm partition used
#SBATCH --chdir=/home/d669d153/work/phil.dicaeum/astral/mtbusa.removed     # Set working d$
#SBATCH --mem-per-cpu=1gb            # memory requested
#SBATCH --array=1-2590
#SBATCH --time=60

#before submitting the entire array, test that this works as you hope:
#/home/d669d153/work/iqtree-2.2.0-Linux/bin/iqtree2 -s /home/d669d153/work/phil.dicaeum/astral/nex/1.nex -bb 1000 -wbt

#To be able to later use bootstrapping in ASTRAL, we will also generate sets of trees from bootstrapped gene alignments in the same IQ-TREE analysis, by using the option -B 
#we do not specify a substition model with the -m option and therefore allow IQ-TREE to automatically select the best-fitting model.
#we will now ensure that the bootstrap trees are actually written to a file, and not just summarized in the output, by specifying the option --wbt.
/home/d669d153/work/iqtree-2.2.0-Linux/bin/iqtree2 -s /home/d669d153/work/phil.dicaeum/astral/mtbusa.removed/nex/locus.$SLURM_ARRAY_TASK_ID.nex -bb 1000 -wbt

clean up output directory

#check that a tree was generated for every locus, this should return the exact # of loci you input
ls nex/*.treefile | wc -l

#if you used an array, you will likely need to remove the overwhelming number of slurm output files
rm slurm-*

#The IQ-TREE analyses will have generated a number of files for each gene, containing run info, a distance matrix, starting trees, and so on.
#The only output files required for our further analyses are those ending in .treefile (the maximum-likelihood gene trees with branch lengths) and .ufboot (the set of bootstrap trees without branch lengths). To clean up the directory and keep only the important files
#use the following commands: (at this point, you should be one level above the 'nex' directory, i.e., at the same level your iqtree script was executed at; for me '/home/d669d153/work/phil.dicaeum/astral')
rm nex/*.bionj
rm nex/*.ckp.gz
rm nex/*.contree
rm nex/*.iqtree
rm nex/*.log
rm nex/*.mldist
rm nex/*.model.gz
rm nex/*.splits.nex
rm nex/*.uniqueseq.phy

#Next, have a look at one of the files containing the maximum-likelihood trees, e.g. with the following command:
less nex/locus.1.nex.treefile

#Since ASTRAL will require as input a single file containing all gene trees, combine all files with maximum-likelihood trees into a single file named ml_best.trees, using the following command:
cat nex/*.treefile > ml_best.trees
#To further clean up the directory, you could then also remove all files that contain the maximum-likelihood trees for single genes, using
#rm nex/*.treefile

#We will first use the set of bootstrapped trees to estimate node support on the species tree.
#To do so, ASTRAL requires as input a single file with the names of all the files containing bootstrapped trees. We can generate such a file with the following command:
ls nex/*.ufboot > ml_boot.txt

#clean up namemap for astral by removing those 5 samples we trimmed:hyp1956,hyp2208,hyp2067,hyp2229,hyp2253

execute ASTRAL in the same directory using this job

#!/bin/sh
#
#SBATCH --job-name=dicaeum.astral               # Job Name
#SBATCH --nodes=1             # 40 nodes
#SBATCH --ntasks-per-node=1               # 1 CPU allocation per Task
#SBATCH --partition=bi            # Name of the Slurm partition used
#SBATCH --chdir=/home/d669d153/work/phil.dicaeum/astral/mtbusa.removed     # Set working d$
#SBATCH --mem-per-cpu=5gb            # memory requested
#SBATCH --time=1000

#We can then run ASTRAL with two input files: The file containing the maximum-likelihood trees for each gene (ml_best.trees), and the file containing the names of all files with bootstrapped trees (ml_boot.txt).
#The first of these is to be specified with ASTRAL's option -i, and the second should be given with option -b.
#In addition, we'll use option -o to set the name of the output file to species_boot.trees:
module load java
java -jar /home/d669d153/work/Astral/astral.5.7.7.jar -i ml_best.trees -a namemap.txt -b ml_boot.txt -o species_boot.trees

#Have a look at the output file species_boot.trees using a text editor (or again the command less). You'll see that it contains 102 lines. 
#The first 100 of these lines represent species trees in Newick format estimated for each the first 100 bootstrapped trees of each gene.
#On line 101 is a consensus tree for these 100 trees. Finally, the last line contains the species tree estimated from the maximum-likelihood gene trees,
#annotated with node support based on the bootstrap trees. Thus, the last line contains the species tree that we'll use for interpretation.
#However, before visualizing the species tree in FigTree, first conduct the second ASTRAL analysis based on the maximum-likelihood trees alone, to generate a species tree with posterior probability values for each internal branch.
#Do so using the following command:
java -jar /home/d669d153/work/Astral/astral.5.7.7.jar -i ml_best.trees -a namemap.txt -o species_pp.tre
#Also calculate quartet frequencies (by specifying the '-t 16' flag) for each internal branch on the species tree with pp values (here 'species_pp.tre')
java -jar /home/d669d153/work/Astral/astral.5.7.7.jar -q species_pp.tre -i ml_best.trees -t 16 -a namemap.txt -o species_pp_quartets.tre
#Since we are interested only in the last of the trees from file species_boot.trees as well as the tree from file species_pp.tre,
#we'll generate a new file named species.trees that contains both of these two trees using the following commands:
tail -n 1 species_boot.trees > species.trees
cat species_pp.tre >> species.trees
#move quartet info so it doesn't get overwritten
#move quartet info to distinct file
mv freqQuad.csv freqQuad.no.contraction.csv

#now try contracting branches with extremely low support using newick utilities and see if that makes a difference
/panfs/pfs.local/work/bi/bin/newick_utils/bin/nw_ed  ml_best.trees 'i & b<=10' o > ml_best_BS10.trees
#re-run astral and export PP tree
java -jar /home/d669d153/work/Astral/astral.5.7.7.jar -i ml_best_BS10.trees -a namemap.txt -o species_pp_BS10.tre
#calculate quartet frequencies on this branch-support contracted tree
java -jar /home/d669d153/work/Astral/astral.5.7.7.jar -q species_pp_BS10.tre -i ml_best_BS10.trees -t 16 -a namemap.txt -o species_pp_BS10_quartets.tre
#move quartet info to distinct file
mv freqQuad.csv freqQuad.BS10.csv

visualize astral output tree without contracted branches in figtree

knitr::include_graphics("/Users/devder/Desktop/species.tree.no.mtbusa.png")

run.ASTRAL-III

Devon DeRaad

2022-12-09

copy both of those files you just wrote to you local disk into your project directory on the cluster

Run this bash code on the cluster where you performed your Stacks assembly

now copy the resulting phylip and partition files into your local working directory

Read the resulting phylip into R

convert the fasta alignments you just wrote out into nexus format in a local terminal window

Now copy this entire ‘nex’ directory onto the cluster, so that each nexus alignment can be read into IQ-TREE. Then, use IQ-TREE to generate a maximum-likelihood gene tree for each alignment on the cluster.

start IQ-TREE in a loop so that it analyzes one gene alignment after the other, use the following script: (run on the cluster, 2725 gene trees takes ~8 hours to generate on my laptop)

clean up output directory

execute ASTRAL in the same directory using this job

visualize astral output tree without contracted branches in figtree

visualize astral output tree with low support branches contracted

to assess the effect of including the apparently introgressed samples from Mt. Busa, I will try pruning them from the alignment and seeing if that affects the resulting species tree and quartet frequencies

now create a new set of gene trees from these pruned alignments with the following script

clean up output directory

execute ASTRAL in the same directory using this job

visualize astral output tree without contracted branches in figtree