0.1 Read in thinned (i.e., linkage-filtered) vcf to r and remove outgroup
Code
library(vcfR)
***** *** vcfR *** *****
This is vcfR 1.14.0
browseVignettes('vcfR') # Documentation
citation('vcfR') # Citation
***** ***** ***** *****
Code
library(SNPfiltR)
This is SNPfiltR v.1.0.2
Detailed usage information is available at: devonderaad.github.io/SNPfiltR/
If you use SNPfiltR in your published work, please cite the following papers:
DeRaad, D.A. (2022), SNPfiltR: an R package for interactive and reproducible SNP filtering. Molecular Ecology Resources, 22, 2443-2453. http://doi.org/10.1111/1755-0998.13618
Knaus, Brian J., and Niklaus J. Grunwald. 2017. VCFR: a package to manipulate and visualize variant call format data in R. Molecular Ecology Resources, 17.1:44-53. http://doi.org/10.1111/1755-0998.12549
Code
library(ggplot2)#read in thinned vcfv<-read.vcfR("~/Desktop/grosbeak.rad/grosbeak.filtered.unlinked.snps.vcf.gz")#now move your filtered thinned vcf to the UCLA cluster (in a separate terminal window) like so:#scp /Users/devonderaad/Desktop/grosbeak.rad/grosbeak.filtered.unlinked.snps.vcf.gz dderaad@dtn.hoffman2.idre.ucla.edu:/u/project/aguillon/dderaad/grosbeak.rad/admixture/
0.2 Use this bash code to execute ADMIXTURE on the cluster
Code
#use these bash commands to unzip the vcf files you just wrote out before going on to the next stepgunzip grosbeak.filtered.unlinked.snps.vcf.gz#use this thinned vcf file to execute ADMIXTURE on the cluster using this script submitted as a slurm job:#### submit_job.sh START #####!/bin/bash#$ -cwd#$ -o ./joblog.$JOB_ID.txt #set the job log output file#$ -j y #set error = Merged with joblog#$ -l h_rt=1:00:00,h_data=3G #specify requested resources (h_rt gives time request in 'hrs:mins:secs' format) (h_data specifies requested RAM per task) (highp=TRUE means run it on Aguillon Lab owned nodes)#$ -pe shared 10 #specify number of CPUs requested#load necessary modules. /u/local/Modules/default/init/modules.sh#load plink #(v1.90b6.24)module load plink/1.90b624#use plink to convert vcf directly to bed format:plink--vcf grosbeak.filtered.unlinked.snps.vcf --double-id--allow-extra-chr--make-bed--out binary_fileset#fix chromosome namescut-f2- binary_fileset.bim > tempawk'BEGIN{FS=OFS="\t"}{print value 1 OFS $0}' temp > binary_fileset.bim#run admixture for a K of 1-10, using cross-validation, with 10 threadsfor K in 1 2 3 4 5 6 7 8 9 10;do/u/project/aguillon/shared_bin/admixture_linux-1.3.0/admixture--cv-j10 binary_fileset.bed $K|tee log${K}.out;done#Which K iteration is optimal according to ADMIXTURE ?grep-h CV log*.out > log.errors.txt
0.3 Assess the best run based on cross-validation
Code
#Now copy your entire admixture directory into your local machine, in bash, using a command like this:#scp -r dderaad@dtn.hoffman2.idre.ucla.edu:/u/project/aguillon/dderaad/grosbeak.rad/admixture /Users/devonderaad/Desktop/grosbeak.rad/#read in ADMIXTURE results to R#setwd to the admixture directory you brought in from the clustersetwd("~/Desktop/grosbeak.rad/admixture")#read in log error values to determine optimal Klog<-read.table("log.errors.txt")[,c(3:4)]#use double backslash to interpret the opening parentheses literally in the regular expressionlog$V3<-gsub("\\(K=", "", log$V3)log$V3<-gsub("):", "", log$V3)#interpret K values as numericallog$V3<-as.numeric(log$V3)#rename columnscolnames(log)<-c("Kvalue","cross.validation.error")#make plot showing the cross validation error across K values 1:10ggplot(data=log, aes(x=Kvalue, y=cross.validation.error, group=1)) +geom_line(linetype ="dashed")+geom_point()+ylab("cross-validation error")+xlab("K")+scale_x_continuous(breaks =c(1:10))+theme_classic()
Code
#lowest CV value is the ideal K value
0.4 Visualize results as bar charts
Code
#setwd to the admixture directory you brought in from the clustersetwd("~/Desktop/grosbeak.rad/admixture")#read in input filesampling<-read.table("binary_fileset.fam")[,1]#get list of input samples in order they appearsampling
#read in all ten runs and save each dataframe in a listruns<-list()#read in log filesfor (i in1:10){ runs[[i]]<-read.table(paste0("binary_fileset.", i, ".Q"))}#plot runs 1:5par(mfrow=c(1,1))for (i in1:5){barplot(t(as.matrix(runs[[i]])), col=rainbow(i), ylab="Ancestry", border="black")}
Code
#isolate run 2 (best according to CV)run2<-runs[[2]]#add sample info in the correct order (same as input vcf)run2$sample<-colnames(v@gt)[-1]#read in sample data#read in locality and mito infosamps<-read.csv("~/Desktop/grosbeak.data.csv")samps<-samps[samps$passed.genomic.filtering =="TRUE",] #retain only samples that passed filteringsamps$sample_id == run2$sample #check if sample info table order matches the vcf
#add q-values to the sampling data.frame now that orders matchsamps$lud.q<-run2$V1samps$mel.q<-run2$V2#reorder sample sheet by locality and then by q-value within each localitysamps<-samps[order(samps$site, samps$lud.q),]#use the following info to construct a vector that splits the barplot by sampling sitetable(samps$site)
***** Object of Class vcfR *****
138 samples
1027 CHROMs
3,584 variants
Object size: 40.7 Mb
3.403 percent missing data
***** ***** *****
Code
#must make chromosome names non-numeric for plink or it will throw an errorv@fix[,1]<-paste("a", v@fix[,1], sep="")#make another file with no singletonsv.x<-min_mac(v, min.mac =2)
31.25% of SNPs fell below a minor allele count of 2 and were removed from the VCF
Code
#get infov.x
***** Object of Class vcfR *****
138 samples
850 CHROMs
2,464 variants
Object size: 29.9 Mb
3.457 percent missing data
***** ***** *****
Code
#write to disk#vcfR::write.vcf(v.x, file="~/Desktop/grosbeak.rad/thinned.mac2.vcf.gz")#now move to the UCLA cluster (in a separate terminal window) like so:#scp /Users/devonderaad/Desktop/grosbeak.rad/thinned.mac2.vcf.gz dderaad@dtn.hoffman2.idre.ucla.edu:/u/project/aguillon/dderaad/grosbeak.rad/admixture.mac/
###Run admixture on the cluster
Code
#### submit_job.sh START #####!/bin/bash#$ -cwd#$ -o ./joblog.$JOB_ID.txt #set the job log output file#$ -j y #set error = Merged with joblog#$ -l h_rt=1:00:00,h_data=3G #specify requested resources (h_rt gives time request in 'hrs:mins:secs' format) (h_data specifies requested RAM per task) (highp=TRUE means run it on Aguillon Lab owned nodes)#$ -pe shared 10 #specify number of CPUs requested#load necessary modules. /u/local/Modules/default/init/modules.sh#load plink #(v1.90b6.24)module load plink/1.90b624#use plink to convert vcf directly to bed format:plink--vcf thinned.mac2.vcf --double-id--allow-extra-chr--make-bed--out binary_fileset#fix chromosome namescut-f2- binary_fileset.bim > tempawk'BEGIN{FS=OFS="\t"}{print value 1 OFS $0}' temp > binary_fileset.bim#run admixture for a K of 1-10, using cross-validation, with 10 threadsfor K in 1 2 3 4 5 6 7 8 9 10;do/u/project/aguillon/shared_bin/admixture_linux-1.3.0/admixture--cv-j10 binary_fileset.bed $K|tee log${K}.out;done#Which K iteration is optimal according to ADMIXTURE ?grep-h CV log*.out > log.errors.txt
0.6 Assess the best run based on cross-validation
Code
#Now copy your entire admixture directory into your local machine, in bash, using a command like this:#scp -r dderaad@dtn.hoffman2.idre.ucla.edu:/u/project/aguillon/dderaad/grosbeak.rad/admixture.mac /Users/devonderaad/Desktop/grosbeak.rad/#read in ADMIXTURE results to R#setwd to the admixture directory you brought in from the clustersetwd("~/Desktop/grosbeak.rad/admixture.mac")#read in log error values to determine optimal Klog<-read.table("log.errors.txt")[,c(3:4)]#use double backslash to interpret the opening parentheses literally in the regular expressionlog$V3<-gsub("\\(K=", "", log$V3)log$V3<-gsub("):", "", log$V3)#interpret K values as numericallog$V3<-as.numeric(log$V3)#rename columnscolnames(log)<-c("Kvalue","cross.validation.error")#make plot showing the cross validation error across K values 1:10ggplot(data=log, aes(x=Kvalue, y=cross.validation.error, group=1)) +geom_line(linetype ="dashed")+geom_point()+ylab("cross-validation error")+xlab("K")+scale_x_continuous(breaks =c(1:10))+theme_classic()
Code
#lowest CV value is the ideal K value
0.7 Visualize results as bar charts
Code
#setwd to the admixture directory you brought in from the clustersetwd("~/Desktop/grosbeak.rad/admixture.mac")#read in input filesampling<-read.table("binary_fileset.fam")[,1]#get list of input samples in order they appearsampling
#read in all ten runs and save each dataframe in a listruns<-list()#read in log filesfor (i in1:10){ runs[[i]]<-read.table(paste0("binary_fileset.", i, ".Q"))}#plot runs 1:5par(mfrow=c(1,1))for (i in1:5){barplot(t(as.matrix(runs[[i]])), col=rainbow(i), ylab="Ancestry", border="black")}
Code
#isolate run 2 (best according to CV)run2<-runs[[2]]#add sample info in the correct order (same as input vcf)run2$sample<-colnames(v@gt)[-1]#samps$sample_id == run2$sample #check if sample info table order matches the vcf
#add q-values to the sampling data.frame now that orders matchsamps$mac.lud.q<-run2$V1samps$mac.mel.q<-run2$V2#reorder sample sheet by locality and then by q-value within each localitysamps<-samps[order(samps$site, samps$lud.q),]#use the following info to construct a vector that splits the barplot by sampling sitetable(samps$site)
#plot barplots organized based on sampling details with MAC filterbarplot(t(as.matrix(samps[,c(42,43)])), col=c("#ef3b2c","#fff319"), ylab="Ancestry", border="black", names.arg=samps$site, cex.names=0.35,space=c(0,rep(0, times=7),3,rep(0, times=18),3,rep(0, times=2),3,rep(0, times=15),3,rep(0, times=9),3,rep(0, times=3),3,rep(0, times=7),3,rep(0, times=8),3,rep(0, times=9),3,rep(0, times=8),3,rep(0, times=9),3,rep(0, times=22),3,rep(0, times=8)))
Code
#plot barplots organized based on sampling details without MAC filterbarplot(t(as.matrix(samps[,c(40,41)])), col=c("#ef3b2c","#fff319"), ylab="Ancestry", border="black", names.arg=samps$site, cex.names=0.35,space=c(0,rep(0, times=7),3,rep(0, times=18),3,rep(0, times=2),3,rep(0, times=15),3,rep(0, times=9),3,rep(0, times=3),3,rep(0, times=7),3,rep(0, times=8),3,rep(0, times=9),3,rep(0, times=8),3,rep(0, times=9),3,rep(0, times=22),3,rep(0, times=8)))
Code
#check the correlation between singletons removed vs notplot(samps$lud.q,samps$mac.lud.q, pch=10, cex=1.5)abline(a=0, b=1)
Code
cor(samps$lud.q,samps$mac.lud.q)
[1] 0.9999769
0.8 test for bimodality by site
Code
#across the transecthist(samps$lud.q, breaks =50, main="Histogram of Ludovicianus ancestry (with singletons)",ylim=c(0,70))
Code
hist(samps$mac.lud.q, breaks =50, main="Histogram of Ludovicianus ancestry (no singletons)",ylim=c(0,70))
Code
#try different bins (20)hist(samps$mac.lud.q, breaks =20, main="Histogram of Ludovicianus ancestry (no singletons)",ylim=c(0,70))
Code
#try different bins (100)hist(samps$mac.lud.q, breaks =100, main="Histogram of Ludovicianus ancestry (no singletons)",ylim=c(0,70))
Code
library(diptest)#Dip test to see if we can reject unimodalitydip.test(samps$mac.lud.q, simulate.p.value =TRUE, B =10000)
Hartigans' dip test for unimodality / multimodality with simulated
p-value (based on 10000 replicates)
data: samps$mac.lud.q
D = 0.16543, p-value < 2.2e-16
alternative hypothesis: non-unimodal, i.e., at least bimodal
Code
dip.test(samps$lud.q, simulate.p.value =TRUE, B =10000)
Hartigans' dip test for unimodality / multimodality with simulated
p-value (based on 10000 replicates)
data: samps$lud.q
D = 0.16588, p-value < 2.2e-16
alternative hypothesis: non-unimodal, i.e., at least bimodal
Code
#repeat for individual siteshist(samps$mac.lud.q[samps$site ==8], breaks =10, main="Histogram of Ludovicianus ancestry at site 8 (no singletons)",xlim=c(0,1))
Code
#Dip test to see if we can reject unimodalitydip.test(samps$mac.lud.q[samps$site ==8], simulate.p.value =TRUE, B =10000)
Hartigans' dip test for unimodality / multimodality with simulated
p-value (based on 10000 replicates)
data: samps$mac.lud.q[samps$site == 8]
D = 0.11004, p-value = 0.2939
alternative hypothesis: non-unimodal, i.e., at least bimodal
Code
#repeat for individual siteshist(samps$mac.lud.q[samps$site ==9], breaks =10, main="Histogram of Ludovicianus ancestry at site 9 (no singletons)",xlim=c(0,1))
Code
#Dip test to see if we can reject unimodalitydip.test(samps$mac.lud.q[samps$site ==9], simulate.p.value =TRUE, B =10000)
Hartigans' dip test for unimodality / multimodality with simulated
p-value (based on 10000 replicates)
data: samps$mac.lud.q[samps$site == 9]
D = 0.082275, p-value = 0.8827
alternative hypothesis: non-unimodal, i.e., at least bimodal
Code
#combine sites 8 & 9 (either side of the transition)hist(samps$mac.lud.q[samps$site ==8| samps$site ==9], breaks =20, main="Histogram of Ludovicianus ancestry sites 8&9 (no singletons)",xlim=c(0,1))
Code
#Dip test to see if we can reject unimodalitydip.test(samps$mac.lud.q[samps$site ==8| samps$site ==9], simulate.p.value =TRUE, B =10000)
Hartigans' dip test for unimodality / multimodality with simulated
p-value (based on 10000 replicates)
data: samps$mac.lud.q[samps$site == 8 | samps$site == 9]
D = 0.17822, p-value < 2.2e-16
alternative hypothesis: non-unimodal, i.e., at least bimodal
Code
#combine sites 7 - 9 (either side of the transition)hist(samps$mac.lud.q[samps$site ==8| samps$site ==9| samps$site ==7], breaks =20, main="Histogram of Ludovicianus ancestry sites 7-9 (no singletons)",xlim=c(0,1))
Code
#Dip test to see if we can reject unimodalitydip.test(samps$mac.lud.q[samps$site ==8| samps$site ==9| samps$site ==7], simulate.p.value =TRUE, B =10000)
Hartigans' dip test for unimodality / multimodality with simulated
p-value (based on 10000 replicates)
data: samps$mac.lud.q[samps$site == 8 | samps$site == 9 | samps$site == 7]
D = 0.13431, p-value = 1e-04
alternative hypothesis: non-unimodal, i.e., at least bimodal
Code
#combine sites 6 - 9 (either side of the transition)hist(samps$mac.lud.q[samps$site ==8| samps$site ==9| samps$site ==7| samps$site ==6], breaks =20, main="Histogram of Ludovicianus ancestry sites 7-9 (no singletons)",xlim=c(0,1))
Code
#Dip test to see if we can reject unimodalitydip.test(samps$mac.lud.q[samps$site ==8| samps$site ==9| samps$site ==7| samps$site ==6], simulate.p.value =TRUE, B =10000)
Hartigans' dip test for unimodality / multimodality with simulated
p-value (based on 10000 replicates)
data: samps$mac.lud.q[samps$site == 8 | samps$site == 9 | samps$site == 7 | samps$site == 6]
D = 0.11677, p-value = 5e-04
alternative hypothesis: non-unimodal, i.e., at least bimodal
Code
#combine sites 6 - 10 (either side of the transition)hist(samps$mac.lud.q[samps$site ==8| samps$site ==9| samps$site ==7| samps$site ==6| samps$site ==10], breaks =20, main="Histogram of Ludovicianus ancestry sites 7-9 (no singletons)",xlim=c(0,1))
Code
#Dip test to see if we can reject unimodalitydip.test(samps$mac.lud.q[samps$site ==8| samps$site ==9| samps$site ==7| samps$site ==6| samps$site ==10], simulate.p.value =TRUE, B =10000)
Hartigans' dip test for unimodality / multimodality with simulated
p-value (based on 10000 replicates)
data: samps$mac.lud.q[samps$site == 8 | samps$site == 9 | samps$site == 7 | samps$site == 6 | samps$site == 10]
D = 0.16727, p-value < 2.2e-16
alternative hypothesis: non-unimodal, i.e., at least bimodal
0.9 visualize mean ancestry across the transect (i.e., quick and dirty geographic cline)
Code
x<-c()for (i in1:12){ x[i]<-mean(samps$mac.lud.q[samps$site == i])}x<-c(mean(samps$mac.lud.q[samps$site ==0]),x)plot(c(0:12),x)
0.10 Make a cohesive figure summarizing these results for the paper
#plot barplots organized based on sampling details with MAC filterbarplot(t(as.matrix(samps[,c(42,43)])), col=c("#ef3b2c","#fff319"), ylab="ludovicianus ancestry", border="black", names.arg=samps$site, cex.names=0.35,space=c(0,rep(0, times=7),3,rep(0, times=18),3,rep(0, times=2),3,rep(0, times=15),3,rep(0, times=9),3,rep(0, times=3),3,rep(0, times=7),3,rep(0, times=8),3,rep(0, times=9),3,rep(0, times=8),3,rep(0, times=9),3,rep(0, times=22),3,rep(0, times=8)))#histogram from the entire transecthist(samps$mac.lud.q, breaks =50, xlab="ludovicianus ancestry (all sites)", main=NULL)#combine sites 8 & 9 (either side of the transition)hist(samps$mac.lud.q[samps$site ==8| samps$site ==9], breaks =25, xlab="ludovicianus ancestry (sites 8 & 9)", xlim=c(0,1), main=NULL)#run dip tests to see if we can reject unimodality for the two histograms#all sites:dip.test(samps$mac.lud.q, simulate.p.value =FALSE)
Hartigans' dip test for unimodality / multimodality
data: samps$mac.lud.q
D = 0.16543, p-value < 2.2e-16
alternative hypothesis: non-unimodal, i.e., at least bimodal
Hartigans' dip test for unimodality / multimodality
data: samps$mac.lud.q[samps$site == 8 | samps$site == 9]
D = 0.17822, p-value = 2.731e-07
alternative hypothesis: non-unimodal, i.e., at least bimodal
Code
## 1. Open a pdf file#pdf("~/Desktop/grosbeak.rad/admixture.hists.pdf", width=9, height=6.5) ## 2. Create a plotlayout(matrix(c(1,1,2,3), 2, 2, byrow = T))#plot barplots organized based on sampling details with MAC filterbarplot(t(as.matrix(samps[,c(42,43)])), col=c("#ef3b2c","#fff319"), ylab="ludovicianus ancestry", border="black", names.arg=samps$site, cex.names=0.35,space=c(0,rep(0, times=7),3,rep(0, times=18),3,rep(0, times=2),3,rep(0, times=15),3,rep(0, times=9),3,rep(0, times=3),3,rep(0, times=7),3,rep(0, times=8),3,rep(0, times=9),3,rep(0, times=8),3,rep(0, times=9),3,rep(0, times=22),3,rep(0, times=8)))#histogram from the entire transecthist(samps$mac.lud.q, breaks =50, xlab="ludovicianus ancestry (all sites)", main=NULL)#combine sites 8 & 9 (either side of the transition)hist(samps$mac.lud.q[samps$site ==8| samps$site ==9], breaks =25, xlab="ludovicianus ancestry (sites 8 & 9)", xlim=c(0,1), main=NULL)
Code
## 3. Close the pdf file#dev.off()
Source Code
---title: "grosbeak.ADMIXTURE"format: html: code-fold: show code-tools: truetoc: truetoc-title: Document Contentsnumber-sections: trueembed-resources: true---### Read in thinned (i.e., linkage-filtered) vcf to r and remove outgroup```{r, results=FALSE}library(vcfR)library(SNPfiltR)library(ggplot2)#read in thinned vcfv<-read.vcfR("~/Desktop/grosbeak.rad/grosbeak.filtered.unlinked.snps.vcf.gz")#now move your filtered thinned vcf to the UCLA cluster (in a separate terminal window) like so:#scp /Users/devonderaad/Desktop/grosbeak.rad/grosbeak.filtered.unlinked.snps.vcf.gz dderaad@dtn.hoffman2.idre.ucla.edu:/u/project/aguillon/dderaad/grosbeak.rad/admixture/```### Use this bash code to execute ADMIXTURE on the cluster```{bash, eval=FALSE}#use these bash commands to unzip the vcf files you just wrote out before going on to the next stepgunzip grosbeak.filtered.unlinked.snps.vcf.gz#use this thinned vcf file to execute ADMIXTURE on the cluster using this script submitted as a slurm job:#### submit_job.sh START #####!/bin/bash#$ -cwd#$ -o ./joblog.$JOB_ID.txt #set the job log output file#$ -j y #set error = Merged with joblog#$ -l h_rt=1:00:00,h_data=3G #specify requested resources (h_rt gives time request in 'hrs:mins:secs' format) (h_data specifies requested RAM per task) (highp=TRUE means run it on Aguillon Lab owned nodes)#$ -pe shared 10 #specify number of CPUs requested#load necessary modules. /u/local/Modules/default/init/modules.sh#load plink #(v1.90b6.24)module load plink/1.90b624#use plink to convert vcf directly to bed format:plink--vcf grosbeak.filtered.unlinked.snps.vcf --double-id--allow-extra-chr--make-bed--out binary_fileset#fix chromosome namescut-f2- binary_fileset.bim > tempawk'BEGIN{FS=OFS="\t"}{print value 1 OFS $0}' temp > binary_fileset.bim#run admixture for a K of 1-10, using cross-validation, with 10 threadsfor K in 1 2 3 4 5 6 7 8 9 10;do/u/project/aguillon/shared_bin/admixture_linux-1.3.0/admixture--cv-j10 binary_fileset.bed $K|tee log${K}.out;done#Which K iteration is optimal according to ADMIXTURE ?grep-h CV log*.out > log.errors.txt```### Assess the best run based on cross-validation```{r}#Now copy your entire admixture directory into your local machine, in bash, using a command like this:#scp -r dderaad@dtn.hoffman2.idre.ucla.edu:/u/project/aguillon/dderaad/grosbeak.rad/admixture /Users/devonderaad/Desktop/grosbeak.rad/#read in ADMIXTURE results to R#setwd to the admixture directory you brought in from the clustersetwd("~/Desktop/grosbeak.rad/admixture")#read in log error values to determine optimal Klog<-read.table("log.errors.txt")[,c(3:4)]#use double backslash to interpret the opening parentheses literally in the regular expressionlog$V3<-gsub("\\(K=", "", log$V3)log$V3<-gsub("):", "", log$V3)#interpret K values as numericallog$V3<-as.numeric(log$V3)#rename columnscolnames(log)<-c("Kvalue","cross.validation.error")#make plot showing the cross validation error across K values 1:10ggplot(data=log, aes(x=Kvalue, y=cross.validation.error, group=1)) +geom_line(linetype ="dashed")+geom_point()+ylab("cross-validation error")+xlab("K")+scale_x_continuous(breaks =c(1:10))+theme_classic()#lowest CV value is the ideal K value```### Visualize results as bar charts```{r}#setwd to the admixture directory you brought in from the clustersetwd("~/Desktop/grosbeak.rad/admixture")#read in input filesampling<-read.table("binary_fileset.fam")[,1]#get list of input samples in order they appearsampling#read in all ten runs and save each dataframe in a listruns<-list()#read in log filesfor (i in1:10){ runs[[i]]<-read.table(paste0("binary_fileset.", i, ".Q"))}#plot runs 1:5par(mfrow=c(1,1))for (i in1:5){barplot(t(as.matrix(runs[[i]])), col=rainbow(i), ylab="Ancestry", border="black")}#isolate run 2 (best according to CV)run2<-runs[[2]]#add sample info in the correct order (same as input vcf)run2$sample<-colnames(v@gt)[-1]#read in sample data#read in locality and mito infosamps<-read.csv("~/Desktop/grosbeak.data.csv")samps<-samps[samps$passed.genomic.filtering =="TRUE",] #retain only samples that passed filteringsamps$sample_id == run2$sample #check if sample info table order matches the vcfsamps<-samps[match(run2$sample,samps$sample_id),] #use 'match' to match orderssamps$sample_id == run2$sample #check if this worked#add q-values to the sampling data.frame now that orders matchsamps$lud.q<-run2$V1samps$mel.q<-run2$V2#reorder sample sheet by locality and then by q-value within each localitysamps<-samps[order(samps$site, samps$lud.q),]#use the following info to construct a vector that splits the barplot by sampling sitetable(samps$site)#plot barplots organized based on sampling detailsbarplot(t(as.matrix(samps[,c(40,41)])), col=c("#ef3b2c","#fff319"), ylab="Ancestry", border="black", names.arg=samps$site, cex.names=0.35,space=c(0,rep(0, times=7),3,rep(0, times=18),3,rep(0, times=2),3,rep(0, times=15),3,rep(0, times=9),3,rep(0, times=3),3,rep(0, times=7),3,rep(0, times=8),3,rep(0, times=9),3,rep(0, times=8),3,rep(0, times=9),3,rep(0, times=22),3,rep(0, times=8)))```### repeat with singletons removed```{r}#get vcf infov#must make chromosome names non-numeric for plink or it will throw an errorv@fix[,1]<-paste("a", v@fix[,1], sep="")#make another file with no singletonsv.x<-min_mac(v, min.mac =2)#get infov.x#write to disk#vcfR::write.vcf(v.x, file="~/Desktop/grosbeak.rad/thinned.mac2.vcf.gz")#now move to the UCLA cluster (in a separate terminal window) like so:#scp /Users/devonderaad/Desktop/grosbeak.rad/thinned.mac2.vcf.gz dderaad@dtn.hoffman2.idre.ucla.edu:/u/project/aguillon/dderaad/grosbeak.rad/admixture.mac/```###Run admixture on the cluster```{bash, eval=FALSE}#### submit_job.sh START #####!/bin/bash#$ -cwd#$ -o ./joblog.$JOB_ID.txt #set the job log output file#$ -j y #set error = Merged with joblog#$ -l h_rt=1:00:00,h_data=3G #specify requested resources (h_rt gives time request in 'hrs:mins:secs' format) (h_data specifies requested RAM per task) (highp=TRUE means run it on Aguillon Lab owned nodes)#$ -pe shared 10 #specify number of CPUs requested#load necessary modules. /u/local/Modules/default/init/modules.sh#load plink #(v1.90b6.24)module load plink/1.90b624#use plink to convert vcf directly to bed format:plink--vcf thinned.mac2.vcf --double-id--allow-extra-chr--make-bed--out binary_fileset#fix chromosome namescut-f2- binary_fileset.bim > tempawk'BEGIN{FS=OFS="\t"}{print value 1 OFS $0}' temp > binary_fileset.bim#run admixture for a K of 1-10, using cross-validation, with 10 threadsfor K in 1 2 3 4 5 6 7 8 9 10;do/u/project/aguillon/shared_bin/admixture_linux-1.3.0/admixture--cv-j10 binary_fileset.bed $K|tee log${K}.out;done#Which K iteration is optimal according to ADMIXTURE ?grep-h CV log*.out > log.errors.txt```### Assess the best run based on cross-validation```{r}#Now copy your entire admixture directory into your local machine, in bash, using a command like this:#scp -r dderaad@dtn.hoffman2.idre.ucla.edu:/u/project/aguillon/dderaad/grosbeak.rad/admixture.mac /Users/devonderaad/Desktop/grosbeak.rad/#read in ADMIXTURE results to R#setwd to the admixture directory you brought in from the clustersetwd("~/Desktop/grosbeak.rad/admixture.mac")#read in log error values to determine optimal Klog<-read.table("log.errors.txt")[,c(3:4)]#use double backslash to interpret the opening parentheses literally in the regular expressionlog$V3<-gsub("\\(K=", "", log$V3)log$V3<-gsub("):", "", log$V3)#interpret K values as numericallog$V3<-as.numeric(log$V3)#rename columnscolnames(log)<-c("Kvalue","cross.validation.error")#make plot showing the cross validation error across K values 1:10ggplot(data=log, aes(x=Kvalue, y=cross.validation.error, group=1)) +geom_line(linetype ="dashed")+geom_point()+ylab("cross-validation error")+xlab("K")+scale_x_continuous(breaks =c(1:10))+theme_classic()#lowest CV value is the ideal K value```### Visualize results as bar charts```{r}#setwd to the admixture directory you brought in from the clustersetwd("~/Desktop/grosbeak.rad/admixture.mac")#read in input filesampling<-read.table("binary_fileset.fam")[,1]#get list of input samples in order they appearsampling#read in all ten runs and save each dataframe in a listruns<-list()#read in log filesfor (i in1:10){ runs[[i]]<-read.table(paste0("binary_fileset.", i, ".Q"))}#plot runs 1:5par(mfrow=c(1,1))for (i in1:5){barplot(t(as.matrix(runs[[i]])), col=rainbow(i), ylab="Ancestry", border="black")}#isolate run 2 (best according to CV)run2<-runs[[2]]#add sample info in the correct order (same as input vcf)run2$sample<-colnames(v@gt)[-1]#samps$sample_id == run2$sample #check if sample info table order matches the vcfsamps<-samps[match(run2$sample,samps$sample_id),] #use 'match' to match orderssamps$sample_id == run2$sample #check if this worked#add q-values to the sampling data.frame now that orders matchsamps$mac.lud.q<-run2$V1samps$mac.mel.q<-run2$V2#reorder sample sheet by locality and then by q-value within each localitysamps<-samps[order(samps$site, samps$lud.q),]#use the following info to construct a vector that splits the barplot by sampling sitetable(samps$site)#plot barplots organized based on sampling details with MAC filterbarplot(t(as.matrix(samps[,c(42,43)])), col=c("#ef3b2c","#fff319"), ylab="Ancestry", border="black", names.arg=samps$site, cex.names=0.35,space=c(0,rep(0, times=7),3,rep(0, times=18),3,rep(0, times=2),3,rep(0, times=15),3,rep(0, times=9),3,rep(0, times=3),3,rep(0, times=7),3,rep(0, times=8),3,rep(0, times=9),3,rep(0, times=8),3,rep(0, times=9),3,rep(0, times=22),3,rep(0, times=8)))#plot barplots organized based on sampling details without MAC filterbarplot(t(as.matrix(samps[,c(40,41)])), col=c("#ef3b2c","#fff319"), ylab="Ancestry", border="black", names.arg=samps$site, cex.names=0.35,space=c(0,rep(0, times=7),3,rep(0, times=18),3,rep(0, times=2),3,rep(0, times=15),3,rep(0, times=9),3,rep(0, times=3),3,rep(0, times=7),3,rep(0, times=8),3,rep(0, times=9),3,rep(0, times=8),3,rep(0, times=9),3,rep(0, times=22),3,rep(0, times=8)))#check the correlation between singletons removed vs notplot(samps$lud.q,samps$mac.lud.q, pch=10, cex=1.5)abline(a=0, b=1)cor(samps$lud.q,samps$mac.lud.q)```### test for bimodality by site```{r}#across the transecthist(samps$lud.q, breaks =50, main="Histogram of Ludovicianus ancestry (with singletons)",ylim=c(0,70))hist(samps$mac.lud.q, breaks =50, main="Histogram of Ludovicianus ancestry (no singletons)",ylim=c(0,70))#try different bins (20)hist(samps$mac.lud.q, breaks =20, main="Histogram of Ludovicianus ancestry (no singletons)",ylim=c(0,70))#try different bins (100)hist(samps$mac.lud.q, breaks =100, main="Histogram of Ludovicianus ancestry (no singletons)",ylim=c(0,70))library(diptest)#Dip test to see if we can reject unimodalitydip.test(samps$mac.lud.q, simulate.p.value =TRUE, B =10000)dip.test(samps$lud.q, simulate.p.value =TRUE, B =10000)#repeat for individual siteshist(samps$mac.lud.q[samps$site ==8], breaks =10, main="Histogram of Ludovicianus ancestry at site 8 (no singletons)",xlim=c(0,1))#Dip test to see if we can reject unimodalitydip.test(samps$mac.lud.q[samps$site ==8], simulate.p.value =TRUE, B =10000)#repeat for individual siteshist(samps$mac.lud.q[samps$site ==9], breaks =10, main="Histogram of Ludovicianus ancestry at site 9 (no singletons)",xlim=c(0,1))#Dip test to see if we can reject unimodalitydip.test(samps$mac.lud.q[samps$site ==9], simulate.p.value =TRUE, B =10000)#combine sites 8 & 9 (either side of the transition)hist(samps$mac.lud.q[samps$site ==8| samps$site ==9], breaks =20, main="Histogram of Ludovicianus ancestry sites 8&9 (no singletons)",xlim=c(0,1))#Dip test to see if we can reject unimodalitydip.test(samps$mac.lud.q[samps$site ==8| samps$site ==9], simulate.p.value =TRUE, B =10000)#combine sites 7 - 9 (either side of the transition)hist(samps$mac.lud.q[samps$site ==8| samps$site ==9| samps$site ==7], breaks =20, main="Histogram of Ludovicianus ancestry sites 7-9 (no singletons)",xlim=c(0,1))#Dip test to see if we can reject unimodalitydip.test(samps$mac.lud.q[samps$site ==8| samps$site ==9| samps$site ==7], simulate.p.value =TRUE, B =10000)#combine sites 6 - 9 (either side of the transition)hist(samps$mac.lud.q[samps$site ==8| samps$site ==9| samps$site ==7| samps$site ==6], breaks =20, main="Histogram of Ludovicianus ancestry sites 7-9 (no singletons)",xlim=c(0,1))#Dip test to see if we can reject unimodalitydip.test(samps$mac.lud.q[samps$site ==8| samps$site ==9| samps$site ==7| samps$site ==6], simulate.p.value =TRUE, B =10000)#combine sites 6 - 10 (either side of the transition)hist(samps$mac.lud.q[samps$site ==8| samps$site ==9| samps$site ==7| samps$site ==6| samps$site ==10], breaks =20, main="Histogram of Ludovicianus ancestry sites 7-9 (no singletons)",xlim=c(0,1))#Dip test to see if we can reject unimodalitydip.test(samps$mac.lud.q[samps$site ==8| samps$site ==9| samps$site ==7| samps$site ==6| samps$site ==10], simulate.p.value =TRUE, B =10000)```### visualize mean ancestry across the transect (i.e., quick and dirty geographic cline)```{r}x<-c()for (i in1:12){ x[i]<-mean(samps$mac.lud.q[samps$site == i])}x<-c(mean(samps$mac.lud.q[samps$site ==0]),x)plot(c(0:12),x)```### Make a cohesive figure summarizing these results for the paper```{r}layout(matrix(c(1,1,2,3), 2, 2, byrow = T))layout.show(n=3) #plot barplots organized based on sampling details with MAC filterbarplot(t(as.matrix(samps[,c(42,43)])), col=c("#ef3b2c","#fff319"), ylab="ludovicianus ancestry", border="black", names.arg=samps$site, cex.names=0.35,space=c(0,rep(0, times=7),3,rep(0, times=18),3,rep(0, times=2),3,rep(0, times=15),3,rep(0, times=9),3,rep(0, times=3),3,rep(0, times=7),3,rep(0, times=8),3,rep(0, times=9),3,rep(0, times=8),3,rep(0, times=9),3,rep(0, times=22),3,rep(0, times=8)))#histogram from the entire transecthist(samps$mac.lud.q, breaks =50, xlab="ludovicianus ancestry (all sites)", main=NULL)#combine sites 8 & 9 (either side of the transition)hist(samps$mac.lud.q[samps$site ==8| samps$site ==9], breaks =25, xlab="ludovicianus ancestry (sites 8 & 9)", xlim=c(0,1), main=NULL)#run dip tests to see if we can reject unimodality for the two histograms#all sites:dip.test(samps$mac.lud.q, simulate.p.value =FALSE)#sites 8&9:dip.test(samps$mac.lud.q[samps$site ==8| samps$site ==9], simulate.p.value =FALSE)## 1. Open a pdf file#pdf("~/Desktop/grosbeak.rad/admixture.hists.pdf", width=9, height=6.5) ## 2. Create a plotlayout(matrix(c(1,1,2,3), 2, 2, byrow = T))#plot barplots organized based on sampling details with MAC filterbarplot(t(as.matrix(samps[,c(42,43)])), col=c("#ef3b2c","#fff319"), ylab="ludovicianus ancestry", border="black", names.arg=samps$site, cex.names=0.35,space=c(0,rep(0, times=7),3,rep(0, times=18),3,rep(0, times=2),3,rep(0, times=15),3,rep(0, times=9),3,rep(0, times=3),3,rep(0, times=7),3,rep(0, times=8),3,rep(0, times=9),3,rep(0, times=8),3,rep(0, times=9),3,rep(0, times=22),3,rep(0, times=8)))#histogram from the entire transecthist(samps$mac.lud.q, breaks =50, xlab="ludovicianus ancestry (all sites)", main=NULL)#combine sites 8 & 9 (either side of the transition)hist(samps$mac.lud.q[samps$site ==8| samps$site ==9], breaks =25, xlab="ludovicianus ancestry (sites 8 & 9)", xlim=c(0,1), main=NULL)## 3. Close the pdf file#dev.off()```