0.1 Read in thinned (i.e., linkage-filtered) vcf to r and remove outgroup

Code
library(vcfR)

   *****       ***   vcfR   ***       *****
   This is vcfR 1.14.0 
     browseVignettes('vcfR') # Documentation
     citation('vcfR') # Citation
   *****       *****      *****       *****
Code
library(SNPfiltR)
This is SNPfiltR v.1.0.2

Detailed usage information is available at: devonderaad.github.io/SNPfiltR/ 

If you use SNPfiltR in your published work, please cite the following papers: 

DeRaad, D.A. (2022), SNPfiltR: an R package for interactive and reproducible SNP filtering. Molecular Ecology Resources, 22, 2443-2453. http://doi.org/10.1111/1755-0998.13618 

Knaus, Brian J., and Niklaus J. Grunwald. 2017. VCFR: a package to manipulate and visualize variant call format data in R. Molecular Ecology Resources, 17.1:44-53. http://doi.org/10.1111/1755-0998.12549
Code
library(ggplot2)
#read in thinned vcf
v<-read.vcfR("~/Desktop/grosbeak.rad/grosbeak.filtered.unlinked.snps.vcf.gz")

#now move your filtered thinned vcf to the UCLA cluster (in a separate terminal window) like so:
#scp /Users/devonderaad/Desktop/grosbeak.rad/grosbeak.filtered.unlinked.snps.vcf.gz dderaad@dtn.hoffman2.idre.ucla.edu:/u/project/aguillon/dderaad/grosbeak.rad/admixture/

0.2 Use this bash code to execute ADMIXTURE on the cluster

Code
#use these bash commands to unzip the vcf files you just wrote out before going on to the next step
gunzip grosbeak.filtered.unlinked.snps.vcf.gz

#use this thinned vcf file to execute ADMIXTURE on the cluster using this script submitted as a slurm job:

#### submit_job.sh START ####
#!/bin/bash
#$ -cwd
#$ -o ./joblog.$JOB_ID.txt                   #set the job log output file
#$ -j y                                      #set error = Merged with joblog
#$ -l h_rt=1:00:00,h_data=3G      #specify requested resources (h_rt gives time request in 'hrs:mins:secs' format) (h_data specifies requested RAM per task) (highp=TRUE means run it on Aguillon Lab owned nodes)
#$ -pe shared 10                              #specify number of CPUs requested

#load necessary modules
. /u/local/Modules/default/init/modules.sh
#load plink #(v1.90b6.24)
module load plink/1.90b624

#use plink to convert vcf directly to bed format:
plink --vcf grosbeak.filtered.unlinked.snps.vcf --double-id --allow-extra-chr --make-bed --out binary_fileset
#fix chromosome names
cut -f2- binary_fileset.bim  > temp
awk 'BEGIN{FS=OFS="\t"}{print value 1 OFS $0}' temp > binary_fileset.bim

#run admixture for a K of 1-10, using cross-validation, with 10 threads
for K in 1 2 3 4 5 6 7 8 9 10; 
do /u/project/aguillon/shared_bin/admixture_linux-1.3.0/admixture --cv -j10 binary_fileset.bed $K | tee log${K}.out;
done

#Which K iteration is optimal according to ADMIXTURE ?
grep -h CV log*.out > log.errors.txt

0.3 Assess the best run based on cross-validation

Code
#Now copy your entire admixture directory into your local machine, in bash, using a command like this:
#scp -r dderaad@dtn.hoffman2.idre.ucla.edu:/u/project/aguillon/dderaad/grosbeak.rad/admixture /Users/devonderaad/Desktop/grosbeak.rad/

#read in ADMIXTURE results to R
#setwd to the admixture directory you brought in from the cluster
setwd("~/Desktop/grosbeak.rad/admixture")

#read in log error values to determine optimal K
log<-read.table("log.errors.txt")[,c(3:4)]
#use double backslash to interpret the opening parentheses literally in the regular expression
log$V3<-gsub("\\(K=", "", log$V3)
log$V3<-gsub("):", "", log$V3)
#interpret K values as numerical
log$V3<-as.numeric(log$V3)
#rename columns
colnames(log)<-c("Kvalue","cross.validation.error")

#make plot showing the cross validation error across K values 1:10
ggplot(data=log, aes(x=Kvalue, y=cross.validation.error, group=1)) +
  geom_line(linetype = "dashed")+
  geom_point()+
  ylab("cross-validation error")+
  xlab("K")+
  scale_x_continuous(breaks = c(1:10))+
  theme_classic()

Code
#lowest CV value is the ideal K value

0.4 Visualize results as bar charts

Code
#setwd to the admixture directory you brought in from the cluster
setwd("~/Desktop/grosbeak.rad/admixture")

#read in input file
sampling<-read.table("binary_fileset.fam")[,1]
#get list of input samples in order they appear
sampling
  [1] "P_hybrid_44696"         "P_hybrid_44703"         "P_hybrid_44707"        
  [4] "P_hybrid_44708"         "P_hybrid_44709"         "P_hybrid_44712"        
  [7] "P_hybrid_44762"         "P_hybrid_44771"         "P_hybrid_44781"        
 [10] "P_hybrid_45171"         "P_hybrid_45173"         "P_hybrid_45174"        
 [13] "P_ludovicianus_11998"   "P_ludovicianus_21721"   "P_ludovicianus_25286"  
 [16] "P_ludovicianus_26595"   "P_ludovicianus_33988"   "P_ludovicianus_34776"  
 [19] "P_ludovicianus_34779"   "P_ludovicianus_34782"   "P_ludovicianus_34830"  
 [22] "P_ludovicianus_44704"   "P_ludovicianus_44705"   "P_ludovicianus_44706"  
 [25] "P_ludovicianus_44710"   "P_ludovicianus_44711"   "P_ludovicianus_44713"  
 [28] "P_ludovicianus_44714"   "P_ludovicianus_44715"   "P_ludovicianus_44716"  
 [31] "P_ludovicianus_44719"   "P_ludovicianus_44720"   "P_ludovicianus_44722"  
 [34] "P_ludovicianus_44723"   "P_ludovicianus_44726"   "P_ludovicianus_44727"  
 [37] "P_ludovicianus_44729"   "P_ludovicianus_44730"   "P_ludovicianus_44731"  
 [40] "P_ludovicianus_44732"   "P_ludovicianus_44733"   "P_ludovicianus_44734"  
 [43] "P_ludovicianus_44737"   "P_ludovicianus_44738"   "P_ludovicianus_44739"  
 [46] "P_ludovicianus_44740"   "P_ludovicianus_44741"   "P_ludovicianus_44742"  
 [49] "P_ludovicianus_44743"   "P_ludovicianus_44744"   "P_ludovicianus_44745"  
 [52] "P_ludovicianus_44746"   "P_ludovicianus_44747"   "P_ludovicianus_44748"  
 [55] "P_ludovicianus_44749"   "P_ludovicianus_44753"   "P_ludovicianus_44754"  
 [58] "P_ludovicianus_44761"   "P_ludovicianus_44775"   "P_melanocephalus_34890"
 [61] "P_melanocephalus_43110" "P_melanocephalus_43276" "P_melanocephalus_44346"
 [64] "P_melanocephalus_44347" "P_melanocephalus_44471" "P_melanocephalus_44651"
 [67] "P_melanocephalus_44660" "P_melanocephalus_44666" "P_melanocephalus_44676"
 [70] "P_melanocephalus_44677" "P_melanocephalus_44678" "P_melanocephalus_44679"
 [73] "P_melanocephalus_44680" "P_melanocephalus_44681" "P_melanocephalus_44683"
 [76] "P_melanocephalus_44684" "P_melanocephalus_44685" "P_melanocephalus_44686"
 [79] "P_melanocephalus_44687" "P_melanocephalus_44688" "P_melanocephalus_44689"
 [82] "P_melanocephalus_44692" "P_melanocephalus_44693" "P_melanocephalus_44694"
 [85] "P_melanocephalus_44695" "P_melanocephalus_44697" "P_melanocephalus_44699"
 [88] "P_melanocephalus_44752" "P_melanocephalus_44760" "P_melanocephalus_44763"
 [91] "P_melanocephalus_44764" "P_melanocephalus_44765" "P_melanocephalus_44766"
 [94] "P_melanocephalus_44769" "P_melanocephalus_44770" "P_melanocephalus_44772"
 [97] "P_melanocephalus_44773" "P_melanocephalus_44774" "P_melanocephalus_44776"
[100] "P_melanocephalus_44777" "P_melanocephalus_44778" "P_melanocephalus_44779"
[103] "P_melanocephalus_44780" "P_melanocephalus_44782" "P_melanocephalus_44787"
[106] "P_melanocephalus_44788" "P_melanocephalus_44789" "P_melanocephalus_44790"
[109] "P_melanocephalus_44791" "P_melanocephalus_44792" "P_melanocephalus_44793"
[112] "P_melanocephalus_44794" "P_melanocephalus_44795" "P_melanocephalus_44798"
[115] "P_melanocephalus_44799" "P_melanocephalus_44801" "P_melanocephalus_44802"
[118] "P_melanocephalus_44803" "P_melanocephalus_44804" "P_melanocephalus_44805"
[121] "P_melanocephalus_44806" "P_melanocephalus_44808" "P_melanocephalus_44809"
[124] "P_melanocephalus_44810" "P_melanocephalus_44837" "P_melanocephalus_44840"
[127] "P_melanocephalus_44843" "P_melanocephalus_44845" "P_melanocephalus_44846"
[130] "P_melanocephalus_44847" "P_melanocephalus_44848" "P_melanocephalus_44853"
[133] "P_melanocephalus_44854" "P_melanocephalus_45175" "P_melanocephalus_45200"
[136] "P_melanocephalus_45232" "P_melanocephalus_45709" "P_melanocephalus_45926"
Code
#read in all ten runs and save each dataframe in a list
runs<-list()
#read in log files
for (i in 1:10){
  runs[[i]]<-read.table(paste0("binary_fileset.", i, ".Q"))
}

#plot runs 1:5
par(mfrow=c(1,1))
for (i in 1:5){
barplot(t(as.matrix(runs[[i]])), col=rainbow(i), ylab="Ancestry", border="black")
}

Code
#isolate run 2 (best according to CV)
run2<-runs[[2]]
#add sample info in the correct order (same as input vcf)
run2$sample<-colnames(v@gt)[-1]

#read in sample data
#read in locality and mito info
samps<-read.csv("~/Desktop/grosbeak.data.csv")
samps<-samps[samps$passed.genomic.filtering == "TRUE",] #retain only samples that passed filtering
samps$sample_id == run2$sample #check if sample info table order matches the vcf
  [1] FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE
 [13] FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE
 [25] FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE
 [37] FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE
 [49] FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE
 [61] FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE
 [73] FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE
 [85] FALSE FALSE FALSE FALSE FALSE  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE FALSE
 [97] FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE
[109] FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE
[121] FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE
[133] FALSE FALSE FALSE FALSE FALSE FALSE
Code
samps<-samps[match(run2$sample,samps$sample_id),] #use 'match' to match orders
samps$sample_id == run2$sample #check if this worked
  [1] TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE
 [16] TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE
 [31] TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE
 [46] TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE
 [61] TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE
 [76] TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE
 [91] TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE
[106] TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE
[121] TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE
[136] TRUE TRUE TRUE
Code
#add q-values to the sampling data.frame now that orders match
samps$lud.q<-run2$V1
samps$mel.q<-run2$V2

#reorder sample sheet by locality and then by q-value within each locality
samps<-samps[order(samps$site, samps$lud.q),]

#use the following info to construct a vector that splits the barplot by sampling site
table(samps$site)

 0  1  2  3  4  5  6  7  8  9 10 11 12 
 8 19  3 16 10  4  8  9 10  9 10 23  9 
Code
#plot barplots organized based on sampling details
barplot(t(as.matrix(samps[,c(40,41)])), col=c("#ef3b2c","#fff319"), ylab="Ancestry", border="black", names.arg=samps$site, cex.names=0.35,
        space=c(0,rep(0, times=7),3,rep(0, times=18),3,
                rep(0, times=2),3,rep(0, times=15),3,
                rep(0, times=9),3,rep(0, times=3),3,
                rep(0, times=7),3,rep(0, times=8),3,
                rep(0, times=9),3,rep(0, times=8),3,
                rep(0, times=9),3,rep(0, times=22),3,
                rep(0, times=8)))

0.5 repeat with singletons removed

Code
#get vcf info
v
***** Object of Class vcfR *****
138 samples
1027 CHROMs
3,584 variants
Object size: 40.7 Mb
3.403 percent missing data
*****        *****         *****
Code
#must make chromosome names non-numeric for plink or it will throw an error
v@fix[,1]<-paste("a", v@fix[,1], sep="")

#make another file with no singletons
v.x<-min_mac(v, min.mac = 2)
31.25% of SNPs fell below a minor allele count of 2 and were removed from the VCF

Code
#get info
v.x
***** Object of Class vcfR *****
138 samples
850 CHROMs
2,464 variants
Object size: 29.9 Mb
3.457 percent missing data
*****        *****         *****
Code
#write to disk
#vcfR::write.vcf(v.x, file="~/Desktop/grosbeak.rad/thinned.mac2.vcf.gz")

#now move to the UCLA cluster (in a separate terminal window) like so:
#scp /Users/devonderaad/Desktop/grosbeak.rad/thinned.mac2.vcf.gz dderaad@dtn.hoffman2.idre.ucla.edu:/u/project/aguillon/dderaad/grosbeak.rad/admixture.mac/

###Run admixture on the cluster

Code
#### submit_job.sh START ####
#!/bin/bash
#$ -cwd
#$ -o ./joblog.$JOB_ID.txt                   #set the job log output file
#$ -j y                                      #set error = Merged with joblog
#$ -l h_rt=1:00:00,h_data=3G      #specify requested resources (h_rt gives time request in 'hrs:mins:secs' format) (h_data specifies requested RAM per task) (highp=TRUE means run it on Aguillon Lab owned nodes)
#$ -pe shared 10                              #specify number of CPUs requested

#load necessary modules
. /u/local/Modules/default/init/modules.sh
#load plink #(v1.90b6.24)
module load plink/1.90b624

#use plink to convert vcf directly to bed format:
plink --vcf thinned.mac2.vcf --double-id --allow-extra-chr --make-bed --out binary_fileset
#fix chromosome names
cut -f2- binary_fileset.bim  > temp
awk 'BEGIN{FS=OFS="\t"}{print value 1 OFS $0}' temp > binary_fileset.bim

#run admixture for a K of 1-10, using cross-validation, with 10 threads
for K in 1 2 3 4 5 6 7 8 9 10; 
do /u/project/aguillon/shared_bin/admixture_linux-1.3.0/admixture --cv -j10 binary_fileset.bed $K | tee log${K}.out;
done

#Which K iteration is optimal according to ADMIXTURE ?
grep -h CV log*.out > log.errors.txt

0.6 Assess the best run based on cross-validation

Code
#Now copy your entire admixture directory into your local machine, in bash, using a command like this:
#scp -r dderaad@dtn.hoffman2.idre.ucla.edu:/u/project/aguillon/dderaad/grosbeak.rad/admixture.mac /Users/devonderaad/Desktop/grosbeak.rad/

#read in ADMIXTURE results to R
#setwd to the admixture directory you brought in from the cluster
setwd("~/Desktop/grosbeak.rad/admixture.mac")

#read in log error values to determine optimal K
log<-read.table("log.errors.txt")[,c(3:4)]
#use double backslash to interpret the opening parentheses literally in the regular expression
log$V3<-gsub("\\(K=", "", log$V3)
log$V3<-gsub("):", "", log$V3)
#interpret K values as numerical
log$V3<-as.numeric(log$V3)
#rename columns
colnames(log)<-c("Kvalue","cross.validation.error")

#make plot showing the cross validation error across K values 1:10
ggplot(data=log, aes(x=Kvalue, y=cross.validation.error, group=1)) +
  geom_line(linetype = "dashed")+
  geom_point()+
  ylab("cross-validation error")+
  xlab("K")+
  scale_x_continuous(breaks = c(1:10))+
  theme_classic()

Code
#lowest CV value is the ideal K value

0.7 Visualize results as bar charts

Code
#setwd to the admixture directory you brought in from the cluster
setwd("~/Desktop/grosbeak.rad/admixture.mac")

#read in input file
sampling<-read.table("binary_fileset.fam")[,1]
#get list of input samples in order they appear
sampling
  [1] "P_hybrid_44696"         "P_hybrid_44703"         "P_hybrid_44707"        
  [4] "P_hybrid_44708"         "P_hybrid_44709"         "P_hybrid_44712"        
  [7] "P_hybrid_44762"         "P_hybrid_44771"         "P_hybrid_44781"        
 [10] "P_hybrid_45171"         "P_hybrid_45173"         "P_hybrid_45174"        
 [13] "P_ludovicianus_11998"   "P_ludovicianus_21721"   "P_ludovicianus_25286"  
 [16] "P_ludovicianus_26595"   "P_ludovicianus_33988"   "P_ludovicianus_34776"  
 [19] "P_ludovicianus_34779"   "P_ludovicianus_34782"   "P_ludovicianus_34830"  
 [22] "P_ludovicianus_44704"   "P_ludovicianus_44705"   "P_ludovicianus_44706"  
 [25] "P_ludovicianus_44710"   "P_ludovicianus_44711"   "P_ludovicianus_44713"  
 [28] "P_ludovicianus_44714"   "P_ludovicianus_44715"   "P_ludovicianus_44716"  
 [31] "P_ludovicianus_44719"   "P_ludovicianus_44720"   "P_ludovicianus_44722"  
 [34] "P_ludovicianus_44723"   "P_ludovicianus_44726"   "P_ludovicianus_44727"  
 [37] "P_ludovicianus_44729"   "P_ludovicianus_44730"   "P_ludovicianus_44731"  
 [40] "P_ludovicianus_44732"   "P_ludovicianus_44733"   "P_ludovicianus_44734"  
 [43] "P_ludovicianus_44737"   "P_ludovicianus_44738"   "P_ludovicianus_44739"  
 [46] "P_ludovicianus_44740"   "P_ludovicianus_44741"   "P_ludovicianus_44742"  
 [49] "P_ludovicianus_44743"   "P_ludovicianus_44744"   "P_ludovicianus_44745"  
 [52] "P_ludovicianus_44746"   "P_ludovicianus_44747"   "P_ludovicianus_44748"  
 [55] "P_ludovicianus_44749"   "P_ludovicianus_44753"   "P_ludovicianus_44754"  
 [58] "P_ludovicianus_44761"   "P_ludovicianus_44775"   "P_melanocephalus_34890"
 [61] "P_melanocephalus_43110" "P_melanocephalus_43276" "P_melanocephalus_44346"
 [64] "P_melanocephalus_44347" "P_melanocephalus_44471" "P_melanocephalus_44651"
 [67] "P_melanocephalus_44660" "P_melanocephalus_44666" "P_melanocephalus_44676"
 [70] "P_melanocephalus_44677" "P_melanocephalus_44678" "P_melanocephalus_44679"
 [73] "P_melanocephalus_44680" "P_melanocephalus_44681" "P_melanocephalus_44683"
 [76] "P_melanocephalus_44684" "P_melanocephalus_44685" "P_melanocephalus_44686"
 [79] "P_melanocephalus_44687" "P_melanocephalus_44688" "P_melanocephalus_44689"
 [82] "P_melanocephalus_44692" "P_melanocephalus_44693" "P_melanocephalus_44694"
 [85] "P_melanocephalus_44695" "P_melanocephalus_44697" "P_melanocephalus_44699"
 [88] "P_melanocephalus_44752" "P_melanocephalus_44760" "P_melanocephalus_44763"
 [91] "P_melanocephalus_44764" "P_melanocephalus_44765" "P_melanocephalus_44766"
 [94] "P_melanocephalus_44769" "P_melanocephalus_44770" "P_melanocephalus_44772"
 [97] "P_melanocephalus_44773" "P_melanocephalus_44774" "P_melanocephalus_44776"
[100] "P_melanocephalus_44777" "P_melanocephalus_44778" "P_melanocephalus_44779"
[103] "P_melanocephalus_44780" "P_melanocephalus_44782" "P_melanocephalus_44787"
[106] "P_melanocephalus_44788" "P_melanocephalus_44789" "P_melanocephalus_44790"
[109] "P_melanocephalus_44791" "P_melanocephalus_44792" "P_melanocephalus_44793"
[112] "P_melanocephalus_44794" "P_melanocephalus_44795" "P_melanocephalus_44798"
[115] "P_melanocephalus_44799" "P_melanocephalus_44801" "P_melanocephalus_44802"
[118] "P_melanocephalus_44803" "P_melanocephalus_44804" "P_melanocephalus_44805"
[121] "P_melanocephalus_44806" "P_melanocephalus_44808" "P_melanocephalus_44809"
[124] "P_melanocephalus_44810" "P_melanocephalus_44837" "P_melanocephalus_44840"
[127] "P_melanocephalus_44843" "P_melanocephalus_44845" "P_melanocephalus_44846"
[130] "P_melanocephalus_44847" "P_melanocephalus_44848" "P_melanocephalus_44853"
[133] "P_melanocephalus_44854" "P_melanocephalus_45175" "P_melanocephalus_45200"
[136] "P_melanocephalus_45232" "P_melanocephalus_45709" "P_melanocephalus_45926"
Code
#read in all ten runs and save each dataframe in a list
runs<-list()
#read in log files
for (i in 1:10){
  runs[[i]]<-read.table(paste0("binary_fileset.", i, ".Q"))
}

#plot runs 1:5
par(mfrow=c(1,1))
for (i in 1:5){
barplot(t(as.matrix(runs[[i]])), col=rainbow(i), ylab="Ancestry", border="black")
}

Code
#isolate run 2 (best according to CV)
run2<-runs[[2]]
#add sample info in the correct order (same as input vcf)
run2$sample<-colnames(v@gt)[-1]

#
samps$sample_id == run2$sample #check if sample info table order matches the vcf
  [1] FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE
 [13] FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE
 [25] FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE
 [37] FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE
 [49] FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE
 [61] FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE
 [73] FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE
 [85] FALSE FALSE FALSE  TRUE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE
 [97] FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE
[109] FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE
[121] FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE
[133] FALSE FALSE FALSE FALSE FALSE FALSE
Code
samps<-samps[match(run2$sample,samps$sample_id),] #use 'match' to match orders
samps$sample_id == run2$sample #check if this worked
  [1] TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE
 [16] TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE
 [31] TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE
 [46] TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE
 [61] TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE
 [76] TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE
 [91] TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE
[106] TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE
[121] TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE
[136] TRUE TRUE TRUE
Code
#add q-values to the sampling data.frame now that orders match
samps$mac.lud.q<-run2$V1
samps$mac.mel.q<-run2$V2

#reorder sample sheet by locality and then by q-value within each locality
samps<-samps[order(samps$site, samps$lud.q),]

#use the following info to construct a vector that splits the barplot by sampling site
table(samps$site)

 0  1  2  3  4  5  6  7  8  9 10 11 12 
 8 19  3 16 10  4  8  9 10  9 10 23  9 
Code
#plot barplots organized based on sampling details with MAC filter
barplot(t(as.matrix(samps[,c(42,43)])), col=c("#ef3b2c","#fff319"), ylab="Ancestry", border="black", names.arg=samps$site, cex.names=0.35,
        space=c(0,rep(0, times=7),3,rep(0, times=18),3,
                rep(0, times=2),3,rep(0, times=15),3,
                rep(0, times=9),3,rep(0, times=3),3,
                rep(0, times=7),3,rep(0, times=8),3,
                rep(0, times=9),3,rep(0, times=8),3,
                rep(0, times=9),3,rep(0, times=22),3,
                rep(0, times=8)))

Code
#plot barplots organized based on sampling details without MAC filter
barplot(t(as.matrix(samps[,c(40,41)])), col=c("#ef3b2c","#fff319"), ylab="Ancestry", border="black", names.arg=samps$site, cex.names=0.35,
        space=c(0,rep(0, times=7),3,rep(0, times=18),3,
                rep(0, times=2),3,rep(0, times=15),3,
                rep(0, times=9),3,rep(0, times=3),3,
                rep(0, times=7),3,rep(0, times=8),3,
                rep(0, times=9),3,rep(0, times=8),3,
                rep(0, times=9),3,rep(0, times=22),3,
                rep(0, times=8)))

Code
#check the correlation between singletons removed vs not
plot(samps$lud.q,samps$mac.lud.q, pch=10, cex=1.5)
abline(a=0, b=1)

Code
cor(samps$lud.q,samps$mac.lud.q)
[1] 0.9999769

0.8 test for bimodality by site

Code
#across the transect
hist(samps$lud.q, breaks = 50, main="Histogram of Ludovicianus ancestry (with singletons)",ylim=c(0,70))

Code
hist(samps$mac.lud.q, breaks = 50, main="Histogram of Ludovicianus ancestry (no singletons)",ylim=c(0,70))

Code
#try different bins (20)
hist(samps$mac.lud.q, breaks = 20, main="Histogram of Ludovicianus ancestry (no singletons)",ylim=c(0,70))

Code
#try different bins (100)
hist(samps$mac.lud.q, breaks = 100, main="Histogram of Ludovicianus ancestry (no singletons)",ylim=c(0,70))

Code
library(diptest)
#Dip test to see if we can reject unimodality
dip.test(samps$mac.lud.q, simulate.p.value = TRUE, B = 10000)

    Hartigans' dip test for unimodality / multimodality with simulated
    p-value (based on 10000 replicates)

data:  samps$mac.lud.q
D = 0.16543, p-value < 2.2e-16
alternative hypothesis: non-unimodal, i.e., at least bimodal
Code
dip.test(samps$lud.q, simulate.p.value = TRUE, B = 10000)

    Hartigans' dip test for unimodality / multimodality with simulated
    p-value (based on 10000 replicates)

data:  samps$lud.q
D = 0.16588, p-value < 2.2e-16
alternative hypothesis: non-unimodal, i.e., at least bimodal
Code
#repeat for individual sites
hist(samps$mac.lud.q[samps$site == 8], breaks = 10, main="Histogram of Ludovicianus ancestry at site 8 (no singletons)",xlim=c(0,1))

Code
#Dip test to see if we can reject unimodality
dip.test(samps$mac.lud.q[samps$site == 8], simulate.p.value = TRUE, B = 10000)

    Hartigans' dip test for unimodality / multimodality with simulated
    p-value (based on 10000 replicates)

data:  samps$mac.lud.q[samps$site == 8]
D = 0.11004, p-value = 0.2939
alternative hypothesis: non-unimodal, i.e., at least bimodal
Code
#repeat for individual sites
hist(samps$mac.lud.q[samps$site == 9], breaks = 10, main="Histogram of Ludovicianus ancestry at site 9 (no singletons)",xlim=c(0,1))

Code
#Dip test to see if we can reject unimodality
dip.test(samps$mac.lud.q[samps$site == 9], simulate.p.value = TRUE, B = 10000)

    Hartigans' dip test for unimodality / multimodality with simulated
    p-value (based on 10000 replicates)

data:  samps$mac.lud.q[samps$site == 9]
D = 0.082275, p-value = 0.8827
alternative hypothesis: non-unimodal, i.e., at least bimodal
Code
#combine sites 8 & 9 (either side of the transition)
hist(samps$mac.lud.q[samps$site == 8 | samps$site == 9], breaks = 20, main="Histogram of Ludovicianus ancestry sites 8&9 (no singletons)",xlim=c(0,1))

Code
#Dip test to see if we can reject unimodality
dip.test(samps$mac.lud.q[samps$site == 8 | samps$site == 9], simulate.p.value = TRUE, B = 10000)

    Hartigans' dip test for unimodality / multimodality with simulated
    p-value (based on 10000 replicates)

data:  samps$mac.lud.q[samps$site == 8 | samps$site == 9]
D = 0.17822, p-value < 2.2e-16
alternative hypothesis: non-unimodal, i.e., at least bimodal
Code
#combine sites  7 - 9 (either side of the transition)
hist(samps$mac.lud.q[samps$site == 8 | samps$site == 9 | samps$site == 7], breaks = 20, main="Histogram of Ludovicianus ancestry sites 7-9 (no singletons)",xlim=c(0,1))

Code
#Dip test to see if we can reject unimodality
dip.test(samps$mac.lud.q[samps$site == 8 | samps$site == 9| samps$site == 7], simulate.p.value = TRUE, B = 10000)

    Hartigans' dip test for unimodality / multimodality with simulated
    p-value (based on 10000 replicates)

data:  samps$mac.lud.q[samps$site == 8 | samps$site == 9 | samps$site ==     7]
D = 0.13431, p-value = 1e-04
alternative hypothesis: non-unimodal, i.e., at least bimodal
Code
#combine sites  6 - 9 (either side of the transition)
hist(samps$mac.lud.q[samps$site == 8 | samps$site == 9 | samps$site == 7 | samps$site == 6], breaks = 20, main="Histogram of Ludovicianus ancestry sites 7-9 (no singletons)",xlim=c(0,1))

Code
#Dip test to see if we can reject unimodality
dip.test(samps$mac.lud.q[samps$site == 8 | samps$site == 9| samps$site == 7| samps$site == 6], simulate.p.value = TRUE, B = 10000)

    Hartigans' dip test for unimodality / multimodality with simulated
    p-value (based on 10000 replicates)

data:  samps$mac.lud.q[samps$site == 8 | samps$site == 9 | samps$site ==     7 | samps$site == 6]
D = 0.11677, p-value = 5e-04
alternative hypothesis: non-unimodal, i.e., at least bimodal
Code
#combine sites  6 - 10 (either side of the transition)
hist(samps$mac.lud.q[samps$site == 8 | samps$site == 9 | samps$site == 7 | samps$site == 6 | samps$site == 10], breaks = 20, main="Histogram of Ludovicianus ancestry sites 7-9 (no singletons)",xlim=c(0,1))

Code
#Dip test to see if we can reject unimodality
dip.test(samps$mac.lud.q[samps$site == 8 | samps$site == 9| samps$site == 7| samps$site == 6 | samps$site == 10], simulate.p.value = TRUE, B = 10000)

    Hartigans' dip test for unimodality / multimodality with simulated
    p-value (based on 10000 replicates)

data:  samps$mac.lud.q[samps$site == 8 | samps$site == 9 | samps$site ==     7 | samps$site == 6 | samps$site == 10]
D = 0.16727, p-value < 2.2e-16
alternative hypothesis: non-unimodal, i.e., at least bimodal

0.9 visualize mean ancestry across the transect (i.e., quick and dirty geographic cline)

Code
x<-c()
for (i in 1:12){
  x[i]<-mean(samps$mac.lud.q[samps$site == i])
}
x<-c(mean(samps$mac.lud.q[samps$site == 0]),x)
plot(c(0:12),x)

0.10 Make a cohesive figure summarizing these results for the paper

Code
layout(matrix(c(1,1,2,3), 2, 2, byrow = T))
layout.show(n=3) 

Code
#plot barplots organized based on sampling details with MAC filter
barplot(t(as.matrix(samps[,c(42,43)])), col=c("#ef3b2c","#fff319"), ylab="ludovicianus ancestry", border="black", names.arg=samps$site, cex.names=0.35,
        space=c(0,rep(0, times=7),3,rep(0, times=18),3,
                rep(0, times=2),3,rep(0, times=15),3,
                rep(0, times=9),3,rep(0, times=3),3,
                rep(0, times=7),3,rep(0, times=8),3,
                rep(0, times=9),3,rep(0, times=8),3,
                rep(0, times=9),3,rep(0, times=22),3,
                rep(0, times=8)))

#histogram from the entire transect
hist(samps$mac.lud.q, breaks = 50, xlab="ludovicianus ancestry (all sites)", main=NULL)
#combine sites 8 & 9 (either side of the transition)
hist(samps$mac.lud.q[samps$site == 8 | samps$site == 9], breaks = 25, xlab="ludovicianus ancestry (sites 8 & 9)", xlim=c(0,1), main=NULL)

#run dip tests to see if we can reject unimodality for the two histograms
#all sites:
dip.test(samps$mac.lud.q, simulate.p.value = FALSE)

    Hartigans' dip test for unimodality / multimodality

data:  samps$mac.lud.q
D = 0.16543, p-value < 2.2e-16
alternative hypothesis: non-unimodal, i.e., at least bimodal
Code
#sites 8&9:
dip.test(samps$mac.lud.q[samps$site == 8 | samps$site == 9], simulate.p.value = FALSE)

    Hartigans' dip test for unimodality / multimodality

data:  samps$mac.lud.q[samps$site == 8 | samps$site == 9]
D = 0.17822, p-value = 2.731e-07
alternative hypothesis: non-unimodal, i.e., at least bimodal
Code
## 1. Open a pdf file
#pdf("~/Desktop/grosbeak.rad/admixture.hists.pdf", width=9, height=6.5) 
## 2. Create a plot
layout(matrix(c(1,1,2,3), 2, 2, byrow = T))
#plot barplots organized based on sampling details with MAC filter
barplot(t(as.matrix(samps[,c(42,43)])), col=c("#ef3b2c","#fff319"), ylab="ludovicianus ancestry", border="black", names.arg=samps$site, cex.names=0.35,
        space=c(0,rep(0, times=7),3,rep(0, times=18),3,
                rep(0, times=2),3,rep(0, times=15),3,
                rep(0, times=9),3,rep(0, times=3),3,
                rep(0, times=7),3,rep(0, times=8),3,
                rep(0, times=9),3,rep(0, times=8),3,
                rep(0, times=9),3,rep(0, times=22),3,
                rep(0, times=8)))

#histogram from the entire transect
hist(samps$mac.lud.q, breaks = 50, xlab="ludovicianus ancestry (all sites)", main=NULL)
#combine sites 8 & 9 (either side of the transition)
hist(samps$mac.lud.q[samps$site == 8 | samps$site == 9], breaks = 25, xlab="ludovicianus ancestry (sites 8 & 9)", xlim=c(0,1), main=NULL)

Code
## 3. Close the pdf file
#dev.off()