read in thinned (i.e., linkage-filtered) vcf to r and remove
outgroup
library(vcfR)
##
## ***** *** vcfR *** *****
## This is vcfR 1.12.0
## browseVignettes('vcfR') # Documentation
## citation('vcfR') # Citation
## ***** ***** ***** *****
library(SNPfiltR)
## This is SNPfiltR v.1.0.0
##
## Detailed usage information is available at: devonderaad.github.io/SNPfiltR/
##
## If you use SNPfiltR in your published work, please cite the following papers:
##
## DeRaad, D.A. (2022), SNPfiltR: an R package for interactive and reproducible SNP filtering. Molecular Ecology Resources, 00, 1-15. http://doi.org/10.1111/1755-0998.13618
##
## Knaus, Brian J., and Niklaus J. Grunwald. 2017. VCFR: a package to manipulate and visualize variant call format data in R. Molecular Ecology Resources, 17.1:44-53. http://doi.org/10.1111/1755-0998.12549
library(ggplot2)
#read in thinned vcf
v<-read.vcfR("~/Desktop/phil.dicaeum/filtered.85.unlinked.vcf")
## Scanning file to determine attributes.
## File attributes:
## meta lines: 14
## header_line: 15
## variant count: 2590
## column count: 69
##
Meta line 14 read in.
## All meta lines processed.
## gt matrix initialized.
## Character matrix gt created.
## Character matrix gt rows: 2590
## Character matrix gt cols: 69
## skip: 0
## nrows: 2590
## row_num: 0
##
Processed variant 1000
Processed variant 2000
Processed variant: 2590
## All variants processed
#must make chromosome names non-numeric for plink or it will throw an error
v@fix[,1]<-paste("a", v@fix[,1], sep="")
#remove outgroup samples
v<-v[,colnames(v@gt) != "D_nigrilore_KU28413" & colnames(v@gt) != "D_nigrilore_KU28414"]
#remove invariant sites
v<-min_mac(v, min.mac = 1)
## 15.52% of SNPs fell below a minor allele count of 1 and were removed from the VCF

#make another file with no singletons
v.x<-min_mac(v, min.mac = 2)
## 51.23% of SNPs fell below a minor allele count of 2 and were removed from the VCF

#write to disk
#vcfR::write.vcf(v, file="~/Desktop/phil.dicaeum/thinned.nooutgroup.vcf.gz")
#vcfR::write.vcf(v.x, file="~/Desktop/phil.dicaeum/thinned.nooutgroup.mac.vcf.gz")
use this bash code to execute ADMIXTURE on the cluster
#use these bash commands to unzip the vcf files you just wrote out
gunzip thinned.nooutgroup.vcf.gz
gunzip thinned.nooutgroup.mac.vcf.gz
#use this thinned vcf file to execute ADMIXTURE on the cluster using this script submitted as a slurm job:
#!/bin/sh
#
#SBATCH --job-name=admixture # Job Name
#SBATCH --nodes=1 # 40 nodes
#SBATCH --ntasks-per-node=10 # 40 CPU allocation per Task
#SBATCH --partition=sixhour # Name of the Slurm partition used
#SBATCH --chdir=/home/d669d153/work/phil.dicaeum/admixture # Set working d$
#SBATCH --mem-per-cpu=1gb # memory requested
#SBATCH --time=360
#use plink to convert vcf directly to bed format:
/home/d669d153/work/plink --vcf thinned.nooutgroup.vcf --double-id --allow-extra-chr --make-bed --out binary_fileset
#fix chromosome names
cut -f2- binary_fileset.bim > temp
awk 'BEGIN{FS=OFS="\t"}{print value 1 OFS $0}' temp > binary_fileset.bim
#run admixture for a K of 1-10, using cross-validation, with 10 threads
for K in 1 2 3 4 5 6 7 8 9 10;
do /home/d669d153/work/admixture_linux-1.3.0/admixture --cv -j10 binary_fileset.bed $K | tee log${K}.out;
done
#Which K iteration is optimal according to ADMIXTURE ?
grep -h CV log*.out > log.errors.txt
visualize the results in R
#Now copy your entire admixture directory into your local machine, in bash, using a command like this:
#scp -r d669d153@hpc.crc.ku.edu:/home/d669d153/work/phil.dicaeum/admixture /Users/devder/Desktop/phil.dicaeum/
#read in ADMIXTURE results to R
#setwd to the admixture directory you brought in from the cluster
setwd("~/Desktop/phil.dicaeum/admixture")
#read in log error values to determine optimal K
log<-read.table("log.errors.txt")[,c(3:4)]
#use double backslash to interpret the opening parentheses literally in the regular expression
log$V3<-gsub("\\(K=", "", log$V3)
log$V3<-gsub("):", "", log$V3)
#interpret K values as numerical
log$V3<-as.numeric(log$V3)
#rename columns
colnames(log)<-c("Kvalue","cross.validation.error")
#make plot showing the cross validation error across K values 1:10
ggplot(data=log, aes(x=Kvalue, y=cross.validation.error, group=1)) +
geom_line(linetype = "dashed")+
geom_point()+
ylab("cross-validation error")+
xlab("K")+
scale_x_continuous(breaks = c(1:10))+
theme_classic()

#lowest CV value is the ideal K value
visualize results as bar charts
#setwd to the admixture directory you brought in from the cluster
setwd("~/Desktop/phil.dicaeum/admixture")
#read in input file
sampling<-read.table("binary_fileset.fam")[,1]
#get list of input samples in order they appear
sampling
## [1] "D_hypoleucum_18070" "D_hypoleucum_18191"
## [3] "D_hypoleucum_14037" "D_hypoleucum_14079"
## [5] "D_hypoleucum_14065" "D_hypoleucum_14120"
## [7] "D_hypoleucum_18159" "D_hypoleucum_14075"
## [9] "D_hypoleucum_20218" "D_hypoleucum_19177"
## [11] "D_hypoleucum_19178" "D_hypoleucum_20213"
## [13] "D_hypoleucum_FMNH454949" "D_hypoleucum_25637"
## [15] "D_hypoleucum_1271" "D_hypoleucum_1273"
## [17] "D_hypoleucum_1275" "D_hypoleucum_25921"
## [19] "D_hypoleucum_3274" "D_hypoleucum_26984"
## [21] "D_hypoleucum_3208" "D_hypoleucum_3158"
## [23] "D_hypoleucum_2253" "D_hypoleucum_3095"
## [25] "D_hypoleucum_462070" "D_hypoleucum_25675"
## [27] "D_hypoleucum_454950" "D_hypoleucum_25880"
## [29] "D_hypoleucum_3314" "D_hypoleucum_357608"
## [31] "D_hypoleucum_357615" "D_hypoleucum_357612"
## [33] "D_hypoleucum_19638" "D_hypoleucum_25868"
## [35] "D_hypoleucum_17969" "D_hypoleucum_25672"
## [37] "D_hypoleucum_26975" "D_hypoleucum_2229"
## [39] "D_hypoleucum_2067" "D_hypoleucum_28329"
## [41] "D_hypoleucum_28361" "D_hypoleucum_28376"
## [43] "D_hypoleucum_2208" "D_hypoleucum_28294"
## [45] "D_hypoleucum_1956" "D_hypoleucum_25670"
## [47] "D_hypoleucum_20921" "D_hypoleucum_20193"
## [49] "D_hypoleucum_28416" "D_hypoleucum_18193"
## [51] "D_hypoleucum_27450" "D_hypoleucum_19046"
## [53] "D_hypoleucum_19136" "D_hypoleucum_28676"
## [55] "D_hypoleucum_29945" "D_hypoleucum_31636"
## [57] "D_hypoleucum_29951" "D_hypoleucum_31644"
#read in all ten runs and save each dataframe in a list
runs<-list()
#read in log files
for (i in 1:10){
runs[[i]]<-read.table(paste0("binary_fileset.", i, ".Q"))
}
#plot each run
par(mfrow=c(1,1))
for (i in 1:5){
barplot(t(as.matrix(runs[[i]])), col=rainbow(i), ylab="Ancestry", border="black")
}





#read in sample data
pops<-read.csv("~/Desktop/phil.dicaeum/dicaeum.sampling.csv")
pops<-pops[pops$ID %in% colnames(v@gt),]
pops$Taxa[c(23,38,39,43,45)]<-"mtbusa"
#reorder the sample info file to match the order of samples in the vcf
pops<-pops[order(match(pops$ID,colnames(v@gt)[-1])),]
pops$Taxa[order(pops$Taxa)]
## [1] "Luzon" "Luzon" "Luzon" "Luzon" "Luzon" "Luzon"
## [7] "Luzon" "Luzon" "Luzon" "Luzon" "Luzon" "Luzon"
## [13] "Luzon" "Luzon" "Luzon" "Luzon" "Luzon" "Luzon"
## [19] "Mindanao" "Mindanao" "Mindanao" "Mindanao" "Mindanao" "Mindanao"
## [25] "Mindanao" "Mindanao" "Mindanao" "Mindanao" "Mindanao" "Mindanao"
## [31] "Mindanao" "Mindanao" "Mindanao" "Mindanao" "Mindanao" "Mindanao"
## [37] "Mindanao" "Mindanao" "Mindanao" "Mindanao" "Mindanao" "Mindanao"
## [43] "Mindanao" "Mindanao" "Mindanao" "Mindanao" "mtbusa" "mtbusa"
## [49] "mtbusa" "mtbusa" "mtbusa" "Zamboanga" "Zamboanga" "Zamboanga"
## [55] "Zamboanga" "Zamboanga" "Zamboanga" "Zamboanga"
#check sample order
pops$ID[order(pops$Taxa)]
## [1] "D_hypoleucum_18070" "D_hypoleucum_20218"
## [3] "D_hypoleucum_20213" "D_hypoleucum_FMNH454949"
## [5] "D_hypoleucum_25637" "D_hypoleucum_25921"
## [7] "D_hypoleucum_26984" "D_hypoleucum_462070"
## [9] "D_hypoleucum_25675" "D_hypoleucum_454950"
## [11] "D_hypoleucum_25880" "D_hypoleucum_19638"
## [13] "D_hypoleucum_25868" "D_hypoleucum_17969"
## [15] "D_hypoleucum_25672" "D_hypoleucum_26975"
## [17] "D_hypoleucum_25670" "D_hypoleucum_20193"
## [19] "D_hypoleucum_14037" "D_hypoleucum_14079"
## [21] "D_hypoleucum_14065" "D_hypoleucum_14120"
## [23] "D_hypoleucum_14075" "D_hypoleucum_1271"
## [25] "D_hypoleucum_1273" "D_hypoleucum_1275"
## [27] "D_hypoleucum_3274" "D_hypoleucum_3208"
## [29] "D_hypoleucum_3158" "D_hypoleucum_3095"
## [31] "D_hypoleucum_3314" "D_hypoleucum_357608"
## [33] "D_hypoleucum_357615" "D_hypoleucum_357612"
## [35] "D_hypoleucum_28329" "D_hypoleucum_28361"
## [37] "D_hypoleucum_28376" "D_hypoleucum_28294"
## [39] "D_hypoleucum_20921" "D_hypoleucum_28416"
## [41] "D_hypoleucum_27450" "D_hypoleucum_19046"
## [43] "D_hypoleucum_19136" "D_hypoleucum_28676"
## [45] "D_hypoleucum_31636" "D_hypoleucum_31644"
## [47] "D_hypoleucum_2253" "D_hypoleucum_2229"
## [49] "D_hypoleucum_2067" "D_hypoleucum_2208"
## [51] "D_hypoleucum_1956" "D_hypoleucum_18191"
## [53] "D_hypoleucum_18159" "D_hypoleucum_19177"
## [55] "D_hypoleucum_19178" "D_hypoleucum_18193"
## [57] "D_hypoleucum_29945" "D_hypoleucum_29951"
#reorder samples based on sampling locality for k=2/3
runs[[2]]<-runs[[2]][order(pops$Taxa),]
runs[[3]]<-runs[[3]][order(pops$Taxa),]
#plot barplots for the two most relevant runs
barplot(t(as.matrix(runs[[2]])), col=c("#B2182B","#4D4D4D"), ylab="Ancestry", border="black")

barplot(t(as.matrix(runs[[3]])), col=c("#B2182B","#4D4D4D","black"), ylab="Ancestry", border="black")

Repeat with singletons removed
#use this thinned vcf file to execute ADMIXTURE on the cluster using this script submitted as a slurm job:
#!/bin/sh
#
#SBATCH --job-name=admixture # Job Name
#SBATCH --nodes=1 # 40 nodes
#SBATCH --ntasks-per-node=10 # 40 CPU allocation per Task
#SBATCH --partition=sixhour # Name of the Slurm partition used
#SBATCH --chdir=/home/d669d153/work/phil.dicaeum/admixture.mac # Set working d$
#SBATCH --mem-per-cpu=1gb # memory requested
#SBATCH --time=360
#use plink to convert vcf directly to bed format:
/home/d669d153/work/plink --vcf thinned.nooutgroup.mac.vcf --double-id --allow-extra-chr --make-bed --out binary_fileset
#fix chromosome names
cut -f2- binary_fileset.bim > temp
awk 'BEGIN{FS=OFS="\t"}{print value 1 OFS $0}' temp > binary_fileset.bim
#run admixture for a K of 1-10, using cross-validation, with 10 threads
for K in 1 2 3 4 5 6 7 8 9 10;
do /home/d669d153/work/admixture_linux-1.3.0/admixture --cv -j10 binary_fileset.bed $K | tee log${K}.out;
done
#Which K iteration is optimal according to ADMIXTURE ?
grep -h CV log*.out > log.errors.txt
visualize the results in R
#Now copy your entire admixture directory into your local machine, in bash, using a command like this:
#scp -r d669d153@hpc.crc.ku.edu:/home/d669d153/work/phil.dicaeum/admixture.mac /Users/devder/Desktop/phil.dicaeum/
#read in ADMIXTURE results to R
#setwd to the admixture directory you brought in from the cluster
setwd("~/Desktop/phil.dicaeum/admixture.mac")
#read in log error values to determine optimal K
log<-read.table("log.errors.txt")[,c(3:4)]
#use double backslash to interpret the opening parentheses literally in the regular expression
log$V3<-gsub("\\(K=", "", log$V3)
log$V3<-gsub("):", "", log$V3)
#interpret K values as numerical
log$V3<-as.numeric(log$V3)
#rename columns
colnames(log)<-c("Kvalue","cross.validation.error")
#make plot showing the cross validation error across K values 1:10
ggplot(data=log, aes(x=Kvalue, y=cross.validation.error, group=1)) +
geom_line(linetype = "dashed")+
geom_point()+
ylab("cross-validation error")+
xlab("K")+
scale_x_continuous(breaks = c(1:10))+
theme_classic()

#lowest CV value is the ideal K value
visualize results as bar charts
#setwd to the admixture directory you brought in from the cluster
setwd("~/Desktop/phil.dicaeum/admixture.mac")
#read in input file
sampling<-read.table("binary_fileset.fam")[,1]
#get list of input samples in order they appear
sampling
## [1] "D_hypoleucum_18070" "D_hypoleucum_18191"
## [3] "D_hypoleucum_14037" "D_hypoleucum_14079"
## [5] "D_hypoleucum_14065" "D_hypoleucum_14120"
## [7] "D_hypoleucum_18159" "D_hypoleucum_14075"
## [9] "D_hypoleucum_20218" "D_hypoleucum_19177"
## [11] "D_hypoleucum_19178" "D_hypoleucum_20213"
## [13] "D_hypoleucum_FMNH454949" "D_hypoleucum_25637"
## [15] "D_hypoleucum_1271" "D_hypoleucum_1273"
## [17] "D_hypoleucum_1275" "D_hypoleucum_25921"
## [19] "D_hypoleucum_3274" "D_hypoleucum_26984"
## [21] "D_hypoleucum_3208" "D_hypoleucum_3158"
## [23] "D_hypoleucum_2253" "D_hypoleucum_3095"
## [25] "D_hypoleucum_462070" "D_hypoleucum_25675"
## [27] "D_hypoleucum_454950" "D_hypoleucum_25880"
## [29] "D_hypoleucum_3314" "D_hypoleucum_357608"
## [31] "D_hypoleucum_357615" "D_hypoleucum_357612"
## [33] "D_hypoleucum_19638" "D_hypoleucum_25868"
## [35] "D_hypoleucum_17969" "D_hypoleucum_25672"
## [37] "D_hypoleucum_26975" "D_hypoleucum_2229"
## [39] "D_hypoleucum_2067" "D_hypoleucum_28329"
## [41] "D_hypoleucum_28361" "D_hypoleucum_28376"
## [43] "D_hypoleucum_2208" "D_hypoleucum_28294"
## [45] "D_hypoleucum_1956" "D_hypoleucum_25670"
## [47] "D_hypoleucum_20921" "D_hypoleucum_20193"
## [49] "D_hypoleucum_28416" "D_hypoleucum_18193"
## [51] "D_hypoleucum_27450" "D_hypoleucum_19046"
## [53] "D_hypoleucum_19136" "D_hypoleucum_28676"
## [55] "D_hypoleucum_29945" "D_hypoleucum_31636"
## [57] "D_hypoleucum_29951" "D_hypoleucum_31644"
#read in all ten runs and save each dataframe in a list
runs<-list()
#read in log files
for (i in 1:10){
runs[[i]]<-read.table(paste0("binary_fileset.", i, ".Q"))
}
#plot each run
par(mfrow=c(1,1))
for (i in 1:5){
barplot(t(as.matrix(runs[[i]])), col=rainbow(i), ylab="Ancestry", border="black")
}





#reorder samples based on sampling locality for k=2/3
pops<-read.csv("~/Desktop/phil.dicaeum/dicaeum.sampling.csv")
pops<-pops[pops$ID %in% colnames(v@gt),]
pops$Taxa[c(23,38,39,43,45)]<-"mtbusa"
#reorder the sample info file to match the order of samples in the vcf
pops<-pops[order(match(pops$ID,colnames(v@gt)[-1])),]
pops$Taxa[order(pops$Taxa)]
## [1] "Luzon" "Luzon" "Luzon" "Luzon" "Luzon" "Luzon"
## [7] "Luzon" "Luzon" "Luzon" "Luzon" "Luzon" "Luzon"
## [13] "Luzon" "Luzon" "Luzon" "Luzon" "Luzon" "Luzon"
## [19] "Mindanao" "Mindanao" "Mindanao" "Mindanao" "Mindanao" "Mindanao"
## [25] "Mindanao" "Mindanao" "Mindanao" "Mindanao" "Mindanao" "Mindanao"
## [31] "Mindanao" "Mindanao" "Mindanao" "Mindanao" "Mindanao" "Mindanao"
## [37] "Mindanao" "Mindanao" "Mindanao" "Mindanao" "Mindanao" "Mindanao"
## [43] "Mindanao" "Mindanao" "Mindanao" "Mindanao" "mtbusa" "mtbusa"
## [49] "mtbusa" "mtbusa" "mtbusa" "Zamboanga" "Zamboanga" "Zamboanga"
## [55] "Zamboanga" "Zamboanga" "Zamboanga" "Zamboanga"
#check sample order
pops$ID[order(pops$Taxa)]
## [1] "D_hypoleucum_18070" "D_hypoleucum_20218"
## [3] "D_hypoleucum_20213" "D_hypoleucum_FMNH454949"
## [5] "D_hypoleucum_25637" "D_hypoleucum_25921"
## [7] "D_hypoleucum_26984" "D_hypoleucum_462070"
## [9] "D_hypoleucum_25675" "D_hypoleucum_454950"
## [11] "D_hypoleucum_25880" "D_hypoleucum_19638"
## [13] "D_hypoleucum_25868" "D_hypoleucum_17969"
## [15] "D_hypoleucum_25672" "D_hypoleucum_26975"
## [17] "D_hypoleucum_25670" "D_hypoleucum_20193"
## [19] "D_hypoleucum_14037" "D_hypoleucum_14079"
## [21] "D_hypoleucum_14065" "D_hypoleucum_14120"
## [23] "D_hypoleucum_14075" "D_hypoleucum_1271"
## [25] "D_hypoleucum_1273" "D_hypoleucum_1275"
## [27] "D_hypoleucum_3274" "D_hypoleucum_3208"
## [29] "D_hypoleucum_3158" "D_hypoleucum_3095"
## [31] "D_hypoleucum_3314" "D_hypoleucum_357608"
## [33] "D_hypoleucum_357615" "D_hypoleucum_357612"
## [35] "D_hypoleucum_28329" "D_hypoleucum_28361"
## [37] "D_hypoleucum_28376" "D_hypoleucum_28294"
## [39] "D_hypoleucum_20921" "D_hypoleucum_28416"
## [41] "D_hypoleucum_27450" "D_hypoleucum_19046"
## [43] "D_hypoleucum_19136" "D_hypoleucum_28676"
## [45] "D_hypoleucum_31636" "D_hypoleucum_31644"
## [47] "D_hypoleucum_2253" "D_hypoleucum_2229"
## [49] "D_hypoleucum_2067" "D_hypoleucum_2208"
## [51] "D_hypoleucum_1956" "D_hypoleucum_18191"
## [53] "D_hypoleucum_18159" "D_hypoleucum_19177"
## [55] "D_hypoleucum_19178" "D_hypoleucum_18193"
## [57] "D_hypoleucum_29945" "D_hypoleucum_29951"
runs[[2]]<-runs[[2]][order(pops$Taxa),]
runs[[3]]<-runs[[3]][order(pops$Taxa),]
#plot barplots for the two most relevant runs
barplot(t(as.matrix(runs[[2]])), col=c("white","gray"), ylab="Ancestry", border="black")

barplot(t(as.matrix(runs[[3]])), col=c("black","gray","white"), ylab="Ancestry", border="black")

#these look cleaner, we will use these for publication
#save barplots
#pdf("~/Desktop/phil.dicaeum/admixture/admix.plots.pdf", width = 8, height=3.3)
#par(mfrow=c(2,1))
#par(mar = c(3, 3, 0, 0), oma = c(1, 1, 1, 1)) #set margins
#barplot(t(as.matrix(runs[[2]])), col=c("white","gray"), ylab="Ancestry", border="black")
#barplot(t(as.matrix(runs[[3]])), col=c("black","gray","white"), ylab="Ancestry", border="black")
#dev.off()
cleaned up version
#see final product
knitr::include_graphics("/Users/devder/Desktop/phil.dicaeum/admix.barplots.png")
