run ADMIXTURE on KU cluster

read in thinned (i.e., linkage-filtered) vcf to r and remove outgroup

library(vcfR)

## 
##    *****       ***   vcfR   ***       *****
##    This is vcfR 1.12.0 
##      browseVignettes('vcfR') # Documentation
##      citation('vcfR') # Citation
##    *****       *****      *****       *****

library(SNPfiltR)

## This is SNPfiltR v.1.0.0
## 
## Detailed usage information is available at: devonderaad.github.io/SNPfiltR/ 
## 
## If you use SNPfiltR in your published work, please cite the following papers: 
## 
## DeRaad, D.A. (2022), SNPfiltR: an R package for interactive and reproducible SNP filtering. Molecular Ecology Resources, 00, 1-15. http://doi.org/10.1111/1755-0998.13618 
## 
## Knaus, Brian J., and Niklaus J. Grunwald. 2017. VCFR: a package to manipulate and visualize variant call format data in R. Molecular Ecology Resources, 17.1:44-53. http://doi.org/10.1111/1755-0998.12549

library(ggplot2)
#read in thinned vcf
v<-read.vcfR("~/Desktop/phil.dicaeum/filtered.85.unlinked.vcf")

## Scanning file to determine attributes.
## File attributes:
##   meta lines: 14
##   header_line: 15
##   variant count: 2590
##   column count: 69
## 
Meta line 14 read in.
## All meta lines processed.
## gt matrix initialized.
## Character matrix gt created.
##   Character matrix gt rows: 2590
##   Character matrix gt cols: 69
##   skip: 0
##   nrows: 2590
##   row_num: 0
## 
Processed variant 1000
Processed variant 2000
Processed variant: 2590
## All variants processed

#must make chromosome names non-numeric for plink or it will throw an error
v@fix[,1]<-paste("a", v@fix[,1], sep="")

#remove outgroup samples
v<-v[,colnames(v@gt) != "D_nigrilore_KU28413" & colnames(v@gt) != "D_nigrilore_KU28414"]
#remove invariant sites
v<-min_mac(v, min.mac = 1)

## 15.52% of SNPs fell below a minor allele count of 1 and were removed from the VCF

#make another file with no singletons
v.x<-min_mac(v, min.mac = 2)

## 51.23% of SNPs fell below a minor allele count of 2 and were removed from the VCF

#write to disk
#vcfR::write.vcf(v, file="~/Desktop/phil.dicaeum/thinned.nooutgroup.vcf.gz")
#vcfR::write.vcf(v.x, file="~/Desktop/phil.dicaeum/thinned.nooutgroup.mac.vcf.gz")

use this bash code to execute ADMIXTURE on the cluster

#use these bash commands to unzip the vcf files you just wrote out
gunzip thinned.nooutgroup.vcf.gz
gunzip thinned.nooutgroup.mac.vcf.gz

#use this thinned vcf file to execute ADMIXTURE on the cluster using this script submitted as a slurm job:
#!/bin/sh
#
#SBATCH --job-name=admixture               # Job Name
#SBATCH --nodes=1             # 40 nodes
#SBATCH --ntasks-per-node=10               # 40 CPU allocation per Task
#SBATCH --partition=sixhour         # Name of the Slurm partition used
#SBATCH --chdir=/home/d669d153/work/phil.dicaeum/admixture    # Set working d$
#SBATCH --mem-per-cpu=1gb            # memory requested
#SBATCH --time=360

#use plink to convert vcf directly to bed format:
/home/d669d153/work/plink --vcf thinned.nooutgroup.vcf --double-id --allow-extra-chr --make-bed --out binary_fileset
#fix chromosome names
cut -f2- binary_fileset.bim  > temp
awk 'BEGIN{FS=OFS="\t"}{print value 1 OFS $0}' temp > binary_fileset.bim

#run admixture for a K of 1-10, using cross-validation, with 10 threads
for K in 1 2 3 4 5 6 7 8 9 10; 
do /home/d669d153/work/admixture_linux-1.3.0/admixture --cv -j10 binary_fileset.bed $K | tee log${K}.out;
done

#Which K iteration is optimal according to ADMIXTURE ?
grep -h CV log*.out > log.errors.txt

visualize the results in R

#Now copy your entire admixture directory into your local machine, in bash, using a command like this:
#scp -r d669d153@hpc.crc.ku.edu:/home/d669d153/work/phil.dicaeum/admixture /Users/devder/Desktop/phil.dicaeum/

#read in ADMIXTURE results to R
#setwd to the admixture directory you brought in from the cluster
setwd("~/Desktop/phil.dicaeum/admixture")

#read in log error values to determine optimal K
log<-read.table("log.errors.txt")[,c(3:4)]
#use double backslash to interpret the opening parentheses literally in the regular expression
log$V3<-gsub("\\(K=", "", log$V3)
log$V3<-gsub("):", "", log$V3)
#interpret K values as numerical
log$V3<-as.numeric(log$V3)
#rename columns
colnames(log)<-c("Kvalue","cross.validation.error")

#make plot showing the cross validation error across K values 1:10
ggplot(data=log, aes(x=Kvalue, y=cross.validation.error, group=1)) +
  geom_line(linetype = "dashed")+
  geom_point()+
  ylab("cross-validation error")+
  xlab("K")+
  scale_x_continuous(breaks = c(1:10))+
  theme_classic()

#lowest CV value is the ideal K value

visualize results as bar charts

#setwd to the admixture directory you brought in from the cluster
setwd("~/Desktop/phil.dicaeum/admixture")

#read in input file
sampling<-read.table("binary_fileset.fam")[,1]
#get list of input samples in order they appear
sampling

##  [1] "D_hypoleucum_18070"      "D_hypoleucum_18191"     
##  [3] "D_hypoleucum_14037"      "D_hypoleucum_14079"     
##  [5] "D_hypoleucum_14065"      "D_hypoleucum_14120"     
##  [7] "D_hypoleucum_18159"      "D_hypoleucum_14075"     
##  [9] "D_hypoleucum_20218"      "D_hypoleucum_19177"     
## [11] "D_hypoleucum_19178"      "D_hypoleucum_20213"     
## [13] "D_hypoleucum_FMNH454949" "D_hypoleucum_25637"     
## [15] "D_hypoleucum_1271"       "D_hypoleucum_1273"      
## [17] "D_hypoleucum_1275"       "D_hypoleucum_25921"     
## [19] "D_hypoleucum_3274"       "D_hypoleucum_26984"     
## [21] "D_hypoleucum_3208"       "D_hypoleucum_3158"      
## [23] "D_hypoleucum_2253"       "D_hypoleucum_3095"      
## [25] "D_hypoleucum_462070"     "D_hypoleucum_25675"     
## [27] "D_hypoleucum_454950"     "D_hypoleucum_25880"     
## [29] "D_hypoleucum_3314"       "D_hypoleucum_357608"    
## [31] "D_hypoleucum_357615"     "D_hypoleucum_357612"    
## [33] "D_hypoleucum_19638"      "D_hypoleucum_25868"     
## [35] "D_hypoleucum_17969"      "D_hypoleucum_25672"     
## [37] "D_hypoleucum_26975"      "D_hypoleucum_2229"      
## [39] "D_hypoleucum_2067"       "D_hypoleucum_28329"     
## [41] "D_hypoleucum_28361"      "D_hypoleucum_28376"     
## [43] "D_hypoleucum_2208"       "D_hypoleucum_28294"     
## [45] "D_hypoleucum_1956"       "D_hypoleucum_25670"     
## [47] "D_hypoleucum_20921"      "D_hypoleucum_20193"     
## [49] "D_hypoleucum_28416"      "D_hypoleucum_18193"     
## [51] "D_hypoleucum_27450"      "D_hypoleucum_19046"     
## [53] "D_hypoleucum_19136"      "D_hypoleucum_28676"     
## [55] "D_hypoleucum_29945"      "D_hypoleucum_31636"     
## [57] "D_hypoleucum_29951"      "D_hypoleucum_31644"

#read in all ten runs and save each dataframe in a list
runs<-list()
#read in log files
for (i in 1:10){
  runs[[i]]<-read.table(paste0("binary_fileset.", i, ".Q"))
}

#plot each run
par(mfrow=c(1,1))
for (i in 1:5){
barplot(t(as.matrix(runs[[i]])), col=rainbow(i), ylab="Ancestry", border="black")
}

#read in sample data
pops<-read.csv("~/Desktop/phil.dicaeum/dicaeum.sampling.csv")
pops<-pops[pops$ID %in% colnames(v@gt),]
pops$Taxa[c(23,38,39,43,45)]<-"mtbusa"
#reorder the sample info file to match the order of samples in the vcf
pops<-pops[order(match(pops$ID,colnames(v@gt)[-1])),]
pops$Taxa[order(pops$Taxa)]

##  [1] "Luzon"     "Luzon"     "Luzon"     "Luzon"     "Luzon"     "Luzon"    
##  [7] "Luzon"     "Luzon"     "Luzon"     "Luzon"     "Luzon"     "Luzon"    
## [13] "Luzon"     "Luzon"     "Luzon"     "Luzon"     "Luzon"     "Luzon"    
## [19] "Mindanao"  "Mindanao"  "Mindanao"  "Mindanao"  "Mindanao"  "Mindanao" 
## [25] "Mindanao"  "Mindanao"  "Mindanao"  "Mindanao"  "Mindanao"  "Mindanao" 
## [31] "Mindanao"  "Mindanao"  "Mindanao"  "Mindanao"  "Mindanao"  "Mindanao" 
## [37] "Mindanao"  "Mindanao"  "Mindanao"  "Mindanao"  "Mindanao"  "Mindanao" 
## [43] "Mindanao"  "Mindanao"  "Mindanao"  "Mindanao"  "mtbusa"    "mtbusa"   
## [49] "mtbusa"    "mtbusa"    "mtbusa"    "Zamboanga" "Zamboanga" "Zamboanga"
## [55] "Zamboanga" "Zamboanga" "Zamboanga" "Zamboanga"

#check sample order
pops$ID[order(pops$Taxa)]

##  [1] "D_hypoleucum_18070"      "D_hypoleucum_20218"     
##  [3] "D_hypoleucum_20213"      "D_hypoleucum_FMNH454949"
##  [5] "D_hypoleucum_25637"      "D_hypoleucum_25921"     
##  [7] "D_hypoleucum_26984"      "D_hypoleucum_462070"    
##  [9] "D_hypoleucum_25675"      "D_hypoleucum_454950"    
## [11] "D_hypoleucum_25880"      "D_hypoleucum_19638"     
## [13] "D_hypoleucum_25868"      "D_hypoleucum_17969"     
## [15] "D_hypoleucum_25672"      "D_hypoleucum_26975"     
## [17] "D_hypoleucum_25670"      "D_hypoleucum_20193"     
## [19] "D_hypoleucum_14037"      "D_hypoleucum_14079"     
## [21] "D_hypoleucum_14065"      "D_hypoleucum_14120"     
## [23] "D_hypoleucum_14075"      "D_hypoleucum_1271"      
## [25] "D_hypoleucum_1273"       "D_hypoleucum_1275"      
## [27] "D_hypoleucum_3274"       "D_hypoleucum_3208"      
## [29] "D_hypoleucum_3158"       "D_hypoleucum_3095"      
## [31] "D_hypoleucum_3314"       "D_hypoleucum_357608"    
## [33] "D_hypoleucum_357615"     "D_hypoleucum_357612"    
## [35] "D_hypoleucum_28329"      "D_hypoleucum_28361"     
## [37] "D_hypoleucum_28376"      "D_hypoleucum_28294"     
## [39] "D_hypoleucum_20921"      "D_hypoleucum_28416"     
## [41] "D_hypoleucum_27450"      "D_hypoleucum_19046"     
## [43] "D_hypoleucum_19136"      "D_hypoleucum_28676"     
## [45] "D_hypoleucum_31636"      "D_hypoleucum_31644"     
## [47] "D_hypoleucum_2253"       "D_hypoleucum_2229"      
## [49] "D_hypoleucum_2067"       "D_hypoleucum_2208"      
## [51] "D_hypoleucum_1956"       "D_hypoleucum_18191"     
## [53] "D_hypoleucum_18159"      "D_hypoleucum_19177"     
## [55] "D_hypoleucum_19178"      "D_hypoleucum_18193"     
## [57] "D_hypoleucum_29945"      "D_hypoleucum_29951"

#reorder samples based on sampling locality for k=2/3
runs[[2]]<-runs[[2]][order(pops$Taxa),]
runs[[3]]<-runs[[3]][order(pops$Taxa),]
#plot barplots for the two most relevant runs
barplot(t(as.matrix(runs[[2]])), col=c("#B2182B","#4D4D4D"), ylab="Ancestry", border="black")

barplot(t(as.matrix(runs[[3]])), col=c("#B2182B","#4D4D4D","black"), ylab="Ancestry", border="black")

Repeat with singletons removed

#use this thinned vcf file to execute ADMIXTURE on the cluster using this script submitted as a slurm job:
#!/bin/sh
#
#SBATCH --job-name=admixture               # Job Name
#SBATCH --nodes=1             # 40 nodes
#SBATCH --ntasks-per-node=10               # 40 CPU allocation per Task
#SBATCH --partition=sixhour         # Name of the Slurm partition used
#SBATCH --chdir=/home/d669d153/work/phil.dicaeum/admixture.mac    # Set working d$
#SBATCH --mem-per-cpu=1gb            # memory requested
#SBATCH --time=360

#use plink to convert vcf directly to bed format:
/home/d669d153/work/plink --vcf thinned.nooutgroup.mac.vcf --double-id --allow-extra-chr --make-bed --out binary_fileset
#fix chromosome names
cut -f2- binary_fileset.bim  > temp
awk 'BEGIN{FS=OFS="\t"}{print value 1 OFS $0}' temp > binary_fileset.bim

#run admixture for a K of 1-10, using cross-validation, with 10 threads
for K in 1 2 3 4 5 6 7 8 9 10; 
do /home/d669d153/work/admixture_linux-1.3.0/admixture --cv -j10 binary_fileset.bed $K | tee log${K}.out;
done

#Which K iteration is optimal according to ADMIXTURE ?
grep -h CV log*.out > log.errors.txt

visualize the results in R

#Now copy your entire admixture directory into your local machine, in bash, using a command like this:
#scp -r d669d153@hpc.crc.ku.edu:/home/d669d153/work/phil.dicaeum/admixture.mac /Users/devder/Desktop/phil.dicaeum/

#read in ADMIXTURE results to R
#setwd to the admixture directory you brought in from the cluster
setwd("~/Desktop/phil.dicaeum/admixture.mac")

#read in log error values to determine optimal K
log<-read.table("log.errors.txt")[,c(3:4)]
#use double backslash to interpret the opening parentheses literally in the regular expression
log$V3<-gsub("\\(K=", "", log$V3)
log$V3<-gsub("):", "", log$V3)
#interpret K values as numerical
log$V3<-as.numeric(log$V3)
#rename columns
colnames(log)<-c("Kvalue","cross.validation.error")

#make plot showing the cross validation error across K values 1:10
ggplot(data=log, aes(x=Kvalue, y=cross.validation.error, group=1)) +
  geom_line(linetype = "dashed")+
  geom_point()+
  ylab("cross-validation error")+
  xlab("K")+
  scale_x_continuous(breaks = c(1:10))+
  theme_classic()

#lowest CV value is the ideal K value

visualize results as bar charts

#setwd to the admixture directory you brought in from the cluster
setwd("~/Desktop/phil.dicaeum/admixture.mac")

#read in input file
sampling<-read.table("binary_fileset.fam")[,1]
#get list of input samples in order they appear
sampling

##  [1] "D_hypoleucum_18070"      "D_hypoleucum_18191"     
##  [3] "D_hypoleucum_14037"      "D_hypoleucum_14079"     
##  [5] "D_hypoleucum_14065"      "D_hypoleucum_14120"     
##  [7] "D_hypoleucum_18159"      "D_hypoleucum_14075"     
##  [9] "D_hypoleucum_20218"      "D_hypoleucum_19177"     
## [11] "D_hypoleucum_19178"      "D_hypoleucum_20213"     
## [13] "D_hypoleucum_FMNH454949" "D_hypoleucum_25637"     
## [15] "D_hypoleucum_1271"       "D_hypoleucum_1273"      
## [17] "D_hypoleucum_1275"       "D_hypoleucum_25921"     
## [19] "D_hypoleucum_3274"       "D_hypoleucum_26984"     
## [21] "D_hypoleucum_3208"       "D_hypoleucum_3158"      
## [23] "D_hypoleucum_2253"       "D_hypoleucum_3095"      
## [25] "D_hypoleucum_462070"     "D_hypoleucum_25675"     
## [27] "D_hypoleucum_454950"     "D_hypoleucum_25880"     
## [29] "D_hypoleucum_3314"       "D_hypoleucum_357608"    
## [31] "D_hypoleucum_357615"     "D_hypoleucum_357612"    
## [33] "D_hypoleucum_19638"      "D_hypoleucum_25868"     
## [35] "D_hypoleucum_17969"      "D_hypoleucum_25672"     
## [37] "D_hypoleucum_26975"      "D_hypoleucum_2229"      
## [39] "D_hypoleucum_2067"       "D_hypoleucum_28329"     
## [41] "D_hypoleucum_28361"      "D_hypoleucum_28376"     
## [43] "D_hypoleucum_2208"       "D_hypoleucum_28294"     
## [45] "D_hypoleucum_1956"       "D_hypoleucum_25670"     
## [47] "D_hypoleucum_20921"      "D_hypoleucum_20193"     
## [49] "D_hypoleucum_28416"      "D_hypoleucum_18193"     
## [51] "D_hypoleucum_27450"      "D_hypoleucum_19046"     
## [53] "D_hypoleucum_19136"      "D_hypoleucum_28676"     
## [55] "D_hypoleucum_29945"      "D_hypoleucum_31636"     
## [57] "D_hypoleucum_29951"      "D_hypoleucum_31644"

#read in all ten runs and save each dataframe in a list
runs<-list()
#read in log files
for (i in 1:10){
  runs[[i]]<-read.table(paste0("binary_fileset.", i, ".Q"))
}

#plot each run
par(mfrow=c(1,1))
for (i in 1:5){
barplot(t(as.matrix(runs[[i]])), col=rainbow(i), ylab="Ancestry", border="black")
}

#reorder samples based on sampling locality for k=2/3
pops<-read.csv("~/Desktop/phil.dicaeum/dicaeum.sampling.csv")
pops<-pops[pops$ID %in% colnames(v@gt),]
pops$Taxa[c(23,38,39,43,45)]<-"mtbusa"
#reorder the sample info file to match the order of samples in the vcf
pops<-pops[order(match(pops$ID,colnames(v@gt)[-1])),]
pops$Taxa[order(pops$Taxa)]

##  [1] "Luzon"     "Luzon"     "Luzon"     "Luzon"     "Luzon"     "Luzon"    
##  [7] "Luzon"     "Luzon"     "Luzon"     "Luzon"     "Luzon"     "Luzon"    
## [13] "Luzon"     "Luzon"     "Luzon"     "Luzon"     "Luzon"     "Luzon"    
## [19] "Mindanao"  "Mindanao"  "Mindanao"  "Mindanao"  "Mindanao"  "Mindanao" 
## [25] "Mindanao"  "Mindanao"  "Mindanao"  "Mindanao"  "Mindanao"  "Mindanao" 
## [31] "Mindanao"  "Mindanao"  "Mindanao"  "Mindanao"  "Mindanao"  "Mindanao" 
## [37] "Mindanao"  "Mindanao"  "Mindanao"  "Mindanao"  "Mindanao"  "Mindanao" 
## [43] "Mindanao"  "Mindanao"  "Mindanao"  "Mindanao"  "mtbusa"    "mtbusa"   
## [49] "mtbusa"    "mtbusa"    "mtbusa"    "Zamboanga" "Zamboanga" "Zamboanga"
## [55] "Zamboanga" "Zamboanga" "Zamboanga" "Zamboanga"

#check sample order
pops$ID[order(pops$Taxa)]

##  [1] "D_hypoleucum_18070"      "D_hypoleucum_20218"     
##  [3] "D_hypoleucum_20213"      "D_hypoleucum_FMNH454949"
##  [5] "D_hypoleucum_25637"      "D_hypoleucum_25921"     
##  [7] "D_hypoleucum_26984"      "D_hypoleucum_462070"    
##  [9] "D_hypoleucum_25675"      "D_hypoleucum_454950"    
## [11] "D_hypoleucum_25880"      "D_hypoleucum_19638"     
## [13] "D_hypoleucum_25868"      "D_hypoleucum_17969"     
## [15] "D_hypoleucum_25672"      "D_hypoleucum_26975"     
## [17] "D_hypoleucum_25670"      "D_hypoleucum_20193"     
## [19] "D_hypoleucum_14037"      "D_hypoleucum_14079"     
## [21] "D_hypoleucum_14065"      "D_hypoleucum_14120"     
## [23] "D_hypoleucum_14075"      "D_hypoleucum_1271"      
## [25] "D_hypoleucum_1273"       "D_hypoleucum_1275"      
## [27] "D_hypoleucum_3274"       "D_hypoleucum_3208"      
## [29] "D_hypoleucum_3158"       "D_hypoleucum_3095"      
## [31] "D_hypoleucum_3314"       "D_hypoleucum_357608"    
## [33] "D_hypoleucum_357615"     "D_hypoleucum_357612"    
## [35] "D_hypoleucum_28329"      "D_hypoleucum_28361"     
## [37] "D_hypoleucum_28376"      "D_hypoleucum_28294"     
## [39] "D_hypoleucum_20921"      "D_hypoleucum_28416"     
## [41] "D_hypoleucum_27450"      "D_hypoleucum_19046"     
## [43] "D_hypoleucum_19136"      "D_hypoleucum_28676"     
## [45] "D_hypoleucum_31636"      "D_hypoleucum_31644"     
## [47] "D_hypoleucum_2253"       "D_hypoleucum_2229"      
## [49] "D_hypoleucum_2067"       "D_hypoleucum_2208"      
## [51] "D_hypoleucum_1956"       "D_hypoleucum_18191"     
## [53] "D_hypoleucum_18159"      "D_hypoleucum_19177"     
## [55] "D_hypoleucum_19178"      "D_hypoleucum_18193"     
## [57] "D_hypoleucum_29945"      "D_hypoleucum_29951"

runs[[2]]<-runs[[2]][order(pops$Taxa),]
runs[[3]]<-runs[[3]][order(pops$Taxa),]
#plot barplots for the two most relevant runs
barplot(t(as.matrix(runs[[2]])), col=c("white","gray"), ylab="Ancestry", border="black")

barplot(t(as.matrix(runs[[3]])), col=c("black","gray","white"), ylab="Ancestry", border="black")

#these look cleaner, we will use these for publication

#save barplots
#pdf("~/Desktop/phil.dicaeum/admixture/admix.plots.pdf", width = 8, height=3.3)
#par(mfrow=c(2,1))
#par(mar = c(3, 3, 0, 0), oma = c(1, 1, 1, 1)) #set margins
#barplot(t(as.matrix(runs[[2]])), col=c("white","gray"), ylab="Ancestry", border="black")
#barplot(t(as.matrix(runs[[3]])), col=c("black","gray","white"), ylab="Ancestry", border="black")
#dev.off()

cleaned up version

#see final product
knitr::include_graphics("/Users/devder/Desktop/phil.dicaeum/admix.barplots.png")

run ADMIXTURE on KU cluster

Devon DeRaad

2022-12-01

read in thinned (i.e., linkage-filtered) vcf to r and remove outgroup

use this bash code to execute ADMIXTURE on the cluster

visualize the results in R

visualize results as bar charts

Repeat with singletons removed

visualize the results in R

visualize results as bar charts

cleaned up version