run admixture iterations

#load packages
library(vcfR)

## 
##    *****       ***   vcfR   ***       *****
##    This is vcfR 1.12.0 
##      browseVignettes('vcfR') # Documentation
##      citation('vcfR') # Citation
##    *****       *****      *****       *****

library(SNPfiltR)

## This is SNPfiltR v.1.0.0
## 
## Detailed usage information is available at: devonderaad.github.io/SNPfiltR/ 
## 
## If you use SNPfiltR in your published work, please cite the following papers: 
## 
## DeRaad, D.A. (2022), SNPfiltR: an R package for interactive and reproducible SNP filtering. Molecular Ecology Resources, 00, 1-15. http://doi.org/10.1111/1755-0998.13618 
## 
## Knaus, Brian J., and Niklaus J. Grunwald. 2017. VCFR: a package to manipulate and visualize variant call format data in R. Molecular Ecology Resources, 17.1:44-53. http://doi.org/10.1111/1755-0998.12549

library(RColorBrewer)
library(ggplot2)
library(StAMPP)

## Loading required package: pegas

## Loading required package: ape

## Registered S3 method overwritten by 'pegas':
##   method      from
##   print.amova ade4

## 
## Attaching package: 'pegas'

## The following object is masked from 'package:ape':
## 
##     mst

## The following objects are masked from 'package:vcfR':
## 
##     getINFO, write.vcf

library(adegenet)

## Loading required package: ade4

## 
## Attaching package: 'ade4'

## The following object is masked from 'package:pegas':
## 
##     amova

## 
##    /// adegenet 2.1.5 is loaded ////////////
## 
##    > overview: '?adegenet'
##    > tutorials/doc/questions: 'adegenetWeb()' 
##    > bug reports/feature requests: adegenetIssues()

#read in vcf as vcfR
vcfR <- read.vcfR("~/Desktop/marsh.wren.rad/unlinked.filtered.vcf.gz")

## Scanning file to determine attributes.
## File attributes:
##   meta lines: 14
##   header_line: 15
##   variant count: 3087
##   column count: 146
## 
Meta line 14 read in.
## All meta lines processed.
## gt matrix initialized.
## Character matrix gt created.
##   Character matrix gt rows: 3087
##   Character matrix gt cols: 146
##   skip: 0
##   nrows: 3087
##   row_num: 0
## 
Processed variant 1000
Processed variant 2000
Processed variant 3000
Processed variant: 3087
## All variants processed

samps<-read.csv("~/Desktop/marsh.wren.rad/marsh.wren.rad.sampling.csv")
#retain samples that were sequenced successfully
samps<-samps[samps$Tissue.num %in% colnames(vcfR@gt),]
#reorder sampling file to match order of samples in vcf
samps<-samps[match(colnames(vcfR@gt)[-1], samps$Tissue.num),]
samps$Tissue.num == colnames(vcfR@gt)[-1] #check that order matches

##   [1] TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE
##  [16] TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE
##  [31] TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE
##  [46] TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE
##  [61] TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE
##  [76] TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE
##  [91] TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE
## [106] TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE
## [121] TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE
## [136] TRUE TRUE

table(samps$Population)

## 
##       CHAPLIN         CRANE  EYEBROW LAKE     LOSTWOODS      NEBRASKA 
##             6            16            69             9            10 
## NICOLLE FLATS 
##            27

#must make chromosome names non-numeric for plink
vcfR@fix[,1]<-paste("a", vcfR@fix[,1], sep="")
#write this version to disk
vcfR::write.vcf(vcfR, file="~/Desktop/marsh.wren.rad/admix.vcf.gz")

use this thinned file to execute ADMIXTURE on the cluster using this script:

#!/bin/sh
#
#SBATCH --job-name=admixture               # Job Name
#SBATCH --nodes=1             # 40 nodes
#SBATCH --ntasks-per-node=5              # 40 CPU allocation per Task
#SBATCH --partition=sixhour         # Name of the Slurm partition used
#SBATCH --chdir=/home/d669d153/work/marsh.wren.rad/admixture    # Set working d$
#SBATCH --mem-per-cpu=1gb            # memory requested
#SBATCH --time=360

#use plink to convert vcf directly to bed format:
/home/d669d153/work/plink --vcf unlinked.filtered.vcf --double-id --allow-extra-chr --make-bed --out binary_fileset
#fix chromosome names
cut -f2- binary_fileset.bim  > temp
awk 'BEGIN{FS=OFS="\t"}{print value 1 OFS $0}' temp > binary_fileset.bim

#run admixture for a K of 1-5, using cross-validation, with 10 threads
for K in 1 2 3 4 5; 
do /home/d669d153/work/admixture_linux-1.3.0/admixture --cv -j10 binary_fileset.bed $K | tee log${K}.out;
done

#Which K iteration is optimal according to ADMIXTURE ?
grep -h CV log*.out > log.errors.txt

#read in ADMIXTURE results
#setwd to admixture directory run on the cluster
setwd("~/Desktop/marsh.wren.rad/admixture")

#read in log error values to determine optimal K
log<-read.table("log.errors.txt")[,c(3:4)]
#use double backslash to interpret the opening parentheses literally in the regular expression
log$V3<-gsub("\\(K=", "", log$V3)
log$V3<-gsub("):", "", log$V3)
#interpret K values as numerical
log$V3<-as.numeric(log$V3)
#rename columns
colnames(log)<-c("Kvalue","cross.validation.error")

#make plot showing the cross validation error across K values 1:5
ggplot(data=log, aes(x=Kvalue, y=cross.validation.error, group=1)) +
  geom_line(linetype = "dashed")+
  geom_point()+
  ylab("cross-validation error")+
  xlab("K")+
  scale_x_continuous(breaks = c(1:5))+
  theme_classic()

setwd("~/Desktop/marsh.wren.rad/admixture")
#read in input file in order to get list of input samples in order
sampling<-read.table("binary_fileset.fam")[,1]
sampling

##   [1] "B00908" "B00910" "B00911" "B00912" "B00913" "B00914" "B00915" "B00916"
##   [9] "B00927" "B00928" "B00929" "B00930" "B00931" "B00932" "B00934" "B00935"
##  [17] "B00936" "B00937" "B00938" "B00939" "B00940" "B00941" "B00942" "B00943"
##  [25] "B00944" "B00945" "B00946" "B00947" "B00948" "B00949" "B00950" "B00951"
##  [33] "B00953" "B00954" "B00955" "B00956" "B00957" "B00958" "B00959" "B00960"
##  [41] "B00961" "B00962" "B00963" "B00964" "B00965" "B00966" "B00967" "B02324"
##  [49] "B02325" "B02326" "B02327" "B02328" "B02330" "B02332" "B02333" "B02334"
##  [57] "B02336" "B02337" "B02338" "B02339" "B02340" "B02341" "B02342" "B02343"
##  [65] "B02344" "B02345" "B02346" "B02347" "B02348" "B02350" "B02351" "B02352"
##  [73] "B02353" "B02354" "B02355" "B02358" "B02360" "B02361" "B02363" "B02364"
##  [81] "B02366" "B02367" "B02368" "B02371" "B02372" "B02373" "B02374" "B02375"
##  [89] "B02376" "B02377" "B02378" "B02379" "B02380" "B02381" "B02382" "B02383"
##  [97] "B02385" "B02386" "B02387" "B02388" "B02389" "B02390" "B02391" "B02392"
## [105] "B02393" "B02394" "B02395" "B02397" "B02438" "B02439" "B02440" "B02441"
## [113] "B02449" "B02456" "B02458" "B02459" "B02460" "B02461" "B02464" "B02478"
## [121] "B02546" "B02569" "B02578" "B02588" "B02594" "B02609" "B02620" "25372" 
## [129] "25374"  "25375"  "25376"  "25377"  "25379"  "25380"  "25381"  "25382" 
## [137] "25383"

#read in all ten runs and save each dataframe in a list
runs<-list()
#read in log files
for (i in 1:5){
  runs[[i]]<-read.table(paste0("binary_fileset.", i, ".Q"))
}

#plot each run
for (i in 1:5){
barplot(t(as.matrix(runs[[i]])), col=rainbow(i), ylab="Ancestry", border="black")
}

reorder barplot by q value

#check that order matches
sampling == samps$Tissue.num

##   [1] TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE
##  [16] TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE
##  [31] TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE
##  [46] TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE
##  [61] TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE
##  [76] TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE
##  [91] TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE
## [106] TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE
## [121] TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE
## [136] TRUE TRUE

#save q value
samps$western.qvalue<-runs[[2]][,2]
#write to disk
#write.csv(x = samps, file="~/Desktop/marsh.wren.rad/samples.with.qvalues.csv")
#reorder dataframe
runs[[2]]<-runs[[2]][order(runs[[2]]$V1),]
#replot
barplot(t(as.matrix(runs[[2]])), col=c("#8C510A","#01665E"), ylab="Ancestry", border="black")

## Open a pdf file
#pdf("~/Desktop/marsh.wren.rad/admixture.pdf", width=8.5, height=4)
##replot
#barplot(t(as.matrix(runs[[2]])), col=c("#8C510A","#01665E"), ylab="Ancestry proportion", #border="black")
##close
#dev.off()

assess the mean ancestry of “pure parental individuals” for each lineage (i.e., greater than 85% assignment to one of the two ancestry bins)

#get eastern qvalue
eastern.qvalue<-1-samps$western.qvalue

#set breakpoints
ax<-seq(.85,1, by= .015)

#plot overlaid histograms colored by parental pop
plot(hist(eastern.qvalue[eastern.qvalue > .85], breaks=ax, plot=FALSE), col=alpha("#8C510A", .5))
plot(hist(samps$western.qvalue[samps$western.qvalue > .85], breaks=ax, plot=FALSE), col=alpha("#01665E", .5), add=TRUE)
abline(v=mean(eastern.qvalue[eastern.qvalue > .85]), col=alpha("#8C510A"), lwd=3, lty=2)
abline(v=mean(samps$western.qvalue[samps$western.qvalue > .85]), col=alpha("#01665E"), lwd=3, lty=2)

#perform T-test with null hypothesis of equal means
t.test(x = samps$western.qvalue[samps$western.qvalue > .85],y = eastern.qvalue[eastern.qvalue > .85])

## 
##  Welch Two Sample t-test
## 
## data:  samps$western.qvalue[samps$western.qvalue > 0.85] and eastern.qvalue[eastern.qvalue > 0.85]
## t = -4.6405, df = 86.432, p-value = 1.228e-05
## alternative hypothesis: true difference in means is not equal to 0
## 95 percent confidence interval:
##  -0.03562960 -0.01425921
## sample estimates:
## mean of x mean of y 
## 0.9697939 0.9947383

#also significant with the parental cutoff set to .75
t.test(x = samps$western.qvalue[samps$western.qvalue > .75],y = eastern.qvalue[eastern.qvalue > .75])

## 
##  Welch Two Sample t-test
## 
## data:  samps$western.qvalue[samps$western.qvalue > 0.75] and eastern.qvalue[eastern.qvalue > 0.75]
## t = -3.8122, df = 113.42, p-value = 0.0002244
## alternative hypothesis: true difference in means is not equal to 0
## 95 percent confidence interval:
##  -0.04619086 -0.01459941
## sample estimates:
## mean of x mean of y 
## 0.9603235 0.9907187

# Open a pdf file
#pdf("~/Desktop/marsh.wren.rad/admixture.tail.histograms.pdf", width=8.5, height=4)
##replot
#plot(hist(eastern.qvalue[eastern.qvalue > .85], breaks=ax, plot=FALSE), col=alpha("#8C510A", .5))
#plot(hist(samps$western.qvalue[samps$western.qvalue > .85], breaks=ax, plot=FALSE), col=alpha("#01665E", .5), #add=TRUE)
#abline(v=mean(eastern.qvalue[eastern.qvalue > .85]), col=alpha("#8C510A"), lwd=3, lty=2)
#abline(v=mean(samps$western.qvalue[samps$western.qvalue > .85]), col=alpha("#01665E"), lwd=3, lty=2)
##close
#dev.off()

run admixture iterations

Devon DeRaad

2022-10-03

use this thinned file to execute ADMIXTURE on the cluster using this script:

reorder barplot by q value

assess the mean ancestry of “pure parental individuals” for each lineage (i.e., greater than 85% assignment to one of the two ancestry bins)