#load packages
library(vcfR)
##
## ***** *** vcfR *** *****
## This is vcfR 1.12.0
## browseVignettes('vcfR') # Documentation
## citation('vcfR') # Citation
## ***** ***** ***** *****
library(SNPfiltR)
## This is SNPfiltR v.1.0.0
##
## Detailed usage information is available at: devonderaad.github.io/SNPfiltR/
##
## If you use SNPfiltR in your published work, please cite the following papers:
##
## DeRaad, D.A. (2022), SNPfiltR: an R package for interactive and reproducible SNP filtering. Molecular Ecology Resources, 00, 1-15. http://doi.org/10.1111/1755-0998.13618
##
## Knaus, Brian J., and Niklaus J. Grunwald. 2017. VCFR: a package to manipulate and visualize variant call format data in R. Molecular Ecology Resources, 17.1:44-53. http://doi.org/10.1111/1755-0998.12549
library(RColorBrewer)
library(ggplot2)
library(StAMPP)
## Loading required package: pegas
## Loading required package: ape
## Registered S3 method overwritten by 'pegas':
## method from
## print.amova ade4
##
## Attaching package: 'pegas'
## The following object is masked from 'package:ape':
##
## mst
## The following objects are masked from 'package:vcfR':
##
## getINFO, write.vcf
library(adegenet)
## Loading required package: ade4
##
## Attaching package: 'ade4'
## The following object is masked from 'package:pegas':
##
## amova
##
## /// adegenet 2.1.5 is loaded ////////////
##
## > overview: '?adegenet'
## > tutorials/doc/questions: 'adegenetWeb()'
## > bug reports/feature requests: adegenetIssues()
#read in vcf as vcfR
vcfR <- read.vcfR("~/Desktop/marsh.wren.rad/unlinked.filtered.vcf.gz")
## Scanning file to determine attributes.
## File attributes:
## meta lines: 14
## header_line: 15
## variant count: 3087
## column count: 146
##
Meta line 14 read in.
## All meta lines processed.
## gt matrix initialized.
## Character matrix gt created.
## Character matrix gt rows: 3087
## Character matrix gt cols: 146
## skip: 0
## nrows: 3087
## row_num: 0
##
Processed variant 1000
Processed variant 2000
Processed variant 3000
Processed variant: 3087
## All variants processed
samps<-read.csv("~/Desktop/marsh.wren.rad/marsh.wren.rad.sampling.csv")
#retain samples that were sequenced successfully
samps<-samps[samps$Tissue.num %in% colnames(vcfR@gt),]
#reorder sampling file to match order of samples in vcf
samps<-samps[match(colnames(vcfR@gt)[-1], samps$Tissue.num),]
samps$Tissue.num == colnames(vcfR@gt)[-1] #check that order matches
## [1] TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE
## [16] TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE
## [31] TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE
## [46] TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE
## [61] TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE
## [76] TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE
## [91] TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE
## [106] TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE
## [121] TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE
## [136] TRUE TRUE
table(samps$Population)
##
## CHAPLIN CRANE EYEBROW LAKE LOSTWOODS NEBRASKA
## 6 16 69 9 10
## NICOLLE FLATS
## 27
#must make chromosome names non-numeric for plink
vcfR@fix[,1]<-paste("a", vcfR@fix[,1], sep="")
#write this version to disk
vcfR::write.vcf(vcfR, file="~/Desktop/marsh.wren.rad/admix.vcf.gz")
use this thinned file to execute ADMIXTURE on the cluster using this
script:
#!/bin/sh
#
#SBATCH --job-name=admixture # Job Name
#SBATCH --nodes=1 # 40 nodes
#SBATCH --ntasks-per-node=5 # 40 CPU allocation per Task
#SBATCH --partition=sixhour # Name of the Slurm partition used
#SBATCH --chdir=/home/d669d153/work/marsh.wren.rad/admixture # Set working d$
#SBATCH --mem-per-cpu=1gb # memory requested
#SBATCH --time=360
#use plink to convert vcf directly to bed format:
/home/d669d153/work/plink --vcf unlinked.filtered.vcf --double-id --allow-extra-chr --make-bed --out binary_fileset
#fix chromosome names
cut -f2- binary_fileset.bim > temp
awk 'BEGIN{FS=OFS="\t"}{print value 1 OFS $0}' temp > binary_fileset.bim
#run admixture for a K of 1-5, using cross-validation, with 10 threads
for K in 1 2 3 4 5;
do /home/d669d153/work/admixture_linux-1.3.0/admixture --cv -j10 binary_fileset.bed $K | tee log${K}.out;
done
#Which K iteration is optimal according to ADMIXTURE ?
grep -h CV log*.out > log.errors.txt
#read in ADMIXTURE results
#setwd to admixture directory run on the cluster
setwd("~/Desktop/marsh.wren.rad/admixture")
#read in log error values to determine optimal K
log<-read.table("log.errors.txt")[,c(3:4)]
#use double backslash to interpret the opening parentheses literally in the regular expression
log$V3<-gsub("\\(K=", "", log$V3)
log$V3<-gsub("):", "", log$V3)
#interpret K values as numerical
log$V3<-as.numeric(log$V3)
#rename columns
colnames(log)<-c("Kvalue","cross.validation.error")
#make plot showing the cross validation error across K values 1:5
ggplot(data=log, aes(x=Kvalue, y=cross.validation.error, group=1)) +
geom_line(linetype = "dashed")+
geom_point()+
ylab("cross-validation error")+
xlab("K")+
scale_x_continuous(breaks = c(1:5))+
theme_classic()

setwd("~/Desktop/marsh.wren.rad/admixture")
#read in input file in order to get list of input samples in order
sampling<-read.table("binary_fileset.fam")[,1]
sampling
## [1] "B00908" "B00910" "B00911" "B00912" "B00913" "B00914" "B00915" "B00916"
## [9] "B00927" "B00928" "B00929" "B00930" "B00931" "B00932" "B00934" "B00935"
## [17] "B00936" "B00937" "B00938" "B00939" "B00940" "B00941" "B00942" "B00943"
## [25] "B00944" "B00945" "B00946" "B00947" "B00948" "B00949" "B00950" "B00951"
## [33] "B00953" "B00954" "B00955" "B00956" "B00957" "B00958" "B00959" "B00960"
## [41] "B00961" "B00962" "B00963" "B00964" "B00965" "B00966" "B00967" "B02324"
## [49] "B02325" "B02326" "B02327" "B02328" "B02330" "B02332" "B02333" "B02334"
## [57] "B02336" "B02337" "B02338" "B02339" "B02340" "B02341" "B02342" "B02343"
## [65] "B02344" "B02345" "B02346" "B02347" "B02348" "B02350" "B02351" "B02352"
## [73] "B02353" "B02354" "B02355" "B02358" "B02360" "B02361" "B02363" "B02364"
## [81] "B02366" "B02367" "B02368" "B02371" "B02372" "B02373" "B02374" "B02375"
## [89] "B02376" "B02377" "B02378" "B02379" "B02380" "B02381" "B02382" "B02383"
## [97] "B02385" "B02386" "B02387" "B02388" "B02389" "B02390" "B02391" "B02392"
## [105] "B02393" "B02394" "B02395" "B02397" "B02438" "B02439" "B02440" "B02441"
## [113] "B02449" "B02456" "B02458" "B02459" "B02460" "B02461" "B02464" "B02478"
## [121] "B02546" "B02569" "B02578" "B02588" "B02594" "B02609" "B02620" "25372"
## [129] "25374" "25375" "25376" "25377" "25379" "25380" "25381" "25382"
## [137] "25383"
#read in all ten runs and save each dataframe in a list
runs<-list()
#read in log files
for (i in 1:5){
runs[[i]]<-read.table(paste0("binary_fileset.", i, ".Q"))
}
#plot each run
for (i in 1:5){
barplot(t(as.matrix(runs[[i]])), col=rainbow(i), ylab="Ancestry", border="black")
}





reorder barplot by q value
#check that order matches
sampling == samps$Tissue.num
## [1] TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE
## [16] TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE
## [31] TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE
## [46] TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE
## [61] TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE
## [76] TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE
## [91] TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE
## [106] TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE
## [121] TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE
## [136] TRUE TRUE
#save q value
samps$western.qvalue<-runs[[2]][,2]
#write to disk
#write.csv(x = samps, file="~/Desktop/marsh.wren.rad/samples.with.qvalues.csv")
#reorder dataframe
runs[[2]]<-runs[[2]][order(runs[[2]]$V1),]
#replot
barplot(t(as.matrix(runs[[2]])), col=c("#8C510A","#01665E"), ylab="Ancestry", border="black")

## Open a pdf file
#pdf("~/Desktop/marsh.wren.rad/admixture.pdf", width=8.5, height=4)
##replot
#barplot(t(as.matrix(runs[[2]])), col=c("#8C510A","#01665E"), ylab="Ancestry proportion", #border="black")
##close
#dev.off()
assess the mean ancestry of “pure parental individuals” for each
lineage (i.e., greater than 85% assignment to one of the two ancestry
bins)
#get eastern qvalue
eastern.qvalue<-1-samps$western.qvalue
#set breakpoints
ax<-seq(.85,1, by= .015)
#plot overlaid histograms colored by parental pop
plot(hist(eastern.qvalue[eastern.qvalue > .85], breaks=ax, plot=FALSE), col=alpha("#8C510A", .5))
plot(hist(samps$western.qvalue[samps$western.qvalue > .85], breaks=ax, plot=FALSE), col=alpha("#01665E", .5), add=TRUE)
abline(v=mean(eastern.qvalue[eastern.qvalue > .85]), col=alpha("#8C510A"), lwd=3, lty=2)
abline(v=mean(samps$western.qvalue[samps$western.qvalue > .85]), col=alpha("#01665E"), lwd=3, lty=2)

#perform T-test with null hypothesis of equal means
t.test(x = samps$western.qvalue[samps$western.qvalue > .85],y = eastern.qvalue[eastern.qvalue > .85])
##
## Welch Two Sample t-test
##
## data: samps$western.qvalue[samps$western.qvalue > 0.85] and eastern.qvalue[eastern.qvalue > 0.85]
## t = -4.6405, df = 86.432, p-value = 1.228e-05
## alternative hypothesis: true difference in means is not equal to 0
## 95 percent confidence interval:
## -0.03562960 -0.01425921
## sample estimates:
## mean of x mean of y
## 0.9697939 0.9947383
#also significant with the parental cutoff set to .75
t.test(x = samps$western.qvalue[samps$western.qvalue > .75],y = eastern.qvalue[eastern.qvalue > .75])
##
## Welch Two Sample t-test
##
## data: samps$western.qvalue[samps$western.qvalue > 0.75] and eastern.qvalue[eastern.qvalue > 0.75]
## t = -3.8122, df = 113.42, p-value = 0.0002244
## alternative hypothesis: true difference in means is not equal to 0
## 95 percent confidence interval:
## -0.04619086 -0.01459941
## sample estimates:
## mean of x mean of y
## 0.9603235 0.9907187
# Open a pdf file
#pdf("~/Desktop/marsh.wren.rad/admixture.tail.histograms.pdf", width=8.5, height=4)
##replot
#plot(hist(eastern.qvalue[eastern.qvalue > .85], breaks=ax, plot=FALSE), col=alpha("#8C510A", .5))
#plot(hist(samps$western.qvalue[samps$western.qvalue > .85], breaks=ax, plot=FALSE), col=alpha("#01665E", .5), #add=TRUE)
#abline(v=mean(eastern.qvalue[eastern.qvalue > .85]), col=alpha("#8C510A"), lwd=3, lty=2)
#abline(v=mean(samps$western.qvalue[samps$western.qvalue > .85]), col=alpha("#01665E"), lwd=3, lty=2)
##close
#dev.off()