***** *** vcfR *** *****
This is vcfR 1.14.0
browseVignettes('vcfR') # Documentation
citation('vcfR') # Citation
***** ***** ***** *****
library(SNPfiltR)
This is SNPfiltR v.1.0.2
Detailed usage information is available at: devonderaad.github.io/SNPfiltR/
If you use SNPfiltR in your published work, please cite the following papers:
DeRaad, D.A. (2022), SNPfiltR: an R package for interactive and reproducible SNP filtering. Molecular Ecology Resources, 22, 2443-2453. http://doi.org/10.1111/1755-0998.13618
Knaus, Brian J., and Niklaus J. Grunwald. 2017. VCFR: a package to manipulate and visualize variant call format data in R. Molecular Ecology Resources, 17.1:44-53. http://doi.org/10.1111/1755-0998.12549
library(ggplot2)library(RColorBrewer)
0.1 Filter input SNP files based on smaple inclusion and MAC to increase signal to noise ratio
#read in filtered vcfvcfR <-read.vcfR("~/Desktop/cali.zosterops.rad/filtered.unlinked.snps.vcf.gz")
Scanning file to determine attributes.
File attributes:
meta lines: 14
header_line: 15
variant count: 1554
column count: 133
Meta line 14 read in.
All meta lines processed.
gt matrix initialized.
Character matrix gt created.
Character matrix gt rows: 1554
Character matrix gt cols: 133
skip: 0
nrows: 1554
row_num: 0
Processed variant 1000
Processed variant: 1554
All variants processed
vcfR
***** Object of Class vcfR *****
124 samples
239 CHROMs
1,554 variants
Object size: 16.3 Mb
5.522 percent missing data
***** ***** *****
#bring in sample info#read in sample info csvsample.info<-read.csv("~/Desktop/cali.zosterops.rad/zosterops.trimmed.RAD.sampling.csv")sample.info<-sample.info[sample.info$ID %in%colnames(vcfR@gt),]table(sample.info$Species)
***** Object of Class vcfR *****
124 samples
181 CHROMs
966 variants
Object size: 10.2 Mb
5.661 percent missing data
***** ***** *****
0.2 Code to convert the vcf into appropriate file structure and run ADMIXTURE on the cluster
#!/bin/sh##SBATCH --job-name=admixture.zost # Job Name#SBATCH --nodes=1 # 40 nodes#SBATCH --ntasks-per-node=5 # 40 CPU allocation per Task#SBATCH --partition=sixhour # Name of the Slurm partition used#SBATCH --chdir=/home/d669d153/work/zosterops.rad/admixture/all.samps # Set working d$#SBATCH --mem-per-cpu=1gb # memory requested#SBATCH --time=200#use plink to convert vcf directly to bed format:/home/d669d153/work/plink--vcf /home/d669d153/work/zosterops.rad/admixture/all.samps/filtered.unlinked.snps.vcf --double-id--allow-extra-chr--make-bed--out binary_fileset#fix chromosome namescut-f2- binary_fileset.bim > tempawk'BEGIN{FS=OFS="\t"}{print value 1 OFS $0}' temp > binary_fileset.bimrm temp#run admixture for a K of 1-10, using cross-validation, with 10 threadsfor K in 1 2 3 4 5 6 7 8 9 10;do/home/d669d153/work/admixture_linux-1.3.0/admixture--cv-j5-m EM binary_fileset.bed $K|tee log${K}.out;done#Which K iteration is optimal according to ADMIXTURE ?grep-h CV log*.out > log.errors.txt
0.3 Repeat in a separate directory with singletons removed
#!/bin/sh##SBATCH --job-name=admixture.zost # Job Name#SBATCH --nodes=1 # 40 nodes#SBATCH --ntasks-per-node=5 # 40 CPU allocation per Task#SBATCH --partition=sixhour # Name of the Slurm partition used#SBATCH --chdir=/home/d669d153/work/zosterops.rad/admixture/mac2 # Set working d$#SBATCH --mem-per-cpu=1gb # memory requested#SBATCH --time=200#use plink to convert vcf directly to bed format:/home/d669d153/work/plink--vcf /home/d669d153/work/zosterops.rad/admixture/mac2/nosingletons.vcf --double-id--allow-extra-chr--make-bed--out binary_fileset#fix chromosome namescut-f2- binary_fileset.bim > tempawk'BEGIN{FS=OFS="\t"}{print value 1 OFS $0}' temp > binary_fileset.bimrm temp#run admixture for a K of 1-10, using cross-validation, with 10 threadsfor K in 1 2 3 4 5 6 7 8 9 10;do/home/d669d153/work/admixture_linux-1.3.0/admixture--cv-j5-m EM binary_fileset.bed $K|tee log${K}.out;done#Which K iteration is optimal according to ADMIXTURE ?grep-h CV log*.out > log.errors.txt
0.4 check out the run with all samples included and no mac filter
#setwd to admixture directory run on the clustersetwd("~/Desktop/cali.zosterops.rad/admixture/all.samps/")#read in log error values to determine optimal Klog<-read.table("log.errors.txt")[,c(3:4)]#use double backslash to interpret the opening parentheses literally in the regular expressionlog$V3<-gsub("\\(K=", "", log$V3)log$V3<-gsub("):", "", log$V3)#interpret K values as numericallog$V3<-as.numeric(log$V3)#rename columnscolnames(log)<-c("Kvalue","cross.validation.error")#make plot showing the cross validation error across K values 1:10ggplot(data=log, aes(x=Kvalue, y=cross.validation.error, group=1)) +geom_line(linetype ="dashed")+geom_point()+ylab("cross-validation error")+xlab("K")+scale_x_continuous(breaks =c(1:10))+theme_classic()
#read in input file in order to get list of input samples in ordersamps<-read.table("binary_fileset.fam")[,1]#reorder sampling df to match order of the plotsample.info<-sample.info[match(samps, sample.info$ID),]sample.info$ID == samps
#read in all ten runs and save each dataframe in a listruns<-list()#read in log filesfor (i in1:10){ runs[[i]]<-read.table(paste0("binary_fileset.", i, ".Q"))}par(mfrow=c(1,1))#plot each runfor (i in1:5){barplot(t(as.matrix(runs[[i]])), col=rainbow(i), ylab="Ancestry", border="black")}
for (i in6:10){barplot(t(as.matrix(runs[[i]])), col=rainbow(i), ylab="Ancestry", border="black")}
#rename rows in sampling columnrownames(sample.info)<-1:nrow(sample.info)#index out each species to reorderorderbyspecies<-c(as.numeric(rownames(sample.info[sample.info$Species =="everetti",])),as.numeric(rownames(sample.info[sample.info$Species =="nigrorum",])),as.numeric(rownames(sample.info[sample.info$Species =="palpebrosus",])),as.numeric(rownames(sample.info[sample.info$Species =="erythropleurus",])),as.numeric(rownames(sample.info[sample.info$Species =="simplex",])),as.numeric(rownames(sample.info[sample.info$Species =="japonicus",])) )#reorder sampling dfsamps.info<-sample.info[orderbyspecies,]#make df of q values by sampledf<-cbind(samps.info[,c(1,6,8)],runs[[6]][orderbyspecies,])rownames(df)<-c(1:nrow(df))#reorder runsfor(i in1:10){runs[[i]]<-t(as.matrix(runs[[i]]))[,orderbyspecies]}#plot reordered barplot of optimal K valuebarplot(runs[[6]], col=c("#882255","#332288","#DDCC77","#88CCEE","#CC6677","grey"), ylab="Ancestry", border="black")
#reorder one final time to clean it up slightly morebarplot(runs[[6]][,c(1:69,120:124,70:86,91:95,110:116,87:90,96:109,117:119)], col=c("#882255","#332288","#DDCC77","#88CCEE","#CC6677","grey"), ylab="Ancestry", border="black")
#setwd to admixture directory run on the clustersetwd("~/Desktop/cali.zosterops.rad/admixture/mac2/")#read in log error values to determine optimal Klog<-read.table("log.errors.txt")[,c(3:4)]#use double backslash to interpret the opening parentheses literally in the regular expressionlog$V3<-gsub("\\(K=", "", log$V3)log$V3<-gsub("):", "", log$V3)#interpret K values as numericallog$V3<-as.numeric(log$V3)#rename columnscolnames(log)<-c("Kvalue","cross.validation.error")#make plot showing the cross validation error across K values 1:10ggplot(data=log, aes(x=Kvalue, y=cross.validation.error, group=1)) +geom_line(linetype ="dashed")+geom_point()+ylab("cross-validation error")+xlab("K")+scale_x_continuous(breaks =c(1:10))+theme_classic()
#read in input file in order to get list of input samples in ordersamps<-read.table("binary_fileset.fam")[,1]#read in all ten runs and save each dataframe in a listruns<-list()#read in log filesfor (i in1:10){ runs[[i]]<-read.table(paste0("binary_fileset.", i, ".Q"))}par(mfrow=c(1,1))#plot each runfor (i in1:5){barplot(t(as.matrix(runs[[i]])), col=rainbow(i), ylab="Ancestry", border="black")}
for (i in6:10){barplot(t(as.matrix(runs[[i]])), col=rainbow(i), ylab="Ancestry", border="black")}
#reorder runsfor(i in1:10){runs[[i]]<-t(as.matrix(runs[[i]]))[,orderbyspecies]}#reorder one final time to clean it up slightly morebarplot(runs[[6]][,c(1:69,120:124,70:86,91:95,110:116,87:90,96:109,117:119)], col=c("#882255","#332288","#DDCC77","#88CCEE","#CC6677","grey"), ylab="Ancestry", border="black")
### MAC filter appears to make no difference, so we will just run with the full unlinked SNP dataset
0.6 Run hierarchical subsets
Because everetti and nigrorum are not split when they are run as part of the entire dataset, we will isolate them out of the vcf and re-run them separately to show that they are cleanly two separate things that are just not being split due to hierarchical structure
0.7 Run this subset in a unique directory on the cluster
#!/bin/sh##SBATCH --job-name=admixture.zost # Job Name#SBATCH --nodes=1 # 40 nodes#SBATCH --ntasks-per-node=5 # 40 CPU allocation per Task#SBATCH --partition=sixhour # Name of the Slurm partition used#SBATCH --chdir=/home/d669d153/work/zosterops.rad/admixture/ev.ni # Set working d$#SBATCH --mem-per-cpu=1gb # memory requested#SBATCH --time=200#use plink to convert vcf directly to bed format:/home/d669d153/work/plink--vcf /home/d669d153/work/zosterops.rad/admixture/mac2/ev.ni.vcf --double-id--allow-extra-chr--make-bed--out binary_fileset#fix chromosome namescut-f2- binary_fileset.bim > tempawk'BEGIN{FS=OFS="\t"}{print value 1 OFS $0}' temp > binary_fileset.bimrm temp#run admixture for a K of 1-10, using cross-validation, with 10 threadsfor K in 1 2 3 4 5 6 7 8 9 10;do/home/d669d153/work/admixture_linux-1.3.0/admixture--cv-j5-m EM binary_fileset.bed $K|tee log${K}.out;done#Which K iteration is optimal according to ADMIXTURE ?grep-h CV log*.out > log.errors.txt
0.8 Check out the subset results
#setwd to admixture directory run on the clustersetwd("~/Desktop/cali.zosterops.rad/admixture/ev.ni/")#read in log error values to determine optimal Klog<-read.table("log.errors.txt")[,c(3:4)]#use double backslash to interpret the opening parentheses literally in the regular expressionlog$V3<-gsub("\\(K=", "", log$V3)log$V3<-gsub("):", "", log$V3)#interpret K values as numericallog$V3<-as.numeric(log$V3)#rename columnscolnames(log)<-c("Kvalue","cross.validation.error")#make plot showing the cross validation error across K values 1:10ggplot(data=log, aes(x=Kvalue, y=cross.validation.error, group=1)) +geom_line(linetype ="dashed")+geom_point()+ylab("cross-validation error")+xlab("K")+scale_x_continuous(breaks =c(1:10))+theme_classic()
#read in input file in order to get list of input samples in ordersamps<-read.table("binary_fileset.fam")[,1]#read in all ten runs and save each dataframe in a listruns<-list()#read in log filesfor (i in1:10){ runs[[i]]<-read.table(paste0("binary_fileset.", i, ".Q"))}par(mfrow=c(1,1))#plot each runfor (i in1:5){barplot(t(as.matrix(runs[[i]])), col=rainbow(i), ylab="Ancestry", border="black")}
for (i in6:10){barplot(t(as.matrix(runs[[i]])), col=rainbow(i), ylab="Ancestry", border="black")}
#plot with appropriate color codebarplot(t(as.matrix(runs[[2]])), col=c("#117733","#DDCC77"), ylab="Ancestry", border="black")