load packages and read in data
#load packages
library(introgress)
## Loading required package: nnet
## Loading required package: genetics
## Loading required package: combinat
##
## Attaching package: 'combinat'
## The following object is masked from 'package:utils':
##
## combn
## Loading required package: gdata
## gdata: read.xls support for 'XLS' (Excel 97-2004) files ENABLED.
##
## gdata: read.xls support for 'XLSX' (Excel 2007+) files ENABLED.
##
## Attaching package: 'gdata'
## The following object is masked from 'package:stats':
##
## nobs
## The following object is masked from 'package:utils':
##
## object.size
## The following object is masked from 'package:base':
##
## startsWith
## Loading required package: gtools
## Loading required package: MASS
## Loading required package: mvtnorm
##
## NOTE: THIS PACKAGE IS NOW OBSOLETE.
##
## The R-Genetics project has developed an set of enhanced genetics
## packages to replace 'genetics'. Please visit the project homepage
## at http://rgenetics.org for informtion.
##
##
## Attaching package: 'genetics'
## The following objects are masked from 'package:base':
##
## %in%, as.factor, order
## Loading required package: RColorBrewer
library(vcfR)
##
## ***** *** vcfR *** *****
## This is vcfR 1.12.0
## browseVignettes('vcfR') # Documentation
## citation('vcfR') # Citation
## ***** ***** ***** *****
library(SNPfiltR)
## This is SNPfiltR v.1.0.0
##
## Detailed usage information is available at: devonderaad.github.io/SNPfiltR/
##
## If you use SNPfiltR in your published work, please cite the following papers:
##
## DeRaad, D.A. (2022), SNPfiltR: an R package for interactive and reproducible SNP filtering. Molecular Ecology Resources, 00, 1-15. http://doi.org/10.1111/1755-0998.13618
##
## Knaus, Brian J., and Niklaus J. Grunwald. 2017. VCFR: a package to manipulate and visualize variant call format data in R. Molecular Ecology Resources, 17.1:44-53. http://doi.org/10.1111/1755-0998.12549
library(RColorBrewer)
library(ggplot2)
library(StAMPP)
## Loading required package: pegas
## Loading required package: ape
## Registered S3 methods overwritten by 'pegas':
## method from
## print.amova ade4
## [.haplotype genetics
##
## Attaching package: 'pegas'
## The following object is masked from 'package:ape':
##
## mst
## The following objects are masked from 'package:vcfR':
##
## getINFO, write.vcf
## The following objects are masked from 'package:genetics':
##
## haplotype, LD
library(adegenet)
## Loading required package: ade4
##
## Attaching package: 'ade4'
## The following object is masked from 'package:pegas':
##
## amova
## The following object is masked from 'package:introgress':
##
## triangle.plot
##
## /// adegenet 2.1.5 is loaded ////////////
##
## > overview: '?adegenet'
## > tutorials/doc/questions: 'adegenetWeb()'
## > bug reports/feature requests: adegenetIssues()
##
## Attaching package: 'adegenet'
## The following object is masked from 'package:gtools':
##
## chr
#read in vcf as vcfR
vcfR <- read.vcfR("~/Desktop/marsh.wren.rad/filtered.vcf.gz")
## Scanning file to determine attributes.
## File attributes:
## meta lines: 14
## header_line: 15
## variant count: 9583
## column count: 146
##
Meta line 14 read in.
## All meta lines processed.
## gt matrix initialized.
## Character matrix gt created.
## Character matrix gt rows: 9583
## Character matrix gt cols: 146
## skip: 0
## nrows: 9583
## row_num: 0
##
Processed variant 1000
Processed variant 2000
Processed variant 3000
Processed variant 4000
Processed variant 5000
Processed variant 6000
Processed variant 7000
Processed variant 8000
Processed variant 9000
Processed variant: 9583
## All variants processed
#read in sample info with vocal info included
samps.voc<-read.csv("~/Desktop/marsh.wren.rad/samples.with.qvalues.plus.song.info.csv")
samps.voc$Tissue.num == colnames(vcfR@gt)[-1] #check that order matches
## [1] TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE
## [16] TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE
## [31] TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE
## [46] TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE
## [61] TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE
## [76] TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE
## [91] TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE
## [106] TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE
## [121] TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE
## [136] TRUE TRUE
perform introgress analysis
#introgress analysis
#create genotype matrix
mat<-extract.gt(vcfR)
#convert matrix to numeric values
conv.mat<-mat
conv.mat[conv.mat == "0/0"]<-0
conv.mat[conv.mat == "0/1"]<-1
conv.mat[conv.mat == "1/1"]<-2
conv.mat<-as.data.frame(conv.mat)
#convert class to numeric
for (i in 1:ncol(conv.mat)){
conv.mat[,i]<-as.numeric(as.character(conv.mat[,i]))
}
#check sample order
colnames(conv.mat)
## [1] "B00908" "B00910" "B00911" "B00912" "B00913" "B00914" "B00915" "B00916"
## [9] "B00927" "B00928" "B00929" "B00930" "B00931" "B00932" "B00934" "B00935"
## [17] "B00936" "B00937" "B00938" "B00939" "B00940" "B00941" "B00942" "B00943"
## [25] "B00944" "B00945" "B00946" "B00947" "B00948" "B00949" "B00950" "B00951"
## [33] "B00953" "B00954" "B00955" "B00956" "B00957" "B00958" "B00959" "B00960"
## [41] "B00961" "B00962" "B00963" "B00964" "B00965" "B00966" "B00967" "B02324"
## [49] "B02325" "B02326" "B02327" "B02328" "B02330" "B02332" "B02333" "B02334"
## [57] "B02336" "B02337" "B02338" "B02339" "B02340" "B02341" "B02342" "B02343"
## [65] "B02344" "B02345" "B02346" "B02347" "B02348" "B02350" "B02351" "B02352"
## [73] "B02353" "B02354" "B02355" "B02358" "B02360" "B02361" "B02363" "B02364"
## [81] "B02366" "B02367" "B02368" "B02371" "B02372" "B02373" "B02374" "B02375"
## [89] "B02376" "B02377" "B02378" "B02379" "B02380" "B02381" "B02382" "B02383"
## [97] "B02385" "B02386" "B02387" "B02388" "B02389" "B02390" "B02391" "B02392"
## [105] "B02393" "B02394" "B02395" "B02397" "B02438" "B02439" "B02440" "B02441"
## [113] "B02449" "B02456" "B02458" "B02459" "B02460" "B02461" "B02464" "B02478"
## [121] "B02546" "B02569" "B02578" "B02588" "B02594" "B02609" "B02620" "25372"
## [129] "25374" "25375" "25376" "25377" "25379" "25380" "25381" "25382"
## [137] "25383"
#calc AF for the samples you will use to call fixed differences
west.af<-(rowSums(conv.mat[,c(128:137)], na.rm=T)/(rowSums(is.na(conv.mat[,c(128:137)]) == FALSE)))/2
east.af<-(rowSums(conv.mat[,c(1:8,14)], na.rm=T)/(rowSums(is.na(conv.mat[,c(1:8,14)]) == FALSE)))/2
#find fixed SNPs
diff<-abs(west.af - east.af)
#how many SNPs are fixed
table(is.na(diff) == FALSE & diff == 1)
##
## FALSE TRUE
## 9324 259
#subsample original matrix to only fixed diff SNPs
gen.mat<-mat[is.na(diff) == FALSE & diff == 1,]
dim(gen.mat)
## [1] 259 137
#subsample matrix converted for AF calcs to only fixed SNPS
conv.mat<-conv.mat[is.na(diff) == FALSE & diff == 1,]
dim(conv.mat)
## [1] 259 137
#write a logical test to convert alleles so that a single number represents one parental ancestry
for (i in 1:nrow(gen.mat)){
#if 1 is the west allele (ie = 0 frequency in the eastern samples used for identifying informative SNPs)
if((sum(conv.mat[i,c(1:8,14)], na.rm=T)/(sum(is.na(conv.mat[i,c(1:8,14)]) == FALSE)))/2 == 0){
#swap all '0/0' cells with '2/2'
gen.mat[i,][gen.mat[i,] == "0/0"]<-"2/2"
#swap all '1/1' cells with '0/0'
gen.mat[i,][gen.mat[i,] == "1/1"]<-"0/0"
#finally convert all '2/2' cells (originally 0/0) into '1/1'
gen.mat[i,][gen.mat[i,] == "2/2"]<-"1/1"
#no need to touch hets
}
}
#subset to alleles with < 10 missing genotypes
#gen.mat<-gen.mat[rowSums(is.na(gen.mat)) < 5,]
#dim(gen.mat)
#convert R class NAs to the string "NA/NA"
gen.mat[is.na(gen.mat) == TRUE]<-"NA/NA"
#check out the df
head(gen.mat)[,c(1:10)]
## B00908 B00910 B00911 B00912 B00913 B00914 B00915 B00916 B00927
## 50_74 "1/1" "1/1" "1/1" "1/1" "1/1" "1/1" "1/1" "1/1" "0/1"
## 58_14 "1/1" "1/1" "1/1" "1/1" "1/1" "1/1" "1/1" "1/1" "NA/NA"
## 112_38 "1/1" "1/1" "1/1" "1/1" "1/1" "1/1" "NA/NA" "1/1" "1/1"
## 267_61 "NA/NA" "1/1" "1/1" "1/1" "1/1" "1/1" "1/1" "1/1" "NA/NA"
## 351_60 "NA/NA" "1/1" "1/1" "NA/NA" "1/1" "1/1" "1/1" "1/1" "1/1"
## 553_48 "1/1" "1/1" "1/1" "1/1" "1/1" "1/1" "1/1" "1/1" "0/1"
## B00928
## 50_74 "0/0"
## 58_14 "0/0"
## 112_38 "NA/NA"
## 267_61 "0/0"
## 351_60 "0/0"
## 553_48 "0/0"
#make locus info df
locus.info<-data.frame(locus=rownames(gen.mat),
type=rep("C", times=nrow(gen.mat)),
#lg=1:nrow(gen.mat),
#marker.pos=1:nrow(gen.mat))
lg=vcfR@fix[,1][is.na(diff) == FALSE & diff == 1],
marker.pos=vcfR@fix[,2][is.na(diff) == FALSE & diff == 1])
#we now have a gt matrix in proper format for introgress
#convert genotype data into a matrix of allele counts
count.matrix<-prepare.data(admix.gen=gen.mat, loci.data=locus.info,
parental1="1",parental2="0", pop.id=F,
ind.id=F, fixed=T)
## prepare.data is working; this may take a moment
## Processing data for 137 individuals and 259 loci.
#estimate hybrid index values
hi.index.sim<-est.h(introgress.data=count.matrix,loci.data=locus.info,
fixed=T, p1.allele="0", p2.allele="1")
## est.h is working; this may take a few minutes
#calculate mean heterozygosity using their function
het<-calc.intersp.het(introgress.data=count.matrix)
#faster hand-written code to calculate per sample heterozygosity
#het<-colSums(count.matrix$Count.matrix == 1, na.rm=TRUE)/colSums(!is.na(count.matrix$Count.matrix))
#make triangle plot
introgress::triangle.plot(hi.index=hi.index.sim, int.het=het, pdf = F)

#see how many recent hybrids with different cutoffs
table(het > .85)
##
## FALSE TRUE
## 134 3
table(het > .75)
##
## FALSE TRUE
## 130 7
table(het > .5)
##
## FALSE TRUE
## 124 13
#plot triangle with points colored by call
ca<-gsub("unclear", "WESTERN", samps.voc$Song.Type)
ca<-gsub("WESTERN", "blue", ca)
ca<-gsub("EASTERN", "red", ca)
ca<-gsub("MIXED", "green", ca)
#plot colored by song type
plot(x=hi.index.sim$h, y=het, bg=ca,
pch=21, cex=1.5,
xlab="Hybrid Index", ylab="Interspecific heterozygosity",
ylim=c(0,1))
segments(x0 =0, y0 =0, x1 =.5, y1 =1)
segments(x0 =1, y0 =0, x1 =.5, y1 =1)

#plot triangle with points colored by sampling locality
brewer.pal(n=6, "BrBG") #get your palette
## [1] "#8C510A" "#D8B365" "#F6E8C3" "#C7EAE5" "#5AB4AC" "#01665E"
ca<-gsub("LOSTWOODS", "#8C510A", samps.voc$Population)
ca<-gsub("NICOLLE FLATS", "#D8B365", ca)
ca<-gsub("EYEBROW LAKE", "#F6E8C3", ca)
ca<-gsub("CHAPLIN", "#C7EAE5", ca)
ca<-gsub("CRANE", "#5AB4AC", ca)
ca<-gsub("NEBRASKA", "#01665E", ca)
#plot colored by sampling locality
plot(x=hi.index.sim$h, y=het, bg=alpha(ca, .75),
pch=21, cex=1.2,
xlab="Hybrid Index", ylab="Interspecific heterozygosity",
ylim=c(0,1))
segments(x0 =0, y0 =0, x1 =.5, y1 =1)
segments(x0 =1, y0 =0, x1 =.5, y1 =1)

#add call info
va<-gsub("unclear", 22, samps.voc$Song.Type)
va<-gsub("WESTERN", 21, va)
va<-gsub("MIXED", 23, va)
va<-gsub("EASTERN", 24, va)
va<-as.numeric(va)
#plot colored by sampling locality
plot(x=hi.index.sim$h, y=het, bg=alpha(ca, .75),
pch=va, cex=1.8,
xlab="Hybrid Index", ylab="Interspecific heterozygosity",
ylim=c(0,1))
points(x =c(.5,.5,.25,.75), y=c(1,.5,.5,.5), pch=8)
segments(x0 =0, y0 =0, x1 =.5, y1 =1)
segments(x0 =1, y0 =0, x1 =.5, y1 =1)

#save plot
#pdf("~/Desktop/marsh.wren.rad/triangle.plot.pdf", width = 5.5, height=5)
##plot colored by sampling locality
#plot(x=hi.index.sim$h, y=het, bg=alpha(ca, .75),
# pch=va, cex=1.8,
# xlab="Hybrid Index", ylab="Interspecific heterozygosity",
# ylim=c(0,1))
#points(x =c(.5,.5,.25,.75), y=c(1,.5,.5,.5), pch=8)
#segments(x0 =0, y0 =0, x1 =.5, y1 =1)
#segments(x0 =1, y0 =0, x1 =.5, y1 =1)
#dev.off()
make genotype plot
#make genotype plot
mk.image(introgress.data=count.matrix, loci.data=locus.info,
hi.index=hi.index.sim, ylab.image="Individuals",
xlab.h="population 2 ancestry", pdf=F,
col.image=c(rgb(0,0,1,alpha=.5),rgb(0,0,0,alpha=.8),rgb(1,0,0,alpha=.5)))

#make genotype plot colored by lineage
mk.image(introgress.data=count.matrix, loci.data=locus.info,
hi.index=hi.index.sim, ylab.image="Individuals",
xlab.h="population 2 ancestry", pdf=F,
col.image=c("#01665E","black","#8C510A"))

#save plot
#pdf("~/Desktop/marsh.wren.rad/genotype.plot.pdf", width = 9, height = 3.5)
#mk.image(introgress.data=count.matrix, loci.data=locus.info,
# hi.index=hi.index.sim, ylab.image="Individuals",
# xlab.h="population 2 ancestry", pdf=F,
# col.image=c("#01665E","black","#8C510A"))
#dev.off()
make t-SNE plot using only these fixed differences
#get number of unique loci
length(unique(gsub("_.*", "", rownames(conv.mat))))
## [1] 207
#subset vcfR object to include only fixed differences
vcfR.sub<-vcfR[paste0(vcfR@fix[,1],"_",vcfR@fix[,2]) %in% rownames(conv.mat),]
#make tsne plot
fixed.pca<-assess_missing_data_tsne(vcfR.sub, popmap = data.frame(id=samps.voc$Tissue.num, pop=samps.voc$Population),
clustering = TRUE)



#flip PC1 so that west is on the left, east on the right
fixed.pca$tsne.ax1<-fixed.pca$tsne.ax1*-1
ggplot(fixed.pca, aes(x=tsne.ax1, y=tsne.ax2, col=popmap.pop))+
geom_point(cex=4, alpha=.8)+
scale_color_manual(values=brewer.pal(n=6, "BrBG")[c(4,5,3,1,6,2)])+
theme_classic()+
xlab("t-SNE axis 1")+
ylab("t-SNE axis 2")

#ggsave("~/Desktop/marsh.wren.rad/tsne.fixed.diffs.pdf", width =6, height=4)