introgress

load packages and read in data

#load packages
library(introgress)

## Loading required package: nnet

## Loading required package: genetics

## Loading required package: combinat

## 
## Attaching package: 'combinat'

## The following object is masked from 'package:utils':
## 
##     combn

## Loading required package: gdata

## gdata: read.xls support for 'XLS' (Excel 97-2004) files ENABLED.

##

## gdata: read.xls support for 'XLSX' (Excel 2007+) files ENABLED.

## 
## Attaching package: 'gdata'

## The following object is masked from 'package:stats':
## 
##     nobs

## The following object is masked from 'package:utils':
## 
##     object.size

## The following object is masked from 'package:base':
## 
##     startsWith

## Loading required package: gtools

## Loading required package: MASS

## Loading required package: mvtnorm

##

## NOTE: THIS PACKAGE IS NOW OBSOLETE.

##

##   The R-Genetics project has developed an set of enhanced genetics

##   packages to replace 'genetics'. Please visit the project homepage

##   at http://rgenetics.org for informtion.

##

## 
## Attaching package: 'genetics'

## The following objects are masked from 'package:base':
## 
##     %in%, as.factor, order

## Loading required package: RColorBrewer

library(vcfR)

## 
##    *****       ***   vcfR   ***       *****
##    This is vcfR 1.12.0 
##      browseVignettes('vcfR') # Documentation
##      citation('vcfR') # Citation
##    *****       *****      *****       *****

library(SNPfiltR)

## This is SNPfiltR v.1.0.0
## 
## Detailed usage information is available at: devonderaad.github.io/SNPfiltR/ 
## 
## If you use SNPfiltR in your published work, please cite the following papers: 
## 
## DeRaad, D.A. (2022), SNPfiltR: an R package for interactive and reproducible SNP filtering. Molecular Ecology Resources, 00, 1-15. http://doi.org/10.1111/1755-0998.13618 
## 
## Knaus, Brian J., and Niklaus J. Grunwald. 2017. VCFR: a package to manipulate and visualize variant call format data in R. Molecular Ecology Resources, 17.1:44-53. http://doi.org/10.1111/1755-0998.12549

library(RColorBrewer)
library(ggplot2)
library(StAMPP)

## Loading required package: pegas

## Loading required package: ape

## Registered S3 methods overwritten by 'pegas':
##   method      from    
##   print.amova ade4    
##   [.haplotype genetics

## 
## Attaching package: 'pegas'

## The following object is masked from 'package:ape':
## 
##     mst

## The following objects are masked from 'package:vcfR':
## 
##     getINFO, write.vcf

## The following objects are masked from 'package:genetics':
## 
##     haplotype, LD

library(adegenet)

## Loading required package: ade4

## 
## Attaching package: 'ade4'

## The following object is masked from 'package:pegas':
## 
##     amova

## The following object is masked from 'package:introgress':
## 
##     triangle.plot

## 
##    /// adegenet 2.1.5 is loaded ////////////
## 
##    > overview: '?adegenet'
##    > tutorials/doc/questions: 'adegenetWeb()' 
##    > bug reports/feature requests: adegenetIssues()

## 
## Attaching package: 'adegenet'

## The following object is masked from 'package:gtools':
## 
##     chr

#read in vcf as vcfR
vcfR <- read.vcfR("~/Desktop/marsh.wren.rad/filtered.vcf.gz")

## Scanning file to determine attributes.
## File attributes:
##   meta lines: 14
##   header_line: 15
##   variant count: 9583
##   column count: 146
## 
Meta line 14 read in.
## All meta lines processed.
## gt matrix initialized.
## Character matrix gt created.
##   Character matrix gt rows: 9583
##   Character matrix gt cols: 146
##   skip: 0
##   nrows: 9583
##   row_num: 0
## 
Processed variant 1000
Processed variant 2000
Processed variant 3000
Processed variant 4000
Processed variant 5000
Processed variant 6000
Processed variant 7000
Processed variant 8000
Processed variant 9000
Processed variant: 9583
## All variants processed

#read in sample info with vocal info included
samps.voc<-read.csv("~/Desktop/marsh.wren.rad/samples.with.qvalues.plus.song.info.csv")
samps.voc$Tissue.num == colnames(vcfR@gt)[-1] #check that order matches

##   [1] TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE
##  [16] TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE
##  [31] TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE
##  [46] TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE
##  [61] TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE
##  [76] TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE
##  [91] TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE
## [106] TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE
## [121] TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE
## [136] TRUE TRUE

perform introgress analysis

#introgress analysis
#create genotype matrix
mat<-extract.gt(vcfR)
#convert matrix to numeric values
conv.mat<-mat
conv.mat[conv.mat == "0/0"]<-0
conv.mat[conv.mat == "0/1"]<-1
conv.mat[conv.mat == "1/1"]<-2
conv.mat<-as.data.frame(conv.mat)
#convert class to numeric
for (i in 1:ncol(conv.mat)){
  conv.mat[,i]<-as.numeric(as.character(conv.mat[,i]))
}
#check sample order
colnames(conv.mat)

##   [1] "B00908" "B00910" "B00911" "B00912" "B00913" "B00914" "B00915" "B00916"
##   [9] "B00927" "B00928" "B00929" "B00930" "B00931" "B00932" "B00934" "B00935"
##  [17] "B00936" "B00937" "B00938" "B00939" "B00940" "B00941" "B00942" "B00943"
##  [25] "B00944" "B00945" "B00946" "B00947" "B00948" "B00949" "B00950" "B00951"
##  [33] "B00953" "B00954" "B00955" "B00956" "B00957" "B00958" "B00959" "B00960"
##  [41] "B00961" "B00962" "B00963" "B00964" "B00965" "B00966" "B00967" "B02324"
##  [49] "B02325" "B02326" "B02327" "B02328" "B02330" "B02332" "B02333" "B02334"
##  [57] "B02336" "B02337" "B02338" "B02339" "B02340" "B02341" "B02342" "B02343"
##  [65] "B02344" "B02345" "B02346" "B02347" "B02348" "B02350" "B02351" "B02352"
##  [73] "B02353" "B02354" "B02355" "B02358" "B02360" "B02361" "B02363" "B02364"
##  [81] "B02366" "B02367" "B02368" "B02371" "B02372" "B02373" "B02374" "B02375"
##  [89] "B02376" "B02377" "B02378" "B02379" "B02380" "B02381" "B02382" "B02383"
##  [97] "B02385" "B02386" "B02387" "B02388" "B02389" "B02390" "B02391" "B02392"
## [105] "B02393" "B02394" "B02395" "B02397" "B02438" "B02439" "B02440" "B02441"
## [113] "B02449" "B02456" "B02458" "B02459" "B02460" "B02461" "B02464" "B02478"
## [121] "B02546" "B02569" "B02578" "B02588" "B02594" "B02609" "B02620" "25372" 
## [129] "25374"  "25375"  "25376"  "25377"  "25379"  "25380"  "25381"  "25382" 
## [137] "25383"

#calc AF for the samples you will use to call fixed differences
west.af<-(rowSums(conv.mat[,c(128:137)], na.rm=T)/(rowSums(is.na(conv.mat[,c(128:137)]) == FALSE)))/2
east.af<-(rowSums(conv.mat[,c(1:8,14)], na.rm=T)/(rowSums(is.na(conv.mat[,c(1:8,14)]) == FALSE)))/2

#find fixed SNPs
diff<-abs(west.af - east.af)
#how many SNPs are fixed
table(is.na(diff) == FALSE & diff == 1)

## 
## FALSE  TRUE 
##  9324   259

#subsample original matrix to only fixed diff SNPs
gen.mat<-mat[is.na(diff) == FALSE & diff == 1,]
dim(gen.mat)

## [1] 259 137

#subsample matrix converted for AF calcs to only fixed SNPS
conv.mat<-conv.mat[is.na(diff) == FALSE & diff == 1,]
dim(conv.mat)

## [1] 259 137

#write a logical test to convert alleles so that a single number represents one parental ancestry
for (i in 1:nrow(gen.mat)){
  #if 1 is the west allele (ie = 0 frequency in the eastern samples used for identifying informative SNPs)
  if((sum(conv.mat[i,c(1:8,14)], na.rm=T)/(sum(is.na(conv.mat[i,c(1:8,14)]) == FALSE)))/2 == 0){
    #swap all '0/0' cells with '2/2'
    gen.mat[i,][gen.mat[i,] == "0/0"]<-"2/2"
    #swap all '1/1' cells with '0/0'
    gen.mat[i,][gen.mat[i,] == "1/1"]<-"0/0"
    #finally convert all '2/2' cells (originally 0/0) into '1/1'
    gen.mat[i,][gen.mat[i,] == "2/2"]<-"1/1"
    #no need to touch hets
  }
}

#subset to alleles with < 10 missing genotypes
#gen.mat<-gen.mat[rowSums(is.na(gen.mat)) < 5,]
#dim(gen.mat)

#convert R class NAs to the string "NA/NA"
gen.mat[is.na(gen.mat) == TRUE]<-"NA/NA"

#check out the df
head(gen.mat)[,c(1:10)]

##        B00908  B00910 B00911 B00912  B00913 B00914 B00915  B00916 B00927 
## 50_74  "1/1"   "1/1"  "1/1"  "1/1"   "1/1"  "1/1"  "1/1"   "1/1"  "0/1"  
## 58_14  "1/1"   "1/1"  "1/1"  "1/1"   "1/1"  "1/1"  "1/1"   "1/1"  "NA/NA"
## 112_38 "1/1"   "1/1"  "1/1"  "1/1"   "1/1"  "1/1"  "NA/NA" "1/1"  "1/1"  
## 267_61 "NA/NA" "1/1"  "1/1"  "1/1"   "1/1"  "1/1"  "1/1"   "1/1"  "NA/NA"
## 351_60 "NA/NA" "1/1"  "1/1"  "NA/NA" "1/1"  "1/1"  "1/1"   "1/1"  "1/1"  
## 553_48 "1/1"   "1/1"  "1/1"  "1/1"   "1/1"  "1/1"  "1/1"   "1/1"  "0/1"  
##        B00928 
## 50_74  "0/0"  
## 58_14  "0/0"  
## 112_38 "NA/NA"
## 267_61 "0/0"  
## 351_60 "0/0"  
## 553_48 "0/0"

#make locus info df
locus.info<-data.frame(locus=rownames(gen.mat),
                       type=rep("C", times=nrow(gen.mat)),
                       #lg=1:nrow(gen.mat),
                       #marker.pos=1:nrow(gen.mat))
                       lg=vcfR@fix[,1][is.na(diff) == FALSE & diff == 1],
                       marker.pos=vcfR@fix[,2][is.na(diff) == FALSE & diff == 1])

#we now have a gt matrix in proper format for introgress
#convert genotype data into a matrix of allele counts
count.matrix<-prepare.data(admix.gen=gen.mat, loci.data=locus.info,
                           parental1="1",parental2="0", pop.id=F,
                           ind.id=F, fixed=T)

## prepare.data is working; this may take a moment
## Processing data for 137 individuals and 259 loci.

#estimate hybrid index values
hi.index.sim<-est.h(introgress.data=count.matrix,loci.data=locus.info,
                    fixed=T, p1.allele="0", p2.allele="1")

## est.h is working; this may take a few minutes

#calculate mean heterozygosity using their function
het<-calc.intersp.het(introgress.data=count.matrix)
#faster hand-written code to calculate per sample heterozygosity
#het<-colSums(count.matrix$Count.matrix == 1, na.rm=TRUE)/colSums(!is.na(count.matrix$Count.matrix))
#make triangle plot
introgress::triangle.plot(hi.index=hi.index.sim, int.het=het, pdf = F)

#see how many recent hybrids with different cutoffs
table(het > .85)

## 
## FALSE  TRUE 
##   134     3

table(het > .75)

## 
## FALSE  TRUE 
##   130     7

table(het > .5)

## 
## FALSE  TRUE 
##   124    13

#plot triangle with points colored by call
ca<-gsub("unclear", "WESTERN", samps.voc$Song.Type)
ca<-gsub("WESTERN", "blue", ca)
ca<-gsub("EASTERN", "red", ca)
ca<-gsub("MIXED", "green", ca)

#plot colored by song type
plot(x=hi.index.sim$h, y=het, bg=ca,
     pch=21, cex=1.5,
     xlab="Hybrid Index", ylab="Interspecific heterozygosity",
     ylim=c(0,1))
segments(x0 =0, y0 =0, x1 =.5, y1 =1)
segments(x0 =1, y0 =0, x1 =.5, y1 =1)

#plot triangle with points colored by sampling locality
brewer.pal(n=6, "BrBG") #get your palette

## [1] "#8C510A" "#D8B365" "#F6E8C3" "#C7EAE5" "#5AB4AC" "#01665E"

ca<-gsub("LOSTWOODS", "#8C510A", samps.voc$Population)
ca<-gsub("NICOLLE FLATS", "#D8B365", ca)
ca<-gsub("EYEBROW LAKE", "#F6E8C3", ca)
ca<-gsub("CHAPLIN", "#C7EAE5", ca)
ca<-gsub("CRANE", "#5AB4AC", ca)
ca<-gsub("NEBRASKA", "#01665E", ca)

#plot colored by sampling locality
plot(x=hi.index.sim$h, y=het, bg=alpha(ca, .75),
     pch=21, cex=1.2,
     xlab="Hybrid Index", ylab="Interspecific heterozygosity",
     ylim=c(0,1))
segments(x0 =0, y0 =0, x1 =.5, y1 =1)
segments(x0 =1, y0 =0, x1 =.5, y1 =1)

#add call info
va<-gsub("unclear", 22, samps.voc$Song.Type)
va<-gsub("WESTERN", 21, va)
va<-gsub("MIXED", 23, va)
va<-gsub("EASTERN", 24, va)
va<-as.numeric(va)
#plot colored by sampling locality
plot(x=hi.index.sim$h, y=het, bg=alpha(ca, .75),
     pch=va, cex=1.8,
     xlab="Hybrid Index", ylab="Interspecific heterozygosity",
     ylim=c(0,1))
points(x =c(.5,.5,.25,.75), y=c(1,.5,.5,.5), pch=8)
segments(x0 =0, y0 =0, x1 =.5, y1 =1)
segments(x0 =1, y0 =0, x1 =.5, y1 =1)

#save plot
#pdf("~/Desktop/marsh.wren.rad/triangle.plot.pdf", width = 5.5, height=5)
##plot colored by sampling locality
#plot(x=hi.index.sim$h, y=het, bg=alpha(ca, .75),
#     pch=va, cex=1.8,
#     xlab="Hybrid Index", ylab="Interspecific heterozygosity",
#     ylim=c(0,1))
#points(x =c(.5,.5,.25,.75), y=c(1,.5,.5,.5), pch=8)
#segments(x0 =0, y0 =0, x1 =.5, y1 =1)
#segments(x0 =1, y0 =0, x1 =.5, y1 =1)
#dev.off()

make genotype plot

#make genotype plot
mk.image(introgress.data=count.matrix, loci.data=locus.info,
         hi.index=hi.index.sim, ylab.image="Individuals",
         xlab.h="population 2 ancestry", pdf=F,
         col.image=c(rgb(0,0,1,alpha=.5),rgb(0,0,0,alpha=.8),rgb(1,0,0,alpha=.5)))

#make genotype plot colored by lineage
mk.image(introgress.data=count.matrix, loci.data=locus.info,
         hi.index=hi.index.sim, ylab.image="Individuals",
         xlab.h="population 2 ancestry", pdf=F,
         col.image=c("#01665E","black","#8C510A"))

#save plot
#pdf("~/Desktop/marsh.wren.rad/genotype.plot.pdf", width = 9, height = 3.5)
#mk.image(introgress.data=count.matrix, loci.data=locus.info,
#         hi.index=hi.index.sim, ylab.image="Individuals",
#         xlab.h="population 2 ancestry", pdf=F,
#         col.image=c("#01665E","black","#8C510A"))
#dev.off()

make t-SNE plot using only these fixed differences

#get number of unique loci
length(unique(gsub("_.*", "", rownames(conv.mat))))

## [1] 207

#subset vcfR object to include only fixed differences
vcfR.sub<-vcfR[paste0(vcfR@fix[,1],"_",vcfR@fix[,2]) %in% rownames(conv.mat),]
#make tsne plot
fixed.pca<-assess_missing_data_tsne(vcfR.sub, popmap = data.frame(id=samps.voc$Tissue.num, pop=samps.voc$Population),
                                       clustering = TRUE)

#flip PC1 so that west is on the left, east on the right
fixed.pca$tsne.ax1<-fixed.pca$tsne.ax1*-1

ggplot(fixed.pca, aes(x=tsne.ax1, y=tsne.ax2, col=popmap.pop))+
  geom_point(cex=4, alpha=.8)+
  scale_color_manual(values=brewer.pal(n=6, "BrBG")[c(4,5,3,1,6,2)])+
  theme_classic()+
  xlab("t-SNE axis 1")+
  ylab("t-SNE axis 2")

#ggsave("~/Desktop/marsh.wren.rad/tsne.fixed.diffs.pdf", width =6, height=4)

introgress

Devon DeRaad

2022-10-05

load packages and read in data

perform introgress analysis

make genotype plot

make t-SNE plot using only these fixed differences