Run introgress on California and Woodhouse's Scrub-Jays

library(ggplot2)
#introgress tutorial
library(introgress)

## Loading required package: nnet

## Loading required package: genetics

## Warning: package 'genetics' was built under R version 3.5.2

## Loading required package: combinat

## 
## Attaching package: 'combinat'

## The following object is masked from 'package:utils':
## 
##     combn

## Loading required package: gdata

## gdata: read.xls support for 'XLS' (Excel 97-2004) files ENABLED.

##

## gdata: read.xls support for 'XLSX' (Excel 2007+) files ENABLED.

## 
## Attaching package: 'gdata'

## The following object is masked from 'package:stats':
## 
##     nobs

## The following object is masked from 'package:utils':
## 
##     object.size

## The following object is masked from 'package:base':
## 
##     startsWith

## Loading required package: gtools

## Loading required package: MASS

## Loading required package: mvtnorm

## Warning: package 'mvtnorm' was built under R version 3.5.2

##

## NOTE: THIS PACKAGE IS NOW OBSOLETE.

##

##   The R-Genetics project has developed an set of enhanced genetics

##   packages to replace 'genetics'. Please visit the project homepage

##   at http://rgenetics.org for informtion.

##

## 
## Attaching package: 'genetics'

## The following objects are masked from 'package:base':
## 
##     %in%, as.factor, order

## Loading required package: RColorBrewer

#see if this works for California and Woodhouse's Scrub-Jays
library(vcfR)

## 
##    *****       ***   vcfR   ***       *****
##    This is vcfR 1.12.0 
##      browseVignettes('vcfR') # Documentation
##      citation('vcfR') # Citation
##    *****       *****      *****       *****

library(adegenet)

## Loading required package: ade4

## Warning: package 'ade4' was built under R version 3.5.2

## 
## Attaching package: 'ade4'

## The following object is masked from 'package:introgress':
## 
##     triangle.plot

## 
##    /// adegenet 2.1.3 is loaded ////////////
## 
##    > overview: '?adegenet'
##    > tutorials/doc/questions: 'adegenetWeb()' 
##    > bug reports/feature requests: adegenetIssues()

## 
## Attaching package: 'adegenet'

## The following object is masked from 'package:gtools':
## 
##     chr

vcf<-read.vcfR("/Users/devder/Desktop/aph.data/unzipped.filtered.vcf")

## Scanning file to determine attributes.
## File attributes:
##   meta lines: 14
##   header_line: 15
##   variant count: 16307
##   column count: 104
## 
Meta line 14 read in.
## All meta lines processed.
## gt matrix initialized.
## Character matrix gt created.
##   Character matrix gt rows: 16307
##   Character matrix gt cols: 104
##   skip: 0
##   nrows: 16307
##   row_num: 0
## 
Processed variant 1000
Processed variant 2000
Processed variant 3000
Processed variant 4000
Processed variant 5000
Processed variant 6000
Processed variant 7000
Processed variant 8000
Processed variant 9000
Processed variant 10000
Processed variant 11000
Processed variant 12000
Processed variant 13000
Processed variant 14000
Processed variant 15000
Processed variant 16000
Processed variant: 16307
## All variants processed

#read in locality info for samples
locs<-read.csv("~/Desktop/aph.data/rad.sampling.csv")
#subset locs to include only samples that passed filtering, and have been retained in the vcf
locs<-locs[locs$id %in% colnames(vcf@gt),]
rownames(locs)<-1:95

#remove sumi, island, florida, wood mex, and texas
locs<-locs[c(1:32,57:73),]
vcf@gt<-vcf@gt[,c(1:33,58:74)]
colnames(vcf@gt)[-1] == locs$id

##  [1] TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE
## [16] TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE
## [31] TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE
## [46] TRUE TRUE TRUE TRUE

rownames(locs)<-1:nrow(locs)
dim(vcf)

## variants fix_cols  gt_cols 
##    16307        8       50

#remove SNPs with no data
source("~/Desktop/snpfiltR/min_mac.R")
vcf<-min_mac(vcf, min.mac = 1)

## [1] "35.74% of SNPs fell below a minor allele count of 1 and were removed from the VCF"

dim(vcf)

## variants fix_cols  gt_cols 
##    10479        8       50

vcfR.100<-vcf
gt <- extract.gt(vcf)
vcfR.100@gt <- vcf@gt[rowSums(is.na(gt)) == 0,]
vcfR.100@fix <- vcf@fix[rowSums(is.na(gt)) == 0,]

#convert to genlight
gen<-vcfR2genlight(vcf)
gen.100<-vcfR2genlight(vcfR.100)
#perform PCA
pca<-glPca(gen.100, nf=30)
var_frac <- pca$eig/sum(pca$eig)

#pull pca scores out of df
pca.scores<-as.data.frame(pca$scores)
pca.scores$species<-locs$species

#ggplot color by species
ggplot(pca.scores, aes(x=PC1, y=PC2, color=species)) +
  geom_point(cex = 5, alpha=.5)+
  theme_classic()

#identify the three samples each with the largest and smallest scores on PC1 to use to call fixed differences
pca.scores[pca.scores$PC1 > 2.8,]

##                           PC1        PC2        PC3        PC4        PC5
## A_woodhouseii_334133 3.037061 -0.5912709 -0.3366753 -1.2982493 -0.4105088
## A_woodhouseii_334142 2.918402 -0.4139282 -0.2897220 -0.2394965 -0.1007825
## A_woodhouseii_334188 2.867045  0.2770548  0.1651711 -0.1783781 -1.7928326
##                             PC6       PC7         PC8        PC9       PC10
## A_woodhouseii_334133  2.7113278 1.0677231  1.52651710  0.2022530 -0.5829984
## A_woodhouseii_334142  0.1840205 0.1122149 -0.01768832 -0.1171638  0.2990504
## A_woodhouseii_334188 -1.9130799 1.2166849  1.53710520  1.5344688  0.4165617
##                           PC11        PC12        PC13      PC14       PC15
## A_woodhouseii_334133 0.7557227 -0.09574684  0.50066615 0.2728824  0.1079958
## A_woodhouseii_334142 0.4241522 -0.36321660  0.02376014 0.2266165 -0.1635947
## A_woodhouseii_334188 0.1287661  0.19944368 -0.17519061 0.9435037  0.2801640
##                            PC16       PC17       PC18        PC19       PC20
## A_woodhouseii_334133 -0.1330119 0.09838332  0.3995260 -0.53089697 -0.5033858
## A_woodhouseii_334142 -0.3278944 0.48012713 -1.4268928 -0.55029156  0.2247248
## A_woodhouseii_334188  0.3770443 1.23535020 -0.7099712 -0.07484724  0.6756530
##                             PC21        PC22        PC23       PC24        PC25
## A_woodhouseii_334133  0.58298820 -0.07867022  0.01721067 -0.1267086 -0.24242027
## A_woodhouseii_334142  0.03837660  0.84604092 -0.35708292  0.3454019 -0.05066455
## A_woodhouseii_334188 -0.01992159  0.30579469 -0.38748638 -0.6021840 -0.08677654
##                              PC26       PC27        PC28      PC29       PC30
## A_woodhouseii_334133 -0.002967459 -0.6530334 -0.04263875  0.105289  0.4409701
## A_woodhouseii_334142  1.446289900  1.3385117  0.39446418  2.280723  0.3805915
## A_woodhouseii_334188 -0.271817151  0.8364311 -0.02676062 -0.582162 -1.3784496
##                          species
## A_woodhouseii_334133 woodhouseii
## A_woodhouseii_334142 woodhouseii
## A_woodhouseii_334188 woodhouseii

pca.scores[pca.scores$PC1 < -1.8,]

##                            PC1        PC2        PC3        PC4        PC5
## A_californica_333862 -1.804101 -0.5661435  1.1548452  0.2021166 -0.7515666
## A_californica_333914 -1.817297 -0.2487985 -0.3034541 -0.5917913  1.0302872
## A_californica_334015 -1.966439 -0.5860374 -1.6546370  0.8451443 -2.0216284
##                             PC6        PC7        PC8         PC9       PC10
## A_californica_333862  0.8835571  1.0657724 -0.9015312 0.009941559  0.7563753
## A_californica_333914 -0.6694047  0.4747831  0.1585496 0.665815432 -0.1567706
## A_californica_334015  0.6884735 -0.5459441 -1.6858110 0.627836125 -2.3821100
##                            PC11        PC12        PC13       PC14        PC15
## A_californica_333862 -0.9058575  0.02744106 -0.20679281 -0.9389570  0.35837375
## A_californica_333914 -0.2109665  0.82529406  0.27276817 -1.0938468 -0.05939096
## A_californica_334015  0.4859042 -1.40177485  0.01270681  0.5813366  1.83616207
##                            PC16        PC17        PC18       PC19        PC20
## A_californica_333862 0.07772191  0.63149980 -0.18508546 1.78749419 -0.01152692
## A_californica_333914 0.14860400 -0.08060549  0.05166560 0.04417143  0.24921548
## A_californica_334015 0.61725442  0.92940532  0.08783751 0.93314856 -0.07623473
##                           PC21        PC22       PC23       PC24       PC25
## A_californica_333862 0.2473056  0.01030362 -0.4619286  0.1482236 -1.5032410
## A_californica_333914 0.2001853 -0.17313988 -0.1757631 -0.2625740 -0.1701078
## A_californica_334015 0.5592592  0.83049168  0.4319268  0.8599101  0.4944268
##                           PC26       PC27       PC28       PC29       PC30
## A_californica_333862 1.4953388 -1.9385959  0.3864047 -0.8902584 -0.6003776
## A_californica_333914 1.4910720  0.2802200 -0.9885955 -0.7238619 -0.0646918
## A_californica_334015 0.8056459  0.4395896 -0.5026916 -0.2230700  0.5557611
##                          species
## A_californica_333862 californica
## A_californica_333914 californica
## A_californica_334015 californica

#create SNP matrices
mat<-extract.gt(vcf)
mat[1:5,1:5]

##           A_californica_333849 A_californica_333854 A_californica_333855
## 271:52:+  "0/1"                "0/0"                NA                  
## 271:68:+  NA                   "0/0"                "0/0"               
## 274:22:-  "0/0"                "0/0"                "0/0"               
## 274:42:-  "0/0"                "0/1"                "0/0"               
## 1254:48:+ "0/0"                "0/0"                "0/0"               
##           A_californica_333857 A_californica_333860
## 271:52:+  "0/0"                "0/0"               
## 271:68:+  "0/0"                "0/0"               
## 274:22:-  "0/0"                "0/0"               
## 274:42:-  "0/0"                "0/0"               
## 1254:48:+ "0/0"                "0/0"

conv.mat<-mat
conv.mat[conv.mat == "0/0"]<-0
conv.mat[conv.mat == "0/1"]<-1
conv.mat[conv.mat == "1/1"]<-2
conv.mat<-as.data.frame(conv.mat)
#convert to numeric
for (i in 1:ncol(conv.mat)){
  conv.mat[,i]<-as.numeric(as.character(conv.mat[,i]))
}

#calc AF for the samples you will use to call fixed differences
cal.af<-(rowSums(conv.mat[,c(6,10,17)], na.rm=T)/(rowSums(is.na(conv.mat[,c(6,10,17)]) == FALSE)))/2
wood.af<-(rowSums(conv.mat[,c(39,41,44)], na.rm=T)/(rowSums(is.na(conv.mat[,c(39,41,44)]) == FALSE)))/2

#find fixed SNPs
diff<-abs(cal.af - wood.af)
#how many SNPs are fixed
table(is.na(diff) == FALSE & diff > .8)

## 
## FALSE  TRUE 
## 10351   128

vcf@fix[,1][is.na(diff) == FALSE & diff > .8]

##   [1] "Pseudochr1"  "Pseudochr1"  "Pseudochr1"  "Pseudochr1"  "Pseudochr10"
##   [6] "Pseudochr10" "Pseudochr15" "Pseudochr18" "Pseudochr1A" "Pseudochr1A"
##  [11] "Pseudochr1A" "Pseudochr1A" "Pseudochr1A" "Pseudochr1A" "Pseudochr1A"
##  [16] "Pseudochr1A" "Pseudochr2"  "Pseudochr2"  "Pseudochr2"  "Pseudochr2" 
##  [21] "Pseudochr2"  "Pseudochr2"  "Pseudochr2"  "Pseudochr2"  "Pseudochr2" 
##  [26] "Pseudochr2"  "Pseudochr2"  "Pseudochr2"  "Pseudochr2"  "Pseudochr2" 
##  [31] "Pseudochr2"  "Pseudochr2"  "Pseudochr2"  "Pseudochr2"  "Pseudochr2" 
##  [36] "Pseudochr2"  "Pseudochr2"  "Pseudochr2"  "Pseudochr2"  "Pseudochr2" 
##  [41] "Pseudochr2"  "Pseudochr27" "Pseudochr3"  "Pseudochr3"  "Pseudochr3" 
##  [46] "Pseudochr3"  "Pseudochr3"  "Pseudochr3"  "Pseudochr3"  "Pseudochr3" 
##  [51] "Pseudochr3"  "Pseudochr3"  "Pseudochr3"  "Pseudochr3"  "Pseudochr3" 
##  [56] "Pseudochr3"  "Pseudochr3"  "Pseudochr3"  "Pseudochr3"  "Pseudochr3" 
##  [61] "Pseudochr3"  "Pseudochr3"  "Pseudochr4"  "Pseudochr4"  "Pseudochr4" 
##  [66] "Pseudochr4"  "Pseudochr4"  "Pseudochr4"  "Pseudochr4"  "Pseudochr4" 
##  [71] "Pseudochr4"  "Pseudochr4A" "Pseudochr4A" "Pseudochr5"  "Pseudochr5" 
##  [76] "Pseudochr5"  "Pseudochr5"  "Pseudochr5"  "Pseudochr5"  "Pseudochr5" 
##  [81] "Pseudochr5"  "Pseudochr5"  "Pseudochr5"  "Pseudochr5"  "Pseudochr5" 
##  [86] "Pseudochr5"  "Pseudochr7"  "Pseudochr9"  "Pseudochr9"  "PseudochrM" 
##  [91] "PseudochrM"  "PseudochrM"  "PseudochrZ"  "PseudochrZ"  "PseudochrZ" 
##  [96] "PseudochrZ"  "PseudochrZ"  "PseudochrZ"  "PseudochrZ"  "PseudochrZ" 
## [101] "PseudochrZ"  "PseudochrZ"  "PseudochrZ"  "PseudochrZ"  "PseudochrZ" 
## [106] "PseudochrZ"  "PseudochrZ"  "PseudochrZ"  "PseudochrZ"  "PseudochrZ" 
## [111] "PseudochrZ"  "PseudochrZ"  "PseudochrZ"  "PseudochrZ"  "PseudochrZ" 
## [116] "PseudochrZ"  "PseudochrZ"  "PseudochrZ"  "PseudochrZ"  "PseudochrZ" 
## [121] "PseudochrZ"  "PseudochrZ"  "PseudochrZ"  "PseudochrZ"  "PseudochrZ" 
## [126] "PseudochrZ"  "PseudochrZ"  "PseudochrZ"

#subsample original matrix to only fixed diff SNPs
gen.mat<-mat[is.na(diff) == FALSE & diff > .8,]
dim(gen.mat)

## [1] 128  49

#subsample matrix converted for AF calcs to only fixed SNPS
conv.mat<-conv.mat[is.na(diff) == FALSE & diff > .8,]
dim(conv.mat)

## [1] 128  49

#write a logical test to convert alleles so that a single number represents one parental ancestry
for (i in 1:nrow(gen.mat)){
  #if 1 is the wood allele (ie < .2 frequency in the 3 cali samples used for identifying informative SNPs)
  if((sum(conv.mat[i,c(6,10,17)], na.rm=T)/(sum(is.na(conv.mat[i,c(6,10,17)]) == FALSE)))/2 < .2){
    #swap all '0/0' cells with '2/2'
    gen.mat[i,][gen.mat[i,] == "0/0"]<-"2/2"
    #swap all '1/1' cells with '0/0'
    gen.mat[i,][gen.mat[i,] == "1/1"]<-"0/0"
    #finally convert all '2/2' cells (originally 0/0) into '1/1'
    gen.mat[i,][gen.mat[i,] == "2/2"]<-"1/1"
    #no need to touch hets
  }
}

#convert R class NAs to the string "NA/NA"
gen.mat[is.na(gen.mat) == TRUE]<-"NA/NA"

#make locus info df
locus.info<-data.frame(locus=rownames(gen.mat),
                       type=rep("C", times=nrow(gen.mat)),
                       lg=vcf@fix[,1][is.na(diff) == FALSE & diff > .8],
                       marker.pos=vcf@fix[,2][is.na(diff) == FALSE & diff > .8])
#make linkage group numeric
locus.info$lg<-gsub("Pseudochr", "", locus.info$lg)
locus.info$lg[locus.info$lg == "M"]<-31
locus.info$lg[locus.info$lg == "Z"]<-30
locus.info$lg[locus.info$lg == "1A"]<-1.5
locus.info$lg[locus.info$lg == "4A"]<-4.5
locus.info$lg<-as.numeric(locus.info$lg)
locus.info$marker.pos<-as.numeric(as.character(locus.info$marker.pos))

#make bpcum
nCHR <- length(unique(locus.info$lg))
locus.info$BPcum <- NA
s <- 0
nbp <- c()
for (i in sort(unique(locus.info$lg))){
  nbp[i] <- max(locus.info[locus.info$lg == i,]$marker.pos)
  locus.info[locus.info$lg == i,"BPcum"] <- locus.info[locus.info$lg == i,"marker.pos"] + s
  s <- s + nbp[i]
}

#we now have a gt matrix in proper format for introgress
#convert genotype data into a matrix of allele counts
count.matrix<-prepare.data(admix.gen=gen.mat, loci.data=locus.info,
                           parental1="1",parental2="0", pop.id=F,
                           ind.id=F, fixed=T)

## prepare.data is working; this may take a moment
## Processing data for 49 individuals and 128 loci.

#estimate hybrid index values
hi.index.sim<-est.h(introgress.data=count.matrix,loci.data=locus.info,
                    fixed=T, p1.allele="1", p2.allele="0")

## est.h is working; this may take a few minutes

locus.info$locus<-rep("", times=nrow(locus.info))
#LociDataSim1$lg<-c(1:110)
mk.image(introgress.data=count.matrix, loci.data=locus.info,
         marker.order=order(locus.info$BPcum),hi.index=hi.index.sim, ylab.image="Individuals",
         xlab.h="population 2 ancestry", pdf=F,
         col.image=c(rgb(1,0,0,alpha=.5),rgb(0,0,0,alpha=.8),rgb(0,0,1,alpha=.5)))

#calculate mean heterozygosity across these 110 fixed markers for each sample
#using their function
het<-calc.intersp.het(introgress.data=count.matrix)
#dev.off()
#make triangle plot
introgress::triangle.plot(hi.index=hi.index.sim, int.het=het, pdf = F)

#save plots
#pdf(file = "~/Desktop/aph.data/introgress/pca.triangel.pdf",   # The directory you want to save the file in
#    width = 8.5, # The width of the plot in inches
#    height = 4) # The height of the plot in inches

par(mfrow=c(1,2))
#plot pca
plot(x=pca.scores$PC1, y=pca.scores$PC2, bg=c(rep(rgb(1,0,0,alpha=.5), times=19),rgb(0,0,1,alpha=.5),
                                              rep(rgb(1,0,0,alpha=.5), times=12), rep(rgb(0,0,1,alpha=.5), times=17)),
     pch=21, cex=1.5,
     xlab="PC1, 11.7% variance explained", ylab="PC2, 3.7% variance explained")

#plot triangle
plot(x=hi.index.sim$h, y=het, bg=c(rep(rgb(1,0,0,alpha=.5), times=19),rgb(0,0,1,alpha=.5),
                                   rep(rgb(1,0,0,alpha=.5), times=12), rep(rgb(0,0,1,alpha=.5), times=17)),
     pch=21, cex=1.5,
     xlab="Hybrid Index", ylab="Interspecific heterozygosity",
     ylim=c(0,1))
segments(x0 =0, y0 =0, x1 =.5, y1 =1)
segments(x0 =1, y0 =0, x1 =.5, y1 =1)

#dev.off()

#pdf(file = "~/Desktop/aph.data/introgress/genotype.plot.pdf",   # The directory you want to save the file in
#    width = 8.5, # The width of the plot in inches
#    height = 5) # The height of the plot in inches

#make genotype plot
mk.image(introgress.data=count.matrix, loci.data=locus.info,
         hi.index=hi.index.sim, ylab.image="Individuals", marker.order = order(locus.info$BPcum),
         xlab.h="population 2 ancestry", pdf=F,
         col.image=c(rgb(0,0,1,alpha=.5),rgb(0,0,0,alpha=.8),rgb(1,0,0,alpha=.5)))

#dev.off()

Run introgress on California and Woodhouse's Scrub-Jays

Devon DeRaad

6/20/2021