library(ggplot2)
#introgress tutorial
library(introgress)
## Loading required package: nnet
## Loading required package: genetics
## Warning: package 'genetics' was built under R version 3.5.2
## Loading required package: combinat
##
## Attaching package: 'combinat'
## The following object is masked from 'package:utils':
##
## combn
## Loading required package: gdata
## gdata: read.xls support for 'XLS' (Excel 97-2004) files ENABLED.
##
## gdata: read.xls support for 'XLSX' (Excel 2007+) files ENABLED.
##
## Attaching package: 'gdata'
## The following object is masked from 'package:stats':
##
## nobs
## The following object is masked from 'package:utils':
##
## object.size
## The following object is masked from 'package:base':
##
## startsWith
## Loading required package: gtools
## Loading required package: MASS
## Loading required package: mvtnorm
## Warning: package 'mvtnorm' was built under R version 3.5.2
##
## NOTE: THIS PACKAGE IS NOW OBSOLETE.
##
## The R-Genetics project has developed an set of enhanced genetics
## packages to replace 'genetics'. Please visit the project homepage
## at http://rgenetics.org for informtion.
##
##
## Attaching package: 'genetics'
## The following objects are masked from 'package:base':
##
## %in%, as.factor, order
## Loading required package: RColorBrewer
#see if this works for California and Woodhouse's Scrub-Jays
library(vcfR)
##
## ***** *** vcfR *** *****
## This is vcfR 1.12.0
## browseVignettes('vcfR') # Documentation
## citation('vcfR') # Citation
## ***** ***** ***** *****
library(adegenet)
## Loading required package: ade4
## Warning: package 'ade4' was built under R version 3.5.2
##
## Attaching package: 'ade4'
## The following object is masked from 'package:introgress':
##
## triangle.plot
##
## /// adegenet 2.1.3 is loaded ////////////
##
## > overview: '?adegenet'
## > tutorials/doc/questions: 'adegenetWeb()'
## > bug reports/feature requests: adegenetIssues()
##
## Attaching package: 'adegenet'
## The following object is masked from 'package:gtools':
##
## chr
vcf<-read.vcfR("/Users/devder/Desktop/aph.data/unzipped.filtered.vcf")
## Scanning file to determine attributes.
## File attributes:
## meta lines: 14
## header_line: 15
## variant count: 16307
## column count: 104
##
Meta line 14 read in.
## All meta lines processed.
## gt matrix initialized.
## Character matrix gt created.
## Character matrix gt rows: 16307
## Character matrix gt cols: 104
## skip: 0
## nrows: 16307
## row_num: 0
##
Processed variant 1000
Processed variant 2000
Processed variant 3000
Processed variant 4000
Processed variant 5000
Processed variant 6000
Processed variant 7000
Processed variant 8000
Processed variant 9000
Processed variant 10000
Processed variant 11000
Processed variant 12000
Processed variant 13000
Processed variant 14000
Processed variant 15000
Processed variant 16000
Processed variant: 16307
## All variants processed
#read in locality info for samples
locs<-read.csv("~/Desktop/aph.data/rad.sampling.csv")
#subset locs to include only samples that passed filtering, and have been retained in the vcf
locs<-locs[locs$id %in% colnames(vcf@gt),]
rownames(locs)<-1:95
#remove sumi, island, florida, wood mex, and texas
locs<-locs[c(1:32,57:73),]
vcf@gt<-vcf@gt[,c(1:33,58:74)]
colnames(vcf@gt)[-1] == locs$id
## [1] TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE
## [16] TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE
## [31] TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE
## [46] TRUE TRUE TRUE TRUE
rownames(locs)<-1:nrow(locs)
dim(vcf)
## variants fix_cols gt_cols
## 16307 8 50
#remove SNPs with no data
source("~/Desktop/snpfiltR/min_mac.R")
vcf<-min_mac(vcf, min.mac = 1)

## [1] "35.74% of SNPs fell below a minor allele count of 1 and were removed from the VCF"
dim(vcf)
## variants fix_cols gt_cols
## 10479 8 50
vcfR.100<-vcf
gt <- extract.gt(vcf)
vcfR.100@gt <- vcf@gt[rowSums(is.na(gt)) == 0,]
vcfR.100@fix <- vcf@fix[rowSums(is.na(gt)) == 0,]
#convert to genlight
gen<-vcfR2genlight(vcf)
gen.100<-vcfR2genlight(vcfR.100)
#perform PCA
pca<-glPca(gen.100, nf=30)
var_frac <- pca$eig/sum(pca$eig)
#pull pca scores out of df
pca.scores<-as.data.frame(pca$scores)
pca.scores$species<-locs$species
#ggplot color by species
ggplot(pca.scores, aes(x=PC1, y=PC2, color=species)) +
geom_point(cex = 5, alpha=.5)+
theme_classic()

#identify the three samples each with the largest and smallest scores on PC1 to use to call fixed differences
pca.scores[pca.scores$PC1 > 2.8,]
## PC1 PC2 PC3 PC4 PC5
## A_woodhouseii_334133 3.037061 -0.5912709 -0.3366753 -1.2982493 -0.4105088
## A_woodhouseii_334142 2.918402 -0.4139282 -0.2897220 -0.2394965 -0.1007825
## A_woodhouseii_334188 2.867045 0.2770548 0.1651711 -0.1783781 -1.7928326
## PC6 PC7 PC8 PC9 PC10
## A_woodhouseii_334133 2.7113278 1.0677231 1.52651710 0.2022530 -0.5829984
## A_woodhouseii_334142 0.1840205 0.1122149 -0.01768832 -0.1171638 0.2990504
## A_woodhouseii_334188 -1.9130799 1.2166849 1.53710520 1.5344688 0.4165617
## PC11 PC12 PC13 PC14 PC15
## A_woodhouseii_334133 0.7557227 -0.09574684 0.50066615 0.2728824 0.1079958
## A_woodhouseii_334142 0.4241522 -0.36321660 0.02376014 0.2266165 -0.1635947
## A_woodhouseii_334188 0.1287661 0.19944368 -0.17519061 0.9435037 0.2801640
## PC16 PC17 PC18 PC19 PC20
## A_woodhouseii_334133 -0.1330119 0.09838332 0.3995260 -0.53089697 -0.5033858
## A_woodhouseii_334142 -0.3278944 0.48012713 -1.4268928 -0.55029156 0.2247248
## A_woodhouseii_334188 0.3770443 1.23535020 -0.7099712 -0.07484724 0.6756530
## PC21 PC22 PC23 PC24 PC25
## A_woodhouseii_334133 0.58298820 -0.07867022 0.01721067 -0.1267086 -0.24242027
## A_woodhouseii_334142 0.03837660 0.84604092 -0.35708292 0.3454019 -0.05066455
## A_woodhouseii_334188 -0.01992159 0.30579469 -0.38748638 -0.6021840 -0.08677654
## PC26 PC27 PC28 PC29 PC30
## A_woodhouseii_334133 -0.002967459 -0.6530334 -0.04263875 0.105289 0.4409701
## A_woodhouseii_334142 1.446289900 1.3385117 0.39446418 2.280723 0.3805915
## A_woodhouseii_334188 -0.271817151 0.8364311 -0.02676062 -0.582162 -1.3784496
## species
## A_woodhouseii_334133 woodhouseii
## A_woodhouseii_334142 woodhouseii
## A_woodhouseii_334188 woodhouseii
pca.scores[pca.scores$PC1 < -1.8,]
## PC1 PC2 PC3 PC4 PC5
## A_californica_333862 -1.804101 -0.5661435 1.1548452 0.2021166 -0.7515666
## A_californica_333914 -1.817297 -0.2487985 -0.3034541 -0.5917913 1.0302872
## A_californica_334015 -1.966439 -0.5860374 -1.6546370 0.8451443 -2.0216284
## PC6 PC7 PC8 PC9 PC10
## A_californica_333862 0.8835571 1.0657724 -0.9015312 0.009941559 0.7563753
## A_californica_333914 -0.6694047 0.4747831 0.1585496 0.665815432 -0.1567706
## A_californica_334015 0.6884735 -0.5459441 -1.6858110 0.627836125 -2.3821100
## PC11 PC12 PC13 PC14 PC15
## A_californica_333862 -0.9058575 0.02744106 -0.20679281 -0.9389570 0.35837375
## A_californica_333914 -0.2109665 0.82529406 0.27276817 -1.0938468 -0.05939096
## A_californica_334015 0.4859042 -1.40177485 0.01270681 0.5813366 1.83616207
## PC16 PC17 PC18 PC19 PC20
## A_californica_333862 0.07772191 0.63149980 -0.18508546 1.78749419 -0.01152692
## A_californica_333914 0.14860400 -0.08060549 0.05166560 0.04417143 0.24921548
## A_californica_334015 0.61725442 0.92940532 0.08783751 0.93314856 -0.07623473
## PC21 PC22 PC23 PC24 PC25
## A_californica_333862 0.2473056 0.01030362 -0.4619286 0.1482236 -1.5032410
## A_californica_333914 0.2001853 -0.17313988 -0.1757631 -0.2625740 -0.1701078
## A_californica_334015 0.5592592 0.83049168 0.4319268 0.8599101 0.4944268
## PC26 PC27 PC28 PC29 PC30
## A_californica_333862 1.4953388 -1.9385959 0.3864047 -0.8902584 -0.6003776
## A_californica_333914 1.4910720 0.2802200 -0.9885955 -0.7238619 -0.0646918
## A_californica_334015 0.8056459 0.4395896 -0.5026916 -0.2230700 0.5557611
## species
## A_californica_333862 californica
## A_californica_333914 californica
## A_californica_334015 californica
#create SNP matrices
mat<-extract.gt(vcf)
mat[1:5,1:5]
## A_californica_333849 A_californica_333854 A_californica_333855
## 271:52:+ "0/1" "0/0" NA
## 271:68:+ NA "0/0" "0/0"
## 274:22:- "0/0" "0/0" "0/0"
## 274:42:- "0/0" "0/1" "0/0"
## 1254:48:+ "0/0" "0/0" "0/0"
## A_californica_333857 A_californica_333860
## 271:52:+ "0/0" "0/0"
## 271:68:+ "0/0" "0/0"
## 274:22:- "0/0" "0/0"
## 274:42:- "0/0" "0/0"
## 1254:48:+ "0/0" "0/0"
conv.mat<-mat
conv.mat[conv.mat == "0/0"]<-0
conv.mat[conv.mat == "0/1"]<-1
conv.mat[conv.mat == "1/1"]<-2
conv.mat<-as.data.frame(conv.mat)
#convert to numeric
for (i in 1:ncol(conv.mat)){
conv.mat[,i]<-as.numeric(as.character(conv.mat[,i]))
}
#calc AF for the samples you will use to call fixed differences
cal.af<-(rowSums(conv.mat[,c(6,10,17)], na.rm=T)/(rowSums(is.na(conv.mat[,c(6,10,17)]) == FALSE)))/2
wood.af<-(rowSums(conv.mat[,c(39,41,44)], na.rm=T)/(rowSums(is.na(conv.mat[,c(39,41,44)]) == FALSE)))/2
#find fixed SNPs
diff<-abs(cal.af - wood.af)
#how many SNPs are fixed
table(is.na(diff) == FALSE & diff > .8)
##
## FALSE TRUE
## 10351 128
vcf@fix[,1][is.na(diff) == FALSE & diff > .8]
## [1] "Pseudochr1" "Pseudochr1" "Pseudochr1" "Pseudochr1" "Pseudochr10"
## [6] "Pseudochr10" "Pseudochr15" "Pseudochr18" "Pseudochr1A" "Pseudochr1A"
## [11] "Pseudochr1A" "Pseudochr1A" "Pseudochr1A" "Pseudochr1A" "Pseudochr1A"
## [16] "Pseudochr1A" "Pseudochr2" "Pseudochr2" "Pseudochr2" "Pseudochr2"
## [21] "Pseudochr2" "Pseudochr2" "Pseudochr2" "Pseudochr2" "Pseudochr2"
## [26] "Pseudochr2" "Pseudochr2" "Pseudochr2" "Pseudochr2" "Pseudochr2"
## [31] "Pseudochr2" "Pseudochr2" "Pseudochr2" "Pseudochr2" "Pseudochr2"
## [36] "Pseudochr2" "Pseudochr2" "Pseudochr2" "Pseudochr2" "Pseudochr2"
## [41] "Pseudochr2" "Pseudochr27" "Pseudochr3" "Pseudochr3" "Pseudochr3"
## [46] "Pseudochr3" "Pseudochr3" "Pseudochr3" "Pseudochr3" "Pseudochr3"
## [51] "Pseudochr3" "Pseudochr3" "Pseudochr3" "Pseudochr3" "Pseudochr3"
## [56] "Pseudochr3" "Pseudochr3" "Pseudochr3" "Pseudochr3" "Pseudochr3"
## [61] "Pseudochr3" "Pseudochr3" "Pseudochr4" "Pseudochr4" "Pseudochr4"
## [66] "Pseudochr4" "Pseudochr4" "Pseudochr4" "Pseudochr4" "Pseudochr4"
## [71] "Pseudochr4" "Pseudochr4A" "Pseudochr4A" "Pseudochr5" "Pseudochr5"
## [76] "Pseudochr5" "Pseudochr5" "Pseudochr5" "Pseudochr5" "Pseudochr5"
## [81] "Pseudochr5" "Pseudochr5" "Pseudochr5" "Pseudochr5" "Pseudochr5"
## [86] "Pseudochr5" "Pseudochr7" "Pseudochr9" "Pseudochr9" "PseudochrM"
## [91] "PseudochrM" "PseudochrM" "PseudochrZ" "PseudochrZ" "PseudochrZ"
## [96] "PseudochrZ" "PseudochrZ" "PseudochrZ" "PseudochrZ" "PseudochrZ"
## [101] "PseudochrZ" "PseudochrZ" "PseudochrZ" "PseudochrZ" "PseudochrZ"
## [106] "PseudochrZ" "PseudochrZ" "PseudochrZ" "PseudochrZ" "PseudochrZ"
## [111] "PseudochrZ" "PseudochrZ" "PseudochrZ" "PseudochrZ" "PseudochrZ"
## [116] "PseudochrZ" "PseudochrZ" "PseudochrZ" "PseudochrZ" "PseudochrZ"
## [121] "PseudochrZ" "PseudochrZ" "PseudochrZ" "PseudochrZ" "PseudochrZ"
## [126] "PseudochrZ" "PseudochrZ" "PseudochrZ"
#subsample original matrix to only fixed diff SNPs
gen.mat<-mat[is.na(diff) == FALSE & diff > .8,]
dim(gen.mat)
## [1] 128 49
#subsample matrix converted for AF calcs to only fixed SNPS
conv.mat<-conv.mat[is.na(diff) == FALSE & diff > .8,]
dim(conv.mat)
## [1] 128 49
#write a logical test to convert alleles so that a single number represents one parental ancestry
for (i in 1:nrow(gen.mat)){
#if 1 is the wood allele (ie < .2 frequency in the 3 cali samples used for identifying informative SNPs)
if((sum(conv.mat[i,c(6,10,17)], na.rm=T)/(sum(is.na(conv.mat[i,c(6,10,17)]) == FALSE)))/2 < .2){
#swap all '0/0' cells with '2/2'
gen.mat[i,][gen.mat[i,] == "0/0"]<-"2/2"
#swap all '1/1' cells with '0/0'
gen.mat[i,][gen.mat[i,] == "1/1"]<-"0/0"
#finally convert all '2/2' cells (originally 0/0) into '1/1'
gen.mat[i,][gen.mat[i,] == "2/2"]<-"1/1"
#no need to touch hets
}
}
#convert R class NAs to the string "NA/NA"
gen.mat[is.na(gen.mat) == TRUE]<-"NA/NA"
#make locus info df
locus.info<-data.frame(locus=rownames(gen.mat),
type=rep("C", times=nrow(gen.mat)),
lg=vcf@fix[,1][is.na(diff) == FALSE & diff > .8],
marker.pos=vcf@fix[,2][is.na(diff) == FALSE & diff > .8])
#make linkage group numeric
locus.info$lg<-gsub("Pseudochr", "", locus.info$lg)
locus.info$lg[locus.info$lg == "M"]<-31
locus.info$lg[locus.info$lg == "Z"]<-30
locus.info$lg[locus.info$lg == "1A"]<-1.5
locus.info$lg[locus.info$lg == "4A"]<-4.5
locus.info$lg<-as.numeric(locus.info$lg)
locus.info$marker.pos<-as.numeric(as.character(locus.info$marker.pos))
#make bpcum
nCHR <- length(unique(locus.info$lg))
locus.info$BPcum <- NA
s <- 0
nbp <- c()
for (i in sort(unique(locus.info$lg))){
nbp[i] <- max(locus.info[locus.info$lg == i,]$marker.pos)
locus.info[locus.info$lg == i,"BPcum"] <- locus.info[locus.info$lg == i,"marker.pos"] + s
s <- s + nbp[i]
}
#we now have a gt matrix in proper format for introgress
#convert genotype data into a matrix of allele counts
count.matrix<-prepare.data(admix.gen=gen.mat, loci.data=locus.info,
parental1="1",parental2="0", pop.id=F,
ind.id=F, fixed=T)
## prepare.data is working; this may take a moment
## Processing data for 49 individuals and 128 loci.
#estimate hybrid index values
hi.index.sim<-est.h(introgress.data=count.matrix,loci.data=locus.info,
fixed=T, p1.allele="1", p2.allele="0")
## est.h is working; this may take a few minutes
locus.info$locus<-rep("", times=nrow(locus.info))
#LociDataSim1$lg<-c(1:110)
mk.image(introgress.data=count.matrix, loci.data=locus.info,
marker.order=order(locus.info$BPcum),hi.index=hi.index.sim, ylab.image="Individuals",
xlab.h="population 2 ancestry", pdf=F,
col.image=c(rgb(1,0,0,alpha=.5),rgb(0,0,0,alpha=.8),rgb(0,0,1,alpha=.5)))

#calculate mean heterozygosity across these 110 fixed markers for each sample
#using their function
het<-calc.intersp.het(introgress.data=count.matrix)
#dev.off()
#make triangle plot
introgress::triangle.plot(hi.index=hi.index.sim, int.het=het, pdf = F)

#save plots
#pdf(file = "~/Desktop/aph.data/introgress/pca.triangel.pdf", # The directory you want to save the file in
# width = 8.5, # The width of the plot in inches
# height = 4) # The height of the plot in inches
par(mfrow=c(1,2))
#plot pca
plot(x=pca.scores$PC1, y=pca.scores$PC2, bg=c(rep(rgb(1,0,0,alpha=.5), times=19),rgb(0,0,1,alpha=.5),
rep(rgb(1,0,0,alpha=.5), times=12), rep(rgb(0,0,1,alpha=.5), times=17)),
pch=21, cex=1.5,
xlab="PC1, 11.7% variance explained", ylab="PC2, 3.7% variance explained")
#plot triangle
plot(x=hi.index.sim$h, y=het, bg=c(rep(rgb(1,0,0,alpha=.5), times=19),rgb(0,0,1,alpha=.5),
rep(rgb(1,0,0,alpha=.5), times=12), rep(rgb(0,0,1,alpha=.5), times=17)),
pch=21, cex=1.5,
xlab="Hybrid Index", ylab="Interspecific heterozygosity",
ylim=c(0,1))
segments(x0 =0, y0 =0, x1 =.5, y1 =1)
segments(x0 =1, y0 =0, x1 =.5, y1 =1)

#dev.off()
#pdf(file = "~/Desktop/aph.data/introgress/genotype.plot.pdf", # The directory you want to save the file in
# width = 8.5, # The width of the plot in inches
# height = 5) # The height of the plot in inches
#make genotype plot
mk.image(introgress.data=count.matrix, loci.data=locus.info,
hi.index=hi.index.sim, ylab.image="Individuals", marker.order = order(locus.info$BPcum),
xlab.h="population 2 ancestry", pdf=F,
col.image=c(rgb(0,0,1,alpha=.5),rgb(0,0,0,alpha=.8),rgb(1,0,0,alpha=.5)))

#dev.off()