read in data
#load packages
library(vcfR)
##
## ***** *** vcfR *** *****
## This is vcfR 1.12.0
## browseVignettes('vcfR') # Documentation
## citation('vcfR') # Citation
## ***** ***** ***** *****
library(SNPfiltR)
## This is SNPfiltR v.1.0.0
##
## Detailed usage information is available at: devonderaad.github.io/SNPfiltR/
##
## If you use SNPfiltR in your published work, please cite the following papers:
##
## DeRaad, D.A. (2022), SNPfiltR: an R package for interactive and reproducible SNP filtering. Molecular Ecology Resources, 00, 1-15. http://doi.org/10.1111/1755-0998.13618
##
## Knaus, Brian J., and Niklaus J. Grunwald. 2017. VCFR: a package to manipulate and visualize variant call format data in R. Molecular Ecology Resources, 17.1:44-53. http://doi.org/10.1111/1755-0998.12549
library(RColorBrewer)
library(ggplot2)
library(StAMPP)
## Loading required package: pegas
## Loading required package: ape
## Registered S3 method overwritten by 'pegas':
## method from
## print.amova ade4
##
## Attaching package: 'pegas'
## The following object is masked from 'package:ape':
##
## mst
## The following objects are masked from 'package:vcfR':
##
## getINFO, write.vcf
library(adegenet)
## Loading required package: ade4
##
## Attaching package: 'ade4'
## The following object is masked from 'package:pegas':
##
## amova
##
## /// adegenet 2.1.5 is loaded ////////////
##
## > overview: '?adegenet'
## > tutorials/doc/questions: 'adegenetWeb()'
## > bug reports/feature requests: adegenetIssues()
#read in vcf as vcfR
vcfR <- read.vcfR("~/Desktop/marsh.wren.rad/filtered.vcf.gz")
#read in sample info
samps<-read.csv("~/Desktop/marsh.wren.rad/samples.with.qvalues.csv")
#retain samples that were sequenced successfully
samps<-samps[samps$Tissue.num %in% colnames(vcfR@gt),]
#reorder sampling file to match order of samples in vcf
samps<-samps[match(colnames(vcfR@gt)[-1], samps$Tissue.num),]
samps$Tissue.num == colnames(vcfR@gt)[-1] #check that order matches
#read in sample info with vocal info included
samps.voc<-read.csv("~/Desktop/marsh.wren.rad/samples.with.qvalues.plus.song.info.csv")
##retain samples that were sequenced successfully
#samps.voc<-samps.voc[samps.voc$Tissue.num %in% colnames(vcfR@gt),]
##reorder sampling file to match order of samples in vcf
#samps.voc<-samps.voc[match(colnames(vcfR@gt)[-1], samps.voc$Tissue.num),]
#samps.voc$Tissue.num == colnames(vcfR@gt)[-1] #check that order matches
##check vocal assignments from the field for the 137 samples that made it through filtering
table(samps.voc$Song.Type)
hist(samps.voc$perc.east.buzz)

#add q value and write out this dataset
#samps.voc$western.qvalue<-samps$western.qvalue
#write to disk
#write.csv(x = samps.voc, quote=FALSE, row.names = FALSE, file="~/Desktop/marsh.wren.rad/samples.with.qvalues.plus.song.info.csv")
investigate ancestry/song association
begin by creating sample subsets
#subset vcf
vcfR<-vcfR[,c(1:128,134:138)]
#subset dataframe to only samples with field identified song type
samps.voc<-samps.voc[samps.voc$Song.Type != "unclear",]
#remove invariant sites
vcfR<-min_mac(vcfR, min.mac = 1)
## 0.93% of SNPs fell below a minor allele count of 1 and were removed from the VCF

#make a PCA colored by song type
song.type.pca<-assess_missing_data_pca(vcfR,
popmap = data.frame(id=samps.voc$Tissue.num, pop=samps.voc$Song.Type),
clustering = FALSE, thresholds=c(0,.95))
## cutoff is specified, filtered vcfR object will be returned
## 0% of SNPs fell below a completeness cutoff of 0 and were removed from the VCF


## cutoff is specified, filtered vcfR object will be returned

## 65.79% of SNPs fell below a completeness cutoff of 0.95 and were removed from the VCF



#missing data is not affecting the PCA inference, and it looks like song learning is potentially bidirectional (western songs > introgressing directionally into eastern birds but not vice versa)
Make PCAs for publication
song.type.pca<-assess_missing_data_pca(vcfR,
popmap = data.frame(id=samps.voc$Tissue.num, pop=samps.voc$Song.Type),
clustering = FALSE)


#flip PC1 so that west is on the left, east on the right
song.type.pca$PC1<-song.type.pca$PC1*-1
ggplot(song.type.pca, aes(x=PC1, y=PC2, col=pop))+
geom_point(cex=4, alpha=.6)+
scale_color_manual(values=c('#8C510A','gray', '#01665E'))+
theme_classic()+
xlab("PC1, 42.55% variance explained")+
ylab("PC2, 0.86% variance explained")

#ggsave("~/Desktop/marsh.wren.rad/song.pca.pdf", width = 6, height=4)
#repeat with only the 90 samples with quantitative data
#subset vcfR
sub.vcfR<-vcfR[,c(TRUE,!is.na(samps.voc$perc.east.buzz))]
sub.vcfR
## ***** Object of Class vcfR *****
## 90 samples
## 3080 CHROMs
## 9,494 variants
## Object size: 68.8 Mb
## 7.801 percent missing data
## ***** ***** *****
sub.vcfR<-min_mac(sub.vcfR, min.mac = 1)
## 12.08% of SNPs fell below a minor allele count of 1 and were removed from the VCF

sub.vcfR
## ***** Object of Class vcfR *****
## 90 samples
## 2949 CHROMs
## 8,347 variants
## Object size: 60.9 Mb
## 7.771 percent missing data
## ***** ***** *****
#make PCA
song.type.pca<-assess_missing_data_pca(sub.vcfR,
popmap = data.frame(id=samps.voc$Tissue.num[!is.na(samps.voc$perc.east.buzz)], pop=samps.voc$Song.Type[!is.na(samps.voc$perc.east.buzz)]),
clustering = FALSE)


#add percent east buzz to dataframe
song.type.pca$perceastbuzz<-samps.voc$perc.east.buzz[!is.na(samps.voc$perc.east.buzz)]
#flip PC1 so that west is on the left, east on the right
song.type.pca$PC1<-song.type.pca$PC1*-1
ggplot(song.type.pca, aes(x=PC1, y=PC2, col=perceastbuzz))+
geom_point(cex=4, alpha=.6)+
scale_colour_gradient(low = "#01665E",high = "#8C510A")+
theme_classic()+
xlab("PC1, 45.1% variance explained")+
ylab("PC2, 1.02% variance explained")

#ggsave("~/Desktop/marsh.wren.rad/quant.song.pca.pdf", width = 6, height=4)
make histograms showing the distribution of % songs beginning with
an eastern buzz note and q value
#histograms for all 85 samples with quantified song type
#get eastern q value
eastern.qvalue<-1-samps.voc$western.qvalue
#hist eastern q value for samples with quantified song
hist(eastern.qvalue[!is.na(samps.voc$perc.east.buzz)])

#hist buzz %
hist(samps.voc$perc.east.buzz)

#set breakpoints
ax<-seq(0,1, by= .05)
#plot overlaid histograms colored by parental pop
plot(hist(samps.voc$perc.east.buzz, breaks=ax, plot=FALSE), col=alpha("red", .2))
plot(hist(eastern.qvalue[!is.na(samps.voc$perc.east.buzz)], breaks=ax, plot=FALSE), col=alpha("blue", .2), add=TRUE)

#do it with ggplot
df<-data.frame(characteristic=c(rep("perc.east.buzz", times=nrow(samps.voc.sub)), rep("eastern.qvalue", times=nrow(samps.voc.sub))),
value=c(samps.voc.sub$perc.east.buzz,samps.voc.sub$east.qvalue))
plot_multi_histogram <- function(df, feature, label_column) {
plt <- ggplot(df, aes(x=eval(parse(text=feature)), fill=eval(parse(text=label_column)))) +
geom_histogram(alpha=0.6, position="identity", color="black") +
#geom_density(alpha=0.2) +
#geom_vline(aes(xintercept=mean(eval(parse(text=feature)))), color="black", linetype="dashed", size=1) +
labs(x="percentage", y = "number of individuals")+
theme_classic()
plt + guides(fill=guide_legend(title=label_column))
}
plot_multi_histogram(df, 'value', 'characteristic')
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.

#ggsave("~/Desktop/marsh.wren.rad/song.ancestry.hist.pdf", width=6.75, height=4)
### subset to only sites 3-5
perceastbuzz<-samps.voc$perc.east.buzz[samps.voc$Population == "EYEBROW LAKE" | samps.voc$Population == "NICOLLE FLATS" | samps.voc$Population == "CHAPLIN"]
east.sub<-eastern.qvalue[samps.voc$Population == "EYEBROW LAKE" | samps.voc$Population == "NICOLLE FLATS" | samps.voc$Population == "CHAPLIN"]
#how many total samples retianed from hybrid zone core?
length(east.sub[!is.na(perceastbuzz)])
## [1] 62
#plot overlaid histograms colored by parental pop
plot(hist(east.sub[!is.na(perceastbuzz)], breaks=ax, plot=FALSE), col=alpha("blue", .2))
plot(hist(perceastbuzz/100, breaks=ax, plot=FALSE), col=alpha("red", .2), add=TRUE)
