read in data

#load packages
library(vcfR)
## 
##    *****       ***   vcfR   ***       *****
##    This is vcfR 1.12.0 
##      browseVignettes('vcfR') # Documentation
##      citation('vcfR') # Citation
##    *****       *****      *****       *****
library(SNPfiltR)
## This is SNPfiltR v.1.0.0
## 
## Detailed usage information is available at: devonderaad.github.io/SNPfiltR/ 
## 
## If you use SNPfiltR in your published work, please cite the following papers: 
## 
## DeRaad, D.A. (2022), SNPfiltR: an R package for interactive and reproducible SNP filtering. Molecular Ecology Resources, 00, 1-15. http://doi.org/10.1111/1755-0998.13618 
## 
## Knaus, Brian J., and Niklaus J. Grunwald. 2017. VCFR: a package to manipulate and visualize variant call format data in R. Molecular Ecology Resources, 17.1:44-53. http://doi.org/10.1111/1755-0998.12549
library(RColorBrewer)
library(ggplot2)
library(StAMPP)
## Loading required package: pegas
## Loading required package: ape
## Registered S3 method overwritten by 'pegas':
##   method      from
##   print.amova ade4
## 
## Attaching package: 'pegas'
## The following object is masked from 'package:ape':
## 
##     mst
## The following objects are masked from 'package:vcfR':
## 
##     getINFO, write.vcf
library(adegenet)
## Loading required package: ade4
## 
## Attaching package: 'ade4'
## The following object is masked from 'package:pegas':
## 
##     amova
## 
##    /// adegenet 2.1.5 is loaded ////////////
## 
##    > overview: '?adegenet'
##    > tutorials/doc/questions: 'adegenetWeb()' 
##    > bug reports/feature requests: adegenetIssues()
#read in vcf as vcfR
vcfR <- read.vcfR("~/Desktop/marsh.wren.rad/filtered.vcf.gz")

#read in sample info
samps<-read.csv("~/Desktop/marsh.wren.rad/samples.with.qvalues.csv")
#retain samples that were sequenced successfully
samps<-samps[samps$Tissue.num %in% colnames(vcfR@gt),]
#reorder sampling file to match order of samples in vcf
samps<-samps[match(colnames(vcfR@gt)[-1], samps$Tissue.num),]
samps$Tissue.num == colnames(vcfR@gt)[-1] #check that order matches

#read in sample info with vocal info included
samps.voc<-read.csv("~/Desktop/marsh.wren.rad/samples.with.qvalues.plus.song.info.csv")
##retain samples that were sequenced successfully
#samps.voc<-samps.voc[samps.voc$Tissue.num %in% colnames(vcfR@gt),]
##reorder sampling file to match order of samples in vcf
#samps.voc<-samps.voc[match(colnames(vcfR@gt)[-1], samps.voc$Tissue.num),]
#samps.voc$Tissue.num == colnames(vcfR@gt)[-1] #check that order matches
##check vocal assignments from the field for the 137 samples that made it through filtering
table(samps.voc$Song.Type)
hist(samps.voc$perc.east.buzz)

#add q value and write out this dataset
#samps.voc$western.qvalue<-samps$western.qvalue
#write to disk
#write.csv(x = samps.voc, quote=FALSE, row.names = FALSE, file="~/Desktop/marsh.wren.rad/samples.with.qvalues.plus.song.info.csv")

investigate ancestry/song association

begin by creating sample subsets

#subset vcf
vcfR<-vcfR[,c(1:128,134:138)]
#subset dataframe to only samples with field identified song type
samps.voc<-samps.voc[samps.voc$Song.Type != "unclear",]
#remove invariant sites
vcfR<-min_mac(vcfR, min.mac = 1)
## 0.93% of SNPs fell below a minor allele count of 1 and were removed from the VCF

#make a PCA colored by song type
song.type.pca<-assess_missing_data_pca(vcfR,
                                       popmap = data.frame(id=samps.voc$Tissue.num, pop=samps.voc$Song.Type),
                                       clustering = FALSE, thresholds=c(0,.95))
## cutoff is specified, filtered vcfR object will be returned
## 0% of SNPs fell below a completeness cutoff of 0 and were removed from the VCF

## cutoff is specified, filtered vcfR object will be returned

## 65.79% of SNPs fell below a completeness cutoff of 0.95 and were removed from the VCF

#missing data is not affecting the PCA inference, and it looks like song learning is potentially bidirectional (western songs > introgressing directionally into eastern birds but not vice versa)

Make PCAs for publication

song.type.pca<-assess_missing_data_pca(vcfR,
                                       popmap = data.frame(id=samps.voc$Tissue.num, pop=samps.voc$Song.Type),
                                       clustering = FALSE)

#flip PC1 so that west is on the left, east on the right
song.type.pca$PC1<-song.type.pca$PC1*-1

ggplot(song.type.pca, aes(x=PC1, y=PC2, col=pop))+
  geom_point(cex=4, alpha=.6)+
  scale_color_manual(values=c('#8C510A','gray', '#01665E'))+
  theme_classic()+
  xlab("PC1, 42.55% variance explained")+
  ylab("PC2, 0.86% variance explained")

#ggsave("~/Desktop/marsh.wren.rad/song.pca.pdf", width = 6, height=4)

#repeat with only the 90 samples with quantitative data
#subset vcfR
sub.vcfR<-vcfR[,c(TRUE,!is.na(samps.voc$perc.east.buzz))]
sub.vcfR
## ***** Object of Class vcfR *****
## 90 samples
## 3080 CHROMs
## 9,494 variants
## Object size: 68.8 Mb
## 7.801 percent missing data
## *****        *****         *****
sub.vcfR<-min_mac(sub.vcfR, min.mac = 1)
## 12.08% of SNPs fell below a minor allele count of 1 and were removed from the VCF

sub.vcfR
## ***** Object of Class vcfR *****
## 90 samples
## 2949 CHROMs
## 8,347 variants
## Object size: 60.9 Mb
## 7.771 percent missing data
## *****        *****         *****
#make PCA
song.type.pca<-assess_missing_data_pca(sub.vcfR,
                                       popmap = data.frame(id=samps.voc$Tissue.num[!is.na(samps.voc$perc.east.buzz)], pop=samps.voc$Song.Type[!is.na(samps.voc$perc.east.buzz)]),
                                       clustering = FALSE)

#add percent east buzz to dataframe
song.type.pca$perceastbuzz<-samps.voc$perc.east.buzz[!is.na(samps.voc$perc.east.buzz)]

#flip PC1 so that west is on the left, east on the right
song.type.pca$PC1<-song.type.pca$PC1*-1
ggplot(song.type.pca, aes(x=PC1, y=PC2, col=perceastbuzz))+
  geom_point(cex=4, alpha=.6)+
  scale_colour_gradient(low = "#01665E",high = "#8C510A")+
  theme_classic()+
  xlab("PC1, 45.1% variance explained")+
  ylab("PC2, 1.02% variance explained")

#ggsave("~/Desktop/marsh.wren.rad/quant.song.pca.pdf", width = 6, height=4)

perform regressions to test association

#subset dataframe
samps.voc.sub<-samps.voc[!is.na(samps.voc$perc.east.buzz),]
samps.voc.sub$east.qvalue<-1-samps.voc.sub$western.qvalue

#fit linear model
model <- lm(perc.east.buzz ~ east.qvalue, data = samps.voc.sub)

#assess fit
summary(model)
## 
## Call:
## lm(formula = perc.east.buzz ~ east.qvalue, data = samps.voc.sub)
## 
## Residuals:
##      Min       1Q   Median       3Q      Max 
## -0.86980 -0.01759  0.04803  0.13020  0.53217 
## 
## Coefficients:
##             Estimate Std. Error t value Pr(>|t|)    
## (Intercept) -0.04804    0.03499  -1.373    0.173    
## east.qvalue  0.91785    0.04946  18.556   <2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 0.2181 on 88 degrees of freedom
## Multiple R-squared:  0.7964, Adjusted R-squared:  0.7941 
## F-statistic: 344.3 on 1 and 88 DF,  p-value: < 2.2e-16
#plot
ggplot(samps.voc.sub, aes(x=east.qvalue, y=perc.east.buzz))+
  geom_point(cex=4, alpha=.6)+
  stat_smooth(method="lm")+
  theme_classic()
## `geom_smooth()` using formula 'y ~ x'

#ggsave("~/Desktop/marsh.wren.rad/ancestry.song.scatter.pdf", width = 5.2, height=4)

#does the pattern hold if you use only samples from the center of the contact zone?
eb<-samps.voc.sub$perc.east.buzz[samps.voc.sub$Population == "EYEBROW LAKE" | samps.voc.sub$Population == "NICOLLE FLATS" | samps.voc.sub$Population == "CHAPLIN"]
ea<-samps.voc.sub$east.qvalue[samps.voc.sub$Population == "EYEBROW LAKE" | samps.voc.sub$Population == "NICOLLE FLATS" | samps.voc.sub$Population == "CHAPLIN"]

#make df for plotting
df<-data.frame(eb=eb,ea=ea)

#plot
ggplot(df, aes(x=ea, y=eb))+
  geom_point(cex=4, alpha=.6)+
  stat_smooth(method="lm")+
  theme_classic()
## `geom_smooth()` using formula 'y ~ x'

#same pattern in center of contact zone

make histograms showing the distribution of % songs beginning with an eastern buzz note and q value

#histograms for all 85 samples with quantified song type
#get eastern q value
eastern.qvalue<-1-samps.voc$western.qvalue
#hist eastern q value for samples with quantified song
hist(eastern.qvalue[!is.na(samps.voc$perc.east.buzz)])

#hist buzz %
hist(samps.voc$perc.east.buzz)

#set breakpoints
ax<-seq(0,1, by= .05)

#plot overlaid histograms colored by parental pop
plot(hist(samps.voc$perc.east.buzz, breaks=ax, plot=FALSE), col=alpha("red", .2))
plot(hist(eastern.qvalue[!is.na(samps.voc$perc.east.buzz)], breaks=ax, plot=FALSE), col=alpha("blue", .2), add=TRUE)

#do it with ggplot
df<-data.frame(characteristic=c(rep("perc.east.buzz", times=nrow(samps.voc.sub)), rep("eastern.qvalue", times=nrow(samps.voc.sub))),
               value=c(samps.voc.sub$perc.east.buzz,samps.voc.sub$east.qvalue))

plot_multi_histogram <- function(df, feature, label_column) {
    plt <- ggplot(df, aes(x=eval(parse(text=feature)), fill=eval(parse(text=label_column)))) +
    geom_histogram(alpha=0.6, position="identity", color="black") +
    #geom_density(alpha=0.2) +
    #geom_vline(aes(xintercept=mean(eval(parse(text=feature)))), color="black", linetype="dashed", size=1) +
    labs(x="percentage", y = "number of individuals")+
    theme_classic()
    plt + guides(fill=guide_legend(title=label_column))
}
plot_multi_histogram(df, 'value', 'characteristic')
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.

#ggsave("~/Desktop/marsh.wren.rad/song.ancestry.hist.pdf", width=6.75, height=4)

### subset to only sites 3-5
perceastbuzz<-samps.voc$perc.east.buzz[samps.voc$Population == "EYEBROW LAKE" | samps.voc$Population == "NICOLLE FLATS" | samps.voc$Population == "CHAPLIN"]
east.sub<-eastern.qvalue[samps.voc$Population == "EYEBROW LAKE" | samps.voc$Population == "NICOLLE FLATS" | samps.voc$Population == "CHAPLIN"]
#how many total samples retianed from hybrid zone core?
length(east.sub[!is.na(perceastbuzz)])
## [1] 62
#plot overlaid histograms colored by parental pop
plot(hist(east.sub[!is.na(perceastbuzz)], breaks=ax, plot=FALSE), col=alpha("blue", .2))
plot(hist(perceastbuzz/100, breaks=ax, plot=FALSE), col=alpha("red", .2), add=TRUE)