make ancestry histograms and plot diversity

#load packages
library(vcfR)

## 
##    *****       ***   vcfR   ***       *****
##    This is vcfR 1.14.0 
##      browseVignettes('vcfR') # Documentation
##      citation('vcfR') # Citation
##    *****       *****      *****       *****

library(SNPfiltR)

## This is SNPfiltR v.1.0.2
## 
## Detailed usage information is available at: devonderaad.github.io/SNPfiltR/ 
## 
## If you use SNPfiltR in your published work, please cite the following papers: 
## 
## DeRaad, D.A. (2022), SNPfiltR: an R package for interactive and reproducible SNP filtering. Molecular Ecology Resources, 22, 2443-2453. http://doi.org/10.1111/1755-0998.13618 
## 
## Knaus, Brian J., and Niklaus J. Grunwald. 2017. VCFR: a package to manipulate and visualize variant call format data in R. Molecular Ecology Resources, 17.1:44-53. http://doi.org/10.1111/1755-0998.12549

library(RColorBrewer)
library(ggplot2)
library(StAMPP)

## Loading required package: pegas

## Loading required package: ape

## 
## Attaching package: 'pegas'

## The following object is masked from 'package:ape':
## 
##     mst

## The following objects are masked from 'package:vcfR':
## 
##     getINFO, write.vcf

## Registered S3 method overwritten by 'ade4':
##   method      from 
##   print.amova pegas

library(adegenet)

## Loading required package: ade4

## 
## Attaching package: 'ade4'

## The following object is masked from 'package:pegas':
## 
##     amova

## 
##    /// adegenet 2.1.10 is loaded ////////////
## 
##    > overview: '?adegenet'
##    > tutorials/doc/questions: 'adegenetWeb()' 
##    > bug reports/feature requests: adegenetIssues()

library(ggmulti)
library(gridExtra)
library(diptest)

#read in vcf as vcfR
vcfR <- read.vcfR("~/Desktop/marsh.wren.rad/filtered.vcf.gz")

## Scanning file to determine attributes.
## File attributes:
##   meta lines: 14
##   header_line: 15
##   variant count: 9583
##   column count: 146
## 
Meta line 14 read in.
## All meta lines processed.
## gt matrix initialized.
## Character matrix gt created.
##   Character matrix gt rows: 9583
##   Character matrix gt cols: 146
##   skip: 0
##   nrows: 9583
##   row_num: 0
## 
Processed variant 1000
Processed variant 2000
Processed variant 3000
Processed variant 4000
Processed variant 5000
Processed variant 6000
Processed variant 7000
Processed variant 8000
Processed variant 9000
Processed variant: 9583
## All variants processed

#read in sample info
samps<-read.csv("~/Desktop/marsh.wren.rad/samples.with.qvalues.csv")
#retain samples that were sequenced successfully
samps<-samps[samps$Tissue.num %in% colnames(vcfR@gt),]
#reorder sampling file to match order of samples in vcf
samps<-samps[match(colnames(vcfR@gt)[-1], samps$Tissue.num),]
samps$Tissue.num == colnames(vcfR@gt)[-1] #check that order matches

##   [1] TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE
##  [16] TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE
##  [31] TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE
##  [46] TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE
##  [61] TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE
##  [76] TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE
##  [91] TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE
## [106] TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE
## [121] TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE
## [136] TRUE TRUE

#read in sample info with vocal info included
samps.voc<-read.csv("~/Desktop/marsh.wren.rad/samples.with.qvalues.plus.song.info.csv")
#specify that the 5 nebraska samples with song data have no eastern buzz
#samps.voc$perc.east.buzz[133:137]<-0
#write.csv(samps.voc, "~/Desktop/marsh.wren.rad/samples.with.qvalues.plus.song.info.csv", row.names = F, quote = F)

#plot histograms of qvalue (ancestry proportion) to see underlying variance at each site
ggplot(samps.voc, aes(x = western.qvalue)) +
  geom_histogram(fill = "white", colour = "black") +
  facet_grid(Population ~ .)+
  theme_minimal()+
  theme(panel.grid.major = element_blank(), panel.grid.minor = element_blank())

## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.

#use pop assignments instead of names
#pops<-gsub("NEBRASKA", 1, samps.voc$Population)
#pops<-gsub("CRANE", 2, pops)
#pops<-gsub("CHAPLIN", 3, pops)
#pops<-gsub("EYEBROW LAKE", 4, pops)
#pops<-gsub("NICOLLE FLATS", 5, pops)
#pops<-gsub("LOSTWOODS", 6, pops)
#samps.voc$pops<-pops

#make eastern q
samps.voc$eastern.qvalue<-1-samps.voc$western.qvalue
samps.voc$pops<-as.factor(samps.voc$pops)

#plot
ggplot(samps.voc, aes(x = western.qvalue)) +
  geom_histogram(fill = "white", colour = "black") +
  facet_grid(pops ~ .)+
  theme_minimal()+
  theme(panel.grid.major = element_blank(), panel.grid.minor = element_blank())

## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.

#plot percent eastern buzz
ggplot(samps.voc, aes(x = perc.east.buzz)) +
  geom_histogram(fill = "white", colour = "black") +
  facet_grid(. ~ pops)+
  theme_classic()+
  theme(panel.grid.major = element_blank(), panel.grid.minor = element_blank(), axis.line.y = element_blank())+
  coord_flip()

## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.

## Warning: Removed 47 rows containing non-finite values (`stat_bin()`).

ggplot(samps.voc, aes(x = perc.east.buzz)) +
  geom_histogram(fill = "white", colour = "black") +
  facet_grid(. ~ pops)+
  theme_classic()+
  theme(panel.grid.major = element_blank(), panel.grid.minor = element_blank(),rect = element_blank())+
  coord_flip()

## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.

## Warning: Removed 47 rows containing non-finite values (`stat_bin()`).

#make violin plots
ggplot(samps.voc, aes(x=pops, y=western.qvalue, fill=pops))+ 
  geom_violin()+
  geom_jitter(position=position_jitter(0.1))+
  ylim(c(0,1))+
  theme_classic()

ggplot(samps.voc, aes(x=pops, y=western.qvalue))+ 
  geom_dotplot(binaxis='y', stackdir='center', dotsize=.3)+
  ylim(c(0,1))+
  theme_classic()

## Bin width defaults to 1/30 of the range of the data. Pick better value with
## `binwidth`.

#use geom_histogram_() function from package 'ggmulti'
ggplot(samps.voc, mapping = aes(x = pops, y = eastern.qvalue, fill=pops)) +
  scale_fill_brewer(palette = "BrBG", direction = -1)+
  geom_histogram_(as.mix = TRUE, prop = 2.3, color="black")+
  theme_classic()

## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.

ggplot(samps.voc, mapping = aes(x = pops, y = perc.east.buzz, fill=pops)) +
  scale_fill_brewer(palette = "BrBG", direction = -1)+
  geom_histogram_(as.mix = TRUE, prop = 1.6, color="black")+
  theme_classic()

## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.

## Warning: Removed 47 rows containing missing values (stat_hist_).

#save
qval<-ggplot(samps.voc, mapping = aes(x = pops, y = eastern.qvalue, fill=pops)) +
  scale_fill_brewer(palette = "BrBG", direction = -1)+
  geom_histogram_(as.mix = TRUE, prop = 2.3, color="black", bins =15)+
  theme_classic()+
  ylab("eastern ancestry proportion")+
  theme(legend.position = "none")

eb<-ggplot(samps.voc, mapping = aes(x = pops, y = perc.east.buzz, fill=pops)) +
  scale_fill_brewer(palette = "BrBG", direction = -1)+
  geom_histogram_(as.mix = TRUE, prop = 1.6, color="black", bins = 15)+
  theme_classic()+
  ylab("eastern 'buzz' proportion")+
  theme(legend.position = "none")

figure <- grid.arrange(qval, eb,
                    ncol = 1, nrow = 2)

## Warning: Removed 47 rows containing missing values (stat_hist_).

#ggsave("~/Desktop/marsh.wren.rad/hists.by.site.pdf",figure,width=8.5,height=4.5)

### Test for unimodality at each site
#buzz %
dip.test(samps.voc$perc.east.buzz[samps.voc$pops ==1 & is.na(samps.voc$perc.east.buzz) == FALSE])

## Warning in regularize.values(x, y, ties, missing(ties), na.rm = na.rm):
## collapsing to unique 'x' values

## 
##  Hartigans' dip test for unimodality / multimodality
## 
## data:  samps.voc$perc.east.buzz[samps.voc$pops == 1 & is.na(samps.voc$perc.east.buzz) ==     FALSE]
## D = 0.1, p-value = 0.9029
## alternative hypothesis: non-unimodal, i.e., at least bimodal

dip.test(samps.voc$perc.east.buzz[samps.voc$pops ==2 & is.na(samps.voc$perc.east.buzz) == FALSE])

## 
##  Hartigans' dip test for unimodality / multimodality
## 
## data:  samps.voc$perc.east.buzz[samps.voc$pops == 2 & is.na(samps.voc$perc.east.buzz) ==     FALSE]
## D = 0.035714, p-value = 1
## alternative hypothesis: non-unimodal, i.e., at least bimodal

dip.test(samps.voc$perc.east.buzz[samps.voc$pops ==3 & is.na(samps.voc$perc.east.buzz) == FALSE])

## Warning in regularize.values(x, y, ties, missing(ties), na.rm = na.rm):
## collapsing to unique 'x' values

## 
##  Hartigans' dip test for unimodality / multimodality
## 
## data:  samps.voc$perc.east.buzz[samps.voc$pops == 3 & is.na(samps.voc$perc.east.buzz) ==     FALSE]
## D = 0.2475, p-value < 2.2e-16
## alternative hypothesis: non-unimodal, i.e., at least bimodal

dip.test(samps.voc$perc.east.buzz[samps.voc$pops ==4 & is.na(samps.voc$perc.east.buzz) == FALSE])

## 
##  Hartigans' dip test for unimodality / multimodality
## 
## data:  samps.voc$perc.east.buzz[samps.voc$pops == 4 & is.na(samps.voc$perc.east.buzz) ==     FALSE]
## D = 0.21187, p-value < 2.2e-16
## alternative hypothesis: non-unimodal, i.e., at least bimodal

dip.test(samps.voc$perc.east.buzz[samps.voc$pops ==5 & is.na(samps.voc$perc.east.buzz) == FALSE])

## 
##  Hartigans' dip test for unimodality / multimodality
## 
## data:  samps.voc$perc.east.buzz[samps.voc$pops == 5 & is.na(samps.voc$perc.east.buzz) ==     FALSE]
## D = 0.1875, p-value = 9.916e-07
## alternative hypothesis: non-unimodal, i.e., at least bimodal

dip.test(samps.voc$perc.east.buzz[samps.voc$pops ==6 & is.na(samps.voc$perc.east.buzz) == FALSE])

## 
##  Hartigans' dip test for unimodality / multimodality
## 
## data:  samps.voc$perc.east.buzz[samps.voc$pops == 6 & is.na(samps.voc$perc.east.buzz) ==     FALSE]
## D = 0.055556, p-value = 1
## alternative hypothesis: non-unimodal, i.e., at least bimodal

#q-value
dip.test(samps.voc$eastern.qvalue[samps.voc$pops ==1 & is.na(samps.voc$eastern.qvalue) == FALSE])

## 
##  Hartigans' dip test for unimodality / multimodality
## 
## data:  samps.voc$eastern.qvalue[samps.voc$pops == 1 & is.na(samps.voc$eastern.qvalue) ==     FALSE]
## D = 0.05, p-value = 1
## alternative hypothesis: non-unimodal, i.e., at least bimodal

dip.test(samps.voc$eastern.qvalue[samps.voc$pops ==2 & is.na(samps.voc$eastern.qvalue) == FALSE])

## 
##  Hartigans' dip test for unimodality / multimodality
## 
## data:  samps.voc$eastern.qvalue[samps.voc$pops == 2 & is.na(samps.voc$eastern.qvalue) ==     FALSE]
## D = 0.040599, p-value = 0.9965
## alternative hypothesis: non-unimodal, i.e., at least bimodal

dip.test(samps.voc$eastern.qvalue[samps.voc$pops ==3 & is.na(samps.voc$eastern.qvalue) == FALSE])

## Warning in regularize.values(x, y, ties, missing(ties), na.rm = na.rm):
## collapsing to unique 'x' values

## 
##  Hartigans' dip test for unimodality / multimodality
## 
## data:  samps.voc$eastern.qvalue[samps.voc$pops == 3 & is.na(samps.voc$eastern.qvalue) ==     FALSE]
## D = 0.22408, p-value = 0.0005267
## alternative hypothesis: non-unimodal, i.e., at least bimodal

dip.test(samps.voc$eastern.qvalue[samps.voc$pops ==4 & is.na(samps.voc$eastern.qvalue) == FALSE])

## 
##  Hartigans' dip test for unimodality / multimodality
## 
## data:  samps.voc$eastern.qvalue[samps.voc$pops == 4 & is.na(samps.voc$eastern.qvalue) ==     FALSE]
## D = 0.15587, p-value < 2.2e-16
## alternative hypothesis: non-unimodal, i.e., at least bimodal

dip.test(samps.voc$eastern.qvalue[samps.voc$pops ==5 & is.na(samps.voc$eastern.qvalue) == FALSE])

## 
##  Hartigans' dip test for unimodality / multimodality
## 
## data:  samps.voc$eastern.qvalue[samps.voc$pops == 5 & is.na(samps.voc$eastern.qvalue) ==     FALSE]
## D = 0.10346, p-value = 0.01455
## alternative hypothesis: non-unimodal, i.e., at least bimodal

dip.test(samps.voc$eastern.qvalue[samps.voc$pops ==6 & is.na(samps.voc$eastern.qvalue) == FALSE])

## 
##  Hartigans' dip test for unimodality / multimodality
## 
## data:  samps.voc$eastern.qvalue[samps.voc$pops == 6 & is.na(samps.voc$eastern.qvalue) ==     FALSE]
## D = 0.055556, p-value = 1
## alternative hypothesis: non-unimodal, i.e., at least bimodal

calculate pi and heterozygosity

#make popmaps
#samps.voc<-read.csv("~/Desktop/marsh.wren.rad/samples.with.qvalues.plus.song.info.csv")
#write.table(samps.voc[,c(1,19)], "~/Desktop/marsh.wren.rad/pops.popmap.txt", sep = "\t", quote = F, row.names = F, col.names = F)
#write.table(samps.voc[,c(1,1)], "~/Desktop/marsh.wren.rad/samps.popmap.txt", sep = "\t", quote = F, row.names = F, col.names = F)

#run this in bash, where pops.popmap.txt assigns samples to species, to get species level pi estimates
#and samps.popmap.txt assigns each sample as a unique population to get per individual heterozygosity estimates
# Run populations and export population info
#/home/d669d153/work/stacks-2.41/populations -P ./stacks_n4 -M pops.popmap.txt --fstats -O ./pops
# Run populations and export population info
#/home/d669d153/work/stacks-2.41/populations -P ./stacks_n4 -M samps.popmap.txt -O ./single.sample

#read in each output file
#pops
#pi.pops<-read.table("~/Desktop/marsh.wren.rad/pi.het.info/fixed.pops/populations.sumstats_summary.tsv", header=T, sep='\t')
#per sample
pi.sample<-read.table("~/Desktop/marsh.wren.rad/pi.het.info/single.sample/populations.sumstats_summary.tsv", header=T, sep='\t')
#check that ordering stayed the same
pi.sample$Pop.ID == samps.voc$Tissue.num

##   [1] TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE
##  [16] TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE
##  [31] TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE
##  [46] TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE
##  [61] TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE
##  [76] TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE
##  [91] TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE
## [106] TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE
## [121] TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE
## [136] TRUE TRUE

pi.vec<-gsub("1","0.00111", samps.voc$pops)
pi.vec<-gsub("2","0.0018", pi.vec)
pi.vec<-gsub("3","0.0027", pi.vec)
pi.vec<-gsub("4","0.00241", pi.vec)
pi.vec<-gsub("5","0.00231", pi.vec)
pi.vec<-gsub("6","0.00195", pi.vec)
#make plotting df
plotting.df<-data.frame(sample=pi.sample$Pop.ID,
                        population=as.factor(samps.voc$pops),
                        het=pi.sample$Obs_Het,
                        pi=pi.vec)
plotting.df$pi<-as.numeric(plotting.df$pi)

#plot heterozygosity violin plots
ggplot(plotting.df, aes(x=population, y=het))+ 
  #geom_violin(trim = FALSE)+
  geom_dotplot(binaxis='y', stackdir='center', dotsize = 1, alpha=.8, aes(fill=population,color=population))+
  theme_classic()+
  scale_fill_brewer(palette = "BrBG", direction = -1)+
  scale_color_brewer(palette = "BrBG", direction = -1)+
  theme(legend.position = "none")+
  geom_point(aes(y=pi), pch=8, cex=3)+
  labs(x="",y="heterozygosity/Pi")

## Bin width defaults to 1/30 of the range of the data. Pick better value with
## `binwidth`.

  #scale_y_continuous(sec.axis = sec_axis(trans = (~.*1), name="Pi"))

#plot
het<-ggplot(plotting.df, aes(x=population, y=het)) + 
  #geom_violin(trim = FALSE)+
  geom_dotplot(binaxis='y', stackdir='center', dotsize = 2, alpha=.8, aes(fill=population,color=population))+
  theme_classic()+
  scale_fill_brewer(palette = "BrBG", direction = -1)+
  scale_color_brewer(palette = "BrBG", direction = -1)+
  theme(legend.position = "none")+
  geom_point(aes(y=pi), pch=8, cex=3)+
  labs(x="",y="heterozygosity / Pi")
  
grid.arrange(qval, eb, het,
                    ncol = 1, nrow = 3)

## Warning: Removed 47 rows containing missing values (stat_hist_).

## Bin width defaults to 1/30 of the range of the data. Pick better value with
## `binwidth`.

#save
figure <- grid.arrange(qval, eb, het,
                    ncol = 1, nrow = 3)

## Warning: Removed 47 rows containing missing values (stat_hist_).

## Bin width defaults to 1/30 of the range of the data. Pick better value with
## `binwidth`.

#ggsave("~/Desktop/marsh.wren.rad/hists.by.site.plus.het.pdf",figure,width=9,height=6)

make ancestry histograms and plot diversity

Devon DeRaad

2022-10-04

calculate pi and heterozygosity