Optimize ‘m’

#load RADstackshelpR package
library(RADstackshelpR)
library(ggplot2)
#load gridExtra package to combine ggplot visualizations
library(gridExtra)
#if we have already read in the 16 vcf files and saved the details as an 'Rdata' file, load it in here:
load("~/Desktop/marsh.wren.rad/denovo.optimization.RData")
#optimize_m function will generate summary stats on your 5 iterative runs
#input can be full path to each file, or just the file name if the vcf files are in your working directory
m.out<-optimize_m(m3="/Users/devder/Desktop/marsh.wren.rad/m3.vcf",
           m4="/Users/devder/Desktop/marsh.wren.rad/m4.vcf",
           m5="/Users/devder/Desktop/marsh.wren.rad/m5.vcf",
           m6="/Users/devder/Desktop/marsh.wren.rad/m6.vcf",
           m7="/Users/devder/Desktop/marsh.wren.rad/m7.vcf")
#Assigning the output of this function to the variable 'm.out' should generate a list containing five objects of class 'data.frame' with the following characteristics: 'depth' showing depth per sample for each m value, 'snp' showing the number of non-missing SNPs retained in each sample at each m value, 'loci' showing the number of non-missing loci retained in each sample at each m value, 'snp.R80' showing the total number of SNPs retained at an 80% completeness cutoff, and 'loci.R80' showing the total number of polymorphic loci retained at an 80% completeness cutoff.

#Use this output list as input for this function, to visualize the effect of varying m on the depth of each sample
vis_depth(output = m.out)
## [1] "Visualize how different values of m affect average depth in each sample"
## Picking joint bandwidth of 17

#visualize the effect of varying m on the number of SNPs retained
vis_snps(output = m.out, stacks_param = "m")
## Visualize how different values of m affect number of SNPs retained.
## Density plot shows the distribution of the number of SNPs retained in each sample,
## while the asterisk denotes the total number of SNPs retained at an 80% completeness cutoff.
## Picking joint bandwidth of 2490

#visualize the effect of varying m on the number of loci retained
vis_loci(output = m.out, stacks_param = "m")
## Visualize how different values of m affect number of polymorphic loci retained.
## Density plot shows the distribution of the number of loci retained in each sample,
## while the asterisk denotes the total number of loci retained at an 80% completeness cutoff. The optimal value is denoted by red color.
## Picking joint bandwidth of 732

Optimize ‘M’

#optimize_bigM function will generate summary stats on your 8 iterative runs
M.out<-optimize_bigM(M1="/Users/devder/Desktop/marsh.wren.rad/bigM1.vcf",
           M2="/Users/devder/Desktop/marsh.wren.rad/bigM2.vcf",
           M3="/Users/devder/Desktop/marsh.wren.rad/bigM3.vcf",
           M4="/Users/devder/Desktop/marsh.wren.rad/bigM4.vcf",
           M5="/Users/devder/Desktop/marsh.wren.rad/bigM5.vcf",
           M6="/Users/devder/Desktop/marsh.wren.rad/bigM6.vcf",
           M7="/Users/devder/Desktop/marsh.wren.rad/bigM7.vcf",
           M8="/Users/devder/Desktop/marsh.wren.rad/bigM8.vcf")
#Assigning the output of this function to the variable 'M.out' should generate a list containing four objects of class 'data.frame' with the following characteristics: 'snp' showing the number of non-missing SNPs retained in each sample at each m value, 'loci' showing the number of non-missing loci retained in each sample at each m value, 'snp.R80' showing the total number of SNPs retained at an 80% completeness cutoff, and 'loci.R80' showing the total number of polymorphic loci retained at an 80% completeness cutoff.

#use this function to visualize the effect of varying 'M' on the number of SNPs retained
vis_snps(output = M.out, stacks_param = "M")
## Visualize how different values of M affect number of SNPs retained.
## Density plot shows the distribution of the number of SNPs retained in each sample,
## while the asterisk denotes the total number of SNPs retained at an 80% completeness cutoff.
## Picking joint bandwidth of 2060

#visualize the effect of varying 'M' on the number of polymorphic loci retained
vis_loci(output = M.out, stacks_param = "M")
## Visualize how different values of M affect number of polymorphic loci retained.
## Density plot shows the distribution of the number of loci retained in each sample,
## while the asterisk denotes the total number of loci retained at an 80% completeness cutoff. The optimal value is denoted by red color.
## Picking joint bandwidth of 649

Optimize ‘n’

#optimize n
n.out<-optimize_n(nequalsMminus1="/Users/devder/Desktop/marsh.wren.rad/n3.vcf",
           nequalsM="/Users/devder/Desktop/marsh.wren.rad/n4.vcf",
           nequalsMplus1="/Users/devder/Desktop/marsh.wren.rad/n5.vcf")
#save the details to an 'Rdata' file so that this script can be rerun without having to load in all 16 vcf files next time
#save(m.out, M.out, n.out, file = "~/Desktop/marsh.wren.rad/denovo.optimization.RData")
##Assigning the output of this function to the variable 'n.out' should generate a single object of class 'data.frame' showing the number of SNPs and loci retained across filtering levels for each value of n.

#visualize the effect of varying n on the number of SNPs retained
vis_snps(output = n.out, stacks_param = "n")
## Visualize how different values of n affect number of SNPs retained.
## Density plot shows the distribution of the number of SNPs retained in each sample,
## while the asterisk denotes the total number of SNPs retained at an 80% completeness cutoff.
## Picking joint bandwidth of 1850

#visualize the effect of varying n on the number of polymorphic loci retained
vis_loci(output = n.out, stacks_param = "n")
## Visualize how different values of n affect number of polymorphic loci retained.
## Density plot shows the distribution of the number of loci retained in each sample,
## while the asterisk denotes the total number of loci retained at an 80% completeness cutoff. The optimal value is denoted by red color.
## Picking joint bandwidth of 601

plot summary figure

#combine all of these prior visulizations in a single list
gl<-list()
gl[[1]]<-vis_depth(output = m.out)
## [1] "Visualize how different values of m affect average depth in each sample"
gl[[2]]<-vis_snps(output = m.out, stacks_param = "m")
## Visualize how different values of m affect number of SNPs retained.
## Density plot shows the distribution of the number of SNPs retained in each sample,
## while the asterisk denotes the total number of SNPs retained at an 80% completeness cutoff.
gl[[3]]<-vis_loci(output = m.out, stacks_param = "m")
## Visualize how different values of m affect number of polymorphic loci retained.
## Density plot shows the distribution of the number of loci retained in each sample,
## while the asterisk denotes the total number of loci retained at an 80% completeness cutoff. The optimal value is denoted by red color.
gl[[4]]<-vis_snps(output = M.out, stacks_param = "M")
## Visualize how different values of M affect number of SNPs retained.
## Density plot shows the distribution of the number of SNPs retained in each sample,
## while the asterisk denotes the total number of SNPs retained at an 80% completeness cutoff.
gl[[5]]<-vis_loci(output = M.out, stacks_param = "M")
## Visualize how different values of M affect number of polymorphic loci retained.
## Density plot shows the distribution of the number of loci retained in each sample,
## while the asterisk denotes the total number of loci retained at an 80% completeness cutoff. The optimal value is denoted by red color.
gl[[6]]<-vis_snps(output = n.out, stacks_param = "n")
## Visualize how different values of n affect number of SNPs retained.
## Density plot shows the distribution of the number of SNPs retained in each sample,
## while the asterisk denotes the total number of SNPs retained at an 80% completeness cutoff.
gl[[7]]<-vis_loci(output = n.out, stacks_param = "n")
## Visualize how different values of n affect number of polymorphic loci retained.
## Density plot shows the distribution of the number of loci retained in each sample,
## while the asterisk denotes the total number of loci retained at an 80% completeness cutoff. The optimal value is denoted by red color.
#visualize each item of the list as part of a single grid
grid.arrange(grobs = gl, widths = c(1,1,1,1,1,1),
  layout_matrix = rbind(c(1,1,2,2,3,3),
                        c(4,4,4,5,5,5),
                        c(6,6,6,7,7,7)))
## Picking joint bandwidth of 17
## Picking joint bandwidth of 2490
## Picking joint bandwidth of 732
## Picking joint bandwidth of 2060
## Picking joint bandwidth of 649
## Picking joint bandwidth of 1850
## Picking joint bandwidth of 601

#save plot
g<-arrangeGrob(grobs = gl, widths = c(1,1,1,1,1,1),
  layout_matrix = rbind(c(1,1,2,2,3,3),
                        c(4,4,4,5,5,5),
                        c(6,6,6,7,7,7)))
## Picking joint bandwidth of 17
## Picking joint bandwidth of 2490
## Picking joint bandwidth of 732
## Picking joint bandwidth of 2060
## Picking joint bandwidth of 649
## Picking joint bandwidth of 1850
## Picking joint bandwidth of 601
ggsave(file="~/Desktop/marsh.wren.rad/denovo.optimization.png", g, width = 10, height=8, units = "in")