library(vcfR)
## Warning: package 'vcfR' was built under R version 3.5.2
## 
##    *****       ***   vcfR   ***       *****
##    This is vcfR 1.9.0 
##      browseVignettes('vcfR') # Documentation
##      citation('vcfR') # Citation
##    *****       *****      *****       *****
library(ggplot2)
library(gridExtra)
library(ggridges)
## Warning: package 'ggridges' was built under R version 3.5.2
library(adegenet)
## Loading required package: ade4
## Warning: package 'ade4' was built under R version 3.5.2
## 
##    /// adegenet 2.1.3 is loaded ////////////
## 
##    > overview: '?adegenet'
##    > tutorials/doc/questions: 'adegenetWeb()' 
##    > bug reports/feature requests: adegenetIssues()
library(RADstackshelpR)

RADStacksHelpR

Read in your unfiltered vcf file

#read in vcf as vcfR
vcfR <- read.vcfR("~/Desktop/aph.data/populations.snps.vcf")
## Scanning file to determine attributes.
## File attributes:
##   meta lines: 14
##   header_line: 15
##   variant count: 210336
##   column count: 124
## 
Meta line 14 read in.
## All meta lines processed.
## gt matrix initialized.
## Character matrix gt created.
##   Character matrix gt rows: 210336
##   Character matrix gt cols: 124
##   skip: 0
##   nrows: 210336
##   row_num: 0
## 
Processed variant 1000
Processed variant 2000
Processed variant 3000
Processed variant 4000
Processed variant 5000
Processed variant 6000
Processed variant 7000
Processed variant 8000
Processed variant 9000
Processed variant 10000
Processed variant 11000
Processed variant 12000
Processed variant 13000
Processed variant 14000
Processed variant 15000
Processed variant 16000
Processed variant 17000
Processed variant 18000
Processed variant 19000
Processed variant 20000
Processed variant 21000
Processed variant 22000
Processed variant 23000
Processed variant 24000
Processed variant 25000
Processed variant 26000
Processed variant 27000
Processed variant 28000
Processed variant 29000
Processed variant 30000
Processed variant 31000
Processed variant 32000
Processed variant 33000
Processed variant 34000
Processed variant 35000
Processed variant 36000
Processed variant 37000
Processed variant 38000
Processed variant 39000
Processed variant 40000
Processed variant 41000
Processed variant 42000
Processed variant 43000
Processed variant 44000
Processed variant 45000
Processed variant 46000
Processed variant 47000
Processed variant 48000
Processed variant 49000
Processed variant 50000
Processed variant 51000
Processed variant 52000
Processed variant 53000
Processed variant 54000
Processed variant 55000
Processed variant 56000
Processed variant 57000
Processed variant 58000
Processed variant 59000
Processed variant 60000
Processed variant 61000
Processed variant 62000
Processed variant 63000
Processed variant 64000
Processed variant 65000
Processed variant 66000
Processed variant 67000
Processed variant 68000
Processed variant 69000
Processed variant 70000
Processed variant 71000
Processed variant 72000
Processed variant 73000
Processed variant 74000
Processed variant 75000
Processed variant 76000
Processed variant 77000
Processed variant 78000
Processed variant 79000
Processed variant 80000
Processed variant 81000
Processed variant 82000
Processed variant 83000
Processed variant 84000
Processed variant 85000
Processed variant 86000
Processed variant 87000
Processed variant 88000
Processed variant 89000
Processed variant 90000
Processed variant 91000
Processed variant 92000
Processed variant 93000
Processed variant 94000
Processed variant 95000
Processed variant 96000
Processed variant 97000
Processed variant 98000
Processed variant 99000
Processed variant 100000
Processed variant 101000
Processed variant 102000
Processed variant 103000
Processed variant 104000
Processed variant 105000
Processed variant 106000
Processed variant 107000
Processed variant 108000
Processed variant 109000
Processed variant 110000
Processed variant 111000
Processed variant 112000
Processed variant 113000
Processed variant 114000
Processed variant 115000
Processed variant 116000
Processed variant 117000
Processed variant 118000
Processed variant 119000
Processed variant 120000
Processed variant 121000
Processed variant 122000
Processed variant 123000
Processed variant 124000
Processed variant 125000
Processed variant 126000
Processed variant 127000
Processed variant 128000
Processed variant 129000
Processed variant 130000
Processed variant 131000
Processed variant 132000
Processed variant 133000
Processed variant 134000
Processed variant 135000
Processed variant 136000
Processed variant 137000
Processed variant 138000
Processed variant 139000
Processed variant 140000
Processed variant 141000
Processed variant 142000
Processed variant 143000
Processed variant 144000
Processed variant 145000
Processed variant 146000
Processed variant 147000
Processed variant 148000
Processed variant 149000
Processed variant 150000
Processed variant 151000
Processed variant 152000
Processed variant 153000
Processed variant 154000
Processed variant 155000
Processed variant 156000
Processed variant 157000
Processed variant 158000
Processed variant 159000
Processed variant 160000
Processed variant 161000
Processed variant 162000
Processed variant 163000
Processed variant 164000
Processed variant 165000
Processed variant 166000
Processed variant 167000
Processed variant 168000
Processed variant 169000
Processed variant 170000
Processed variant 171000
Processed variant 172000
Processed variant 173000
Processed variant 174000
Processed variant 175000
Processed variant 176000
Processed variant 177000
Processed variant 178000
Processed variant 179000
Processed variant 180000
Processed variant 181000
Processed variant 182000
Processed variant 183000
Processed variant 184000
Processed variant 185000
Processed variant 186000
Processed variant 187000
Processed variant 188000
Processed variant 189000
Processed variant 190000
Processed variant 191000
Processed variant 192000
Processed variant 193000
Processed variant 194000
Processed variant 195000
Processed variant 196000
Processed variant 197000
Processed variant 198000
Processed variant 199000
Processed variant 200000
Processed variant 201000
Processed variant 202000
Processed variant 203000
Processed variant 204000
Processed variant 205000
Processed variant 206000
Processed variant 207000
Processed variant 208000
Processed variant 209000
Processed variant 210000
Processed variant: 210336
## All variants processed
### check the metadata present in your vcf
vcfR
## ***** Object of Class vcfR *****
## 115 samples
## 87 CHROMs
## 210,336 variants
## Object size: 685.5 Mb
## 57.1 percent missing data
## *****        *****         *****
vcfR@fix[1:10,1:8]
##       CHROM        POS      ID        REF ALT QUAL FILTER INFO            
##  [1,] "Pseudochr1" "615693" "48:25:+" "T" "C" NA   "PASS" "NS=3;AF=0.333" 
##  [2,] "Pseudochr1" "615724" "48:56:+" "C" "T" NA   "PASS" "NS=3;AF=0.333" 
##  [3,] "Pseudochr1" "674543" "56:43:+" "A" "T" NA   "PASS" "NS=9;AF=0.222" 
##  [4,] "Pseudochr1" "674545" "56:45:+" "A" "T" NA   "PASS" "NS=18;AF=0.222"
##  [5,] "Pseudochr1" "674596" "56:96:+" "A" "T" NA   "PASS" "NS=14;AF=0.250"
##  [6,] "Pseudochr1" "690980" "61:7:+"  "G" "A" NA   "PASS" "NS=10;AF=0.100"
##  [7,] "Pseudochr1" "690991" "61:18:+" "T" "C" NA   "PASS" "NS=10;AF=0.300"
##  [8,] "Pseudochr1" "691006" "61:33:+" "T" "A" NA   "PASS" "NS=10;AF=0.300"
##  [9,] "Pseudochr1" "690931" "62:45:-" "G" "A" NA   "PASS" "NS=4;AF=0.250" 
## [10,] "Pseudochr1" "690906" "62:70:-" "A" "G" NA   "PASS" "NS=4;AF=0.250"
vcfR@gt[1:10,1:2]
##       FORMAT           A_californica_333849                
##  [1,] "GT:DP:AD:GQ:GL" NA                                  
##  [2,] "GT:DP:AD:GQ:GL" NA                                  
##  [3,] "GT:DP:AD:GQ:GL" NA                                  
##  [4,] "GT:DP:AD:GQ:GL" "1/1:67:0,67:40:-256.40,-21.43,0.00"
##  [5,] "GT:DP:AD:GQ:GL" "1/1:67:0,67:40:-226.28,-20.45,0.00"
##  [6,] "GT:DP:AD:GQ:GL" "0/0:2:2,0:32:-0.00,-2.62,-6.23"    
##  [7,] "GT:DP:AD:GQ:GL" "1/1:2:0,2:27:-4.96,-2.15,-0.00"    
##  [8,] "GT:DP:AD:GQ:GL" "0/0:2:2,0:31:-0.00,-2.51,-5.68"    
##  [9,] "GT:DP:AD:GQ:GL" NA                                  
## [10,] "GT:DP:AD:GQ:GL" NA
#generate popmap file. Two column popmap with the same format as stacks, and the columns must be named 'id' and 'pop'
popmap<-data.frame(id=colnames(vcfR@gt)[2:length(colnames(vcfR@gt))],pop=substr(colnames(vcfR@gt)[2:length(colnames(vcfR@gt))], 3,11))

step 1: Implement quality filters that don't involve missing data. This is because removing low data samples will alter percentage/quantile based missing data cutoffs, so we wait to implement those until after deciding on our final set of samples for downstream analysis

Note:If you have a very large vcf output file that you would like to work with in Rstudio, you could implement some percentage based filters (e.g. remove all SNPs above 50% missing data) via vcftools to reduce your filesize before starting to filter in R. Just be aware that once you drop low data samples, your previously enforced (per SNP or locus) missing data % will no longer be exact.

Jon Puritz has an excellent filtering tutorial that is focused specifically on filtering RADseq data: datahttps://www.ddocent.com/filtering/ Some of these RAD specific filters can't be applied to a vcf output by stacks, which doesn't retain metadata like mapping quality and strand, but we can follow these guidelines for hard filtering (he suggests minimum depth=3, gq =30), and can implement suggested filters like allele balance and max depth, here in R.

#hard filter to minimum depth of 5, and minimum genotype quality of 30
vcfR<-hard.filter.vcf(vcfR=vcfR, depth = 5, gq = 30)
## [1] "32.92% of genotypes fall below a read depth of 5 and were converted to NA"
## [1] "2.01% of genotypes fall below a genotype quality of 30 and were converted to NA"

Use this function to filter for allele balance from Puritz SNP filtering tutorial "Allele balance: a number between 0 and 1 representing the ratio of reads showing the reference allele to all reads, considering only reads from individuals called as heterozygous, we expect that the allele balance in our data (for real loci) should be close to 0.5"

#execute allele balance filter
vcfR<-filter.allele.balance(vcfR)
## [1] "7.56% of het genotypes (0.39% of all genotypes) fall outside of .25 - .75 allele balance and were converted to NA"

max depth filter (super high depth loci are likely multiple loci stuck together into a single paralogous locus). Note: we want to apply this 'per SNP' rather than 'per genotype' otherwise we will simply be removing most of the genotypes from our deepest sequenced samples (because sequencing depth is so variable between samples)

#visualize and pick appropriate max depth cutoff
max_depth(vcfR)

## [1] "dashed line indicates a mean depth across all SNPs of 46.7"
#filter vcf by the max depth cutoff you chose
vcfR<-max_depth(vcfR, maxdepth = 100)
## [1] "12.85% of SNPs were above a mean depth of 100 and were removed from the vcf"

Note: It may be a good idea to additionally filter out SNPs that are significantly out of HWE if you have a really good idea of what the population structure in your sample looks like and good sample sizes in each pop. For this dataset, which is highly structured (many isolated island pops) with species boundaries that are in flux, I am not going to use a HWE filter, because I don't feel comfortable confidently identifying populations in which we can expect HWE.

#check vcfR to see how many SNPs we have left
vcfR
## ***** Object of Class vcfR *****
## 115 samples
## 78 CHROMs
## 183,293 variants
## Object size: 408 Mb
## 79.79 percent missing data
## *****        *****         *****

Step 2: visualize missing data by sample. Check out the visualizations and make decision on which samples to keep for downstream analysis.

Determining which samples to retain is highly project specific, and is contingent on your sampling, your taxa, the sequencing results, etc. It is also wise to take missing data into account by population. Often we don't know a-priori what our populations will look like, but in general we want to avoid making inferences about populations if they have consistently less information than the rest of our dataset. On the flip side of that coin, you may have designed a project to get information about a rare sample/population that lacks existing genetic sampling, and therefore must retain specific samples despite low sequencing and high missing data. This is a reality, and simply needs to be addressed carefully.

#run function to visualize samples
missing.by.sample(vcfR=vcfR, popmap = popmap)
## `stat_bindot()` using `bins = 30`. Pick better value with `binwidth`.
## `stat_bindot()` using `bins = 30`. Pick better value with `binwidth`.

##                     indiv filt snps.retained
## 1    A_californica_333849  0.5         31688
## 2    A_californica_333854  0.5         29910
## 3    A_californica_333855  0.5         27617
## 4    A_californica_333857  0.5         31765
## 5    A_californica_333860  0.5         30834
## 6    A_californica_333862  0.5         29263
## 7    A_californica_333868  0.5           846
## 8    A_californica_333871  0.5         20433
## 9    A_californica_333873  0.5         15351
## 10   A_californica_333874  0.5         27270
## 11   A_californica_333914  0.5         19934
## 12   A_californica_333917  0.5         30880
## 13   A_californica_333925  0.5             2
## 14   A_californica_333931  0.5         28889
## 15   A_californica_333932  0.5         30769
## 16   A_californica_333934  0.5         31186
## 17   A_californica_334002  0.5         31698
## 18   A_californica_334012  0.5         31683
## 19   A_californica_334015  0.5         31614
## 20   A_californica_334017  0.5         28141
## 21   A_californica_334019  0.5         22655
## 22   A_californica_334171  0.5         28981
## 23   A_californica_342037  0.5         25503
## 24   A_californica_342048  0.5         30711
## 25   A_californica_342050  0.5         31790
## 26   A_californica_342051  0.5           657
## 27   A_californica_342066  0.5         31014
## 28   A_californica_342072  0.5         19860
## 29   A_californica_343428  0.5         27973
## 30   A_californica_343430  0.5         26020
## 31   A_californica_343432  0.5         24797
## 32   A_californica_343438  0.5         20792
## 33   A_californica_343442  0.5          3820
## 34   A_californica_343451  0.5         21374
## 35   A_californica_393615  0.5         21712
## 36   A_californica_393616  0.5         10758
## 37   A_californica_393721  0.5         28631
## 38  A_coerulescens_396251  0.5         30403
## 39  A_coerulescens_396254  0.5         28166
## 40  A_coerulescens_396256  0.5         21998
## 41  A_coerulescens_396259  0.5          4888
## 42  A_coerulescens_396262  0.5             1
## 43  A_coerulescens_396263  0.5         17512
## 44  A_coerulescens_396264  0.5         30619
## 45  A_coerulescens_396265  0.5         30701
## 46     A_insularis_334025  0.5         29701
## 47     A_insularis_334029  0.5         29041
## 48     A_insularis_334031  0.5         31081
## 49     A_insularis_334032  0.5         31354
## 50     A_insularis_334033  0.5         31236
## 51     A_insularis_334034  0.5         30946
## 52     A_insularis_334037  0.5         30671
## 53     A_insularis_334038  0.5         19877
## 54   A_sumichrasti_343502  0.5         28739
## 55   A_sumichrasti_343503  0.5         31684
## 56   A_sumichrasti_343510  0.5         25150
## 57   A_sumichrasti_343511  0.5         21612
## 58   A_sumichrasti_343512  0.5         31703
## 59   A_sumichrasti_343513  0.5          8925
## 60   A_sumichrasti_343514  0.5         26499
## 61   A_sumichrasti_343515  0.5         27363
## 62   A_sumichrasti_393633  0.5          4202
## 63   A_sumichrasti_393635  0.5         27429
## 64   A_sumichrasti_393636  0.5         12663
## 65   A_sumichrasti_393637  0.5          2707
## 66   A_sumichrasti_393638  0.5         22402
## 67   A_sumichrasti_393639  0.5           964
## 68   A_sumichrasti_393640  0.5         30299
## 69   A_woodhouseii_334059  0.5            43
## 70   A_woodhouseii_334062  0.5         25071
## 71   A_woodhouseii_334063  0.5         28395
## 72   A_woodhouseii_334086  0.5         25925
## 73   A_woodhouseii_334088  0.5         27747
## 74   A_woodhouseii_334096  0.5         17103
## 75   A_woodhouseii_334132  0.5         31010
## 76   A_woodhouseii_334133  0.5         31237
## 77   A_woodhouseii_334134  0.5         16662
## 78   A_woodhouseii_334142  0.5         31175
## 79   A_woodhouseii_334148  0.5         11997
## 80   A_woodhouseii_334153  0.5         28605
## 81   A_woodhouseii_334156  0.5         12201
## 82   A_woodhouseii_334161  0.5         31860
## 83   A_woodhouseii_334170  0.5          8960
## 84   A_woodhouseii_334172  0.5             6
## 85   A_woodhouseii_334188  0.5         25179
## 86   A_woodhouseii_334190  0.5         30720
## 87   A_woodhouseii_334196  0.5         31290
## 88   A_woodhouseii_334210  0.5         31332
## 89   A_woodhouseii_334211  0.5         31539
## 90   A_woodhouseii_334217  0.5         16480
## 91   A_woodhouseii_334230  0.5         29723
## 92   A_woodhouseii_334240  0.5           462
## 93   A_woodhouseii_334241  0.5         23167
## 94   A_woodhouseii_334242  0.5         22918
## 95   A_woodhouseii_334243  0.5         25791
## 96   A_woodhouseii_334244  0.5         31598
## 97   A_woodhouseii_334247  0.5         22130
## 98   A_woodhouseii_334250  0.5         31563
## 99   A_woodhouseii_343453  0.5          3095
## 100  A_woodhouseii_343458  0.5         20743
## 101  A_woodhouseii_343461  0.5         31729
## 102  A_woodhouseii_343476  0.5         30016
## 103  A_woodhouseii_343480  0.5         19252
## 104  A_woodhouseii_343481  0.5         23934
## 105  A_woodhouseii_343483  0.5         30877
## 106  A_woodhouseii_343497  0.5         25551
## 107  A_woodhouseii_393605  0.5         30729
## 108  A_woodhouseii_393606  0.5         29549
## 109  A_woodhouseii_393697  0.5         25261
## 110  A_woodhouseii_393698  0.5         15769
## 111  A_woodhouseii_393699  0.5          3460
## 112  A_woodhouseii_393702  0.5         26265
## 113  A_woodhouseii_393712  0.5         30783
## 114  A_woodhouseii_393713  0.5         18615
## 115  A_woodhouseii_395768  0.5         31768
## 116  A_californica_333849  0.6         25249
## 117  A_californica_333854  0.6         24683
## 118  A_californica_333855  0.6         23334
## 119  A_californica_333857  0.6         25295
## 120  A_californica_333860  0.6         25035
## 121  A_californica_333862  0.6         24441
## 122  A_californica_333868  0.6           823
## 123  A_californica_333871  0.6         18529
## 124  A_californica_333873  0.6         14344
## 125  A_californica_333874  0.6         23487
## 126  A_californica_333914  0.6         18093
## 127  A_californica_333917  0.6         25065
## 128  A_californica_333925  0.6             0
## 129  A_californica_333931  0.6         24214
## 130  A_californica_333932  0.6         25006
## 131  A_californica_333934  0.6         25201
## 132  A_californica_334002  0.6         25294
## 133  A_californica_334012  0.6         25214
## 134  A_californica_334015  0.6         25241
## 135  A_californica_334017  0.6         23837
## 136  A_californica_334019  0.6         20144
## 137  A_californica_334171  0.6         23773
## 138  A_californica_342037  0.6         22305
## 139  A_californica_342048  0.6         24927
## 140  A_californica_342050  0.6         25291
## 141  A_californica_342051  0.6           613
## 142  A_californica_342066  0.6         24876
## 143  A_californica_342072  0.6         17854
## 144  A_californica_343428  0.6         23810
## 145  A_californica_343430  0.6         22617
## 146  A_californica_343432  0.6         21547
## 147  A_californica_343438  0.6         18799
## 148  A_californica_343442  0.6          3653
## 149  A_californica_343451  0.6         19376
## 150  A_californica_393615  0.6         19514
## 151  A_californica_393616  0.6         10029
## 152  A_californica_393721  0.6         24173
## 153 A_coerulescens_396251  0.6         24396
## 154 A_coerulescens_396254  0.6         23429
## 155 A_coerulescens_396256  0.6         19617
## 156 A_coerulescens_396259  0.6          4613
## 157 A_coerulescens_396262  0.6             1
## 158 A_coerulescens_396263  0.6         16090
## 159 A_coerulescens_396264  0.6         24464
## 160 A_coerulescens_396265  0.6         24472
## 161    A_insularis_334025  0.6         24463
## 162    A_insularis_334029  0.6         24099
## 163    A_insularis_334031  0.6         24858
## 164    A_insularis_334032  0.6         25047
## 165    A_insularis_334033  0.6         24938
## 166    A_insularis_334034  0.6         24838
## 167    A_insularis_334037  0.6         24762
## 168    A_insularis_334038  0.6         18081
## 169  A_sumichrasti_343502  0.6         24088
## 170  A_sumichrasti_343503  0.6         25190
## 171  A_sumichrasti_343510  0.6         22100
## 172  A_sumichrasti_343511  0.6         19487
## 173  A_sumichrasti_343512  0.6         25269
## 174  A_sumichrasti_343513  0.6          8329
## 175  A_sumichrasti_343514  0.6         22864
## 176  A_sumichrasti_343515  0.6         23393
## 177  A_sumichrasti_393633  0.6          4005
## 178  A_sumichrasti_393635  0.6         23486
## 179  A_sumichrasti_393636  0.6         11928
## 180  A_sumichrasti_393637  0.6          2569
## 181  A_sumichrasti_393638  0.6         20072
## 182  A_sumichrasti_393639  0.6           920
## 183  A_sumichrasti_393640  0.6         24858
## 184  A_woodhouseii_334059  0.6            35
## 185  A_woodhouseii_334062  0.6         21905
## 186  A_woodhouseii_334063  0.6         23834
## 187  A_woodhouseii_334086  0.6         22258
## 188  A_woodhouseii_334088  0.6         23542
## 189  A_woodhouseii_334096  0.6         15798
## 190  A_woodhouseii_334132  0.6         25105
## 191  A_woodhouseii_334133  0.6         25126
## 192  A_woodhouseii_334134  0.6         15281
## 193  A_woodhouseii_334142  0.6         25136
## 194  A_woodhouseii_334148  0.6         11265
## 195  A_woodhouseii_334153  0.6         23982
## 196  A_woodhouseii_334156  0.6         11635
## 197  A_woodhouseii_334161  0.6         25318
## 198  A_woodhouseii_334170  0.6          8560
## 199  A_woodhouseii_334172  0.6             6
## 200  A_woodhouseii_334188  0.6         21992
## 201  A_woodhouseii_334190  0.6         24992
## 202  A_woodhouseii_334196  0.6         25085
## 203  A_woodhouseii_334210  0.6         25169
## 204  A_woodhouseii_334211  0.6         25276
## 205  A_woodhouseii_334217  0.6         15261
## 206  A_woodhouseii_334230  0.6         24446
## 207  A_woodhouseii_334240  0.6           439
## 208  A_woodhouseii_334241  0.6         20641
## 209  A_woodhouseii_334242  0.6         20380
## 210  A_woodhouseii_334243  0.6         22266
## 211  A_woodhouseii_334244  0.6         25175
## 212  A_woodhouseii_334247  0.6         19708
## 213  A_woodhouseii_334250  0.6         25220
## 214  A_woodhouseii_343453  0.6          2946
## 215  A_woodhouseii_343458  0.6         18717
## 216  A_woodhouseii_343461  0.6         25284
## 217  A_woodhouseii_343476  0.6         24735
## 218  A_woodhouseii_343480  0.6         17346
## 219  A_woodhouseii_343481  0.6         21240
## 220  A_woodhouseii_343483  0.6         25090
## 221  A_woodhouseii_343497  0.6         22217
## 222  A_woodhouseii_393605  0.6         25037
## 223  A_woodhouseii_393606  0.6         24459
## 224  A_woodhouseii_393697  0.6         22192
## 225  A_woodhouseii_393698  0.6         14608
## 226  A_woodhouseii_393699  0.6          3300
## 227  A_woodhouseii_393702  0.6         22930
## 228  A_woodhouseii_393712  0.6         25107
## 229  A_woodhouseii_393713  0.6         17000
## 230  A_woodhouseii_395768  0.6         25310
## 231  A_californica_333849  0.7         18129
## 232  A_californica_333854  0.7         18057
## 233  A_californica_333855  0.7         17609
## 234  A_californica_333857  0.7         18164
## 235  A_californica_333860  0.7         18102
## 236  A_californica_333862  0.7         17985
## 237  A_californica_333868  0.7           736
## 238  A_californica_333871  0.7         15282
## 239  A_californica_333873  0.7         12322
## 240  A_californica_333874  0.7         17817
## 241  A_californica_333914  0.7         14924
## 242  A_californica_333917  0.7         18122
## 243  A_californica_333925  0.7             0
## 244  A_californica_333931  0.7         17862
## 245  A_californica_333932  0.7         18078
## 246  A_californica_333934  0.7         18156
## 247  A_californica_334002  0.7         18172
## 248  A_californica_334012  0.7         18101
## 249  A_californica_334015  0.7         18160
## 250  A_californica_334017  0.7         17787
## 251  A_californica_334019  0.7         16183
## 252  A_californica_334171  0.7         17499
## 253  A_californica_342037  0.7         17333
## 254  A_californica_342048  0.7         18116
## 255  A_californica_342050  0.7         18082
## 256  A_californica_342051  0.7           559
## 257  A_californica_342066  0.7         17911
## 258  A_californica_342072  0.7         14667
## 259  A_californica_343428  0.7         17888
## 260  A_californica_343430  0.7         17399
## 261  A_californica_343432  0.7         16812
## 262  A_californica_343438  0.7         15338
## 263  A_californica_343442  0.7          3347
## 264  A_californica_343451  0.7         15760
## 265  A_californica_393615  0.7         15892
## 266  A_californica_393616  0.7          8819
## 267  A_californica_393721  0.7         17973
## 268 A_coerulescens_396251  0.7         17620
## 269 A_coerulescens_396254  0.7         17326
## 270 A_coerulescens_396256  0.7         15688
## 271 A_coerulescens_396259  0.7          4179
## 272 A_coerulescens_396262  0.7             1
## 273 A_coerulescens_396263  0.7         13577
## 274 A_coerulescens_396264  0.7         17625
## 275 A_coerulescens_396265  0.7         17574
## 276    A_insularis_334025  0.7         17875
## 277    A_insularis_334029  0.7         17797
## 278    A_insularis_334031  0.7         17931
## 279    A_insularis_334032  0.7         18078
## 280    A_insularis_334033  0.7         17973
## 281    A_insularis_334034  0.7         17943
## 282    A_insularis_334037  0.7         17951
## 283    A_insularis_334038  0.7         14917
## 284  A_sumichrasti_343502  0.7         17787
## 285  A_sumichrasti_343503  0.7         18080
## 286  A_sumichrasti_343510  0.7         17145
## 287  A_sumichrasti_343511  0.7         15822
## 288  A_sumichrasti_343512  0.7         18171
## 289  A_sumichrasti_343513  0.7          7328
## 290  A_sumichrasti_343514  0.7         17443
## 291  A_sumichrasti_343515  0.7         17666
## 292  A_sumichrasti_393633  0.7          3639
## 293  A_sumichrasti_393635  0.7         17710
## 294  A_sumichrasti_393636  0.7         10447
## 295  A_sumichrasti_393637  0.7          2359
## 296  A_sumichrasti_393638  0.7         16265
## 297  A_sumichrasti_393639  0.7           836
## 298  A_sumichrasti_393640  0.7         18001
## 299  A_woodhouseii_334059  0.7            34
## 300  A_woodhouseii_334062  0.7         17119
## 301  A_woodhouseii_334063  0.7         17818
## 302  A_woodhouseii_334086  0.7         17095
## 303  A_woodhouseii_334088  0.7         17651
## 304  A_woodhouseii_334096  0.7         13463
## 305  A_woodhouseii_334132  0.7         18104
## 306  A_woodhouseii_334133  0.7         18138
## 307  A_woodhouseii_334134  0.7         12882
## 308  A_woodhouseii_334142  0.7         18110
## 309  A_woodhouseii_334148  0.7          9922
## 310  A_woodhouseii_334153  0.7         17854
## 311  A_woodhouseii_334156  0.7         10418
## 312  A_woodhouseii_334161  0.7         18151
## 313  A_woodhouseii_334170  0.7          7654
## 314  A_woodhouseii_334172  0.7             6
## 315  A_woodhouseii_334188  0.7         17099
## 316  A_woodhouseii_334190  0.7         18128
## 317  A_woodhouseii_334196  0.7         18036
## 318  A_woodhouseii_334210  0.7         18155
## 319  A_woodhouseii_334211  0.7         18160
## 320  A_woodhouseii_334217  0.7         13101
## 321  A_woodhouseii_334230  0.7         17943
## 322  A_woodhouseii_334240  0.7           398
## 323  A_woodhouseii_334241  0.7         16451
## 324  A_woodhouseii_334242  0.7         16400
## 325  A_woodhouseii_334243  0.7         17128
## 326  A_woodhouseii_334244  0.7         18070
## 327  A_woodhouseii_334247  0.7         15955
## 328  A_woodhouseii_334250  0.7         18102
## 329  A_woodhouseii_343453  0.7          2697
## 330  A_woodhouseii_343458  0.7         15359
## 331  A_woodhouseii_343461  0.7         18142
## 332  A_woodhouseii_343476  0.7         18023
## 333  A_woodhouseii_343480  0.7         14229
## 334  A_woodhouseii_343481  0.7         16811
## 335  A_woodhouseii_343483  0.7         18175
## 336  A_woodhouseii_343497  0.7         17159
## 337  A_woodhouseii_393605  0.7         18114
## 338  A_woodhouseii_393606  0.7         17962
## 339  A_woodhouseii_393697  0.7         17297
## 340  A_woodhouseii_393698  0.7         12619
## 341  A_woodhouseii_393699  0.7          3003
## 342  A_woodhouseii_393702  0.7         17593
## 343  A_woodhouseii_393712  0.7         18154
## 344  A_woodhouseii_393713  0.7         14143
## 345  A_woodhouseii_395768  0.7         18147
## 346  A_californica_333849  0.8          9987
## 347  A_californica_333854  0.8          9998
## 348  A_californica_333855  0.8          9929
## 349  A_californica_333857  0.8         10022
## 350  A_californica_333860  0.8         10008
## 351  A_californica_333862  0.8          9984
## 352  A_californica_333868  0.8           600
## 353  A_californica_333871  0.8          9565
## 354  A_californica_333873  0.8          8334
## 355  A_californica_333874  0.8         10010
## 356  A_californica_333914  0.8          9112
## 357  A_californica_333917  0.8         10012
## 358  A_californica_333925  0.8             0
## 359  A_californica_333931  0.8          9979
## 360  A_californica_333932  0.8          9969
## 361  A_californica_333934  0.8         10018
## 362  A_californica_334002  0.8         10012
## 363  A_californica_334012  0.8          9962
## 364  A_californica_334015  0.8         10009
## 365  A_californica_334017  0.8          9968
## 366  A_californica_334019  0.8          9584
## 367  A_californica_334171  0.8          9854
## 368  A_californica_342037  0.8          9911
## 369  A_californica_342048  0.8         10022
## 370  A_californica_342050  0.8          9964
## 371  A_californica_342051  0.8           446
## 372  A_californica_342066  0.8          9931
## 373  A_californica_342072  0.8          9123
## 374  A_californica_343428  0.8         10001
## 375  A_californica_343430  0.8          9912
## 376  A_californica_343432  0.8          9741
## 377  A_californica_343438  0.8          9337
## 378  A_californica_343442  0.8          2685
## 379  A_californica_343451  0.8          9574
## 380  A_californica_393615  0.8          9612
## 381  A_californica_393616  0.8          6174
## 382  A_californica_393721  0.8         10000
## 383 A_coerulescens_396251  0.8          9859
## 384 A_coerulescens_396254  0.8          9788
## 385 A_coerulescens_396256  0.8          9439
## 386 A_coerulescens_396259  0.8          3262
## 387 A_coerulescens_396262  0.8             0
## 388 A_coerulescens_396263  0.8          8838
## 389 A_coerulescens_396264  0.8          9854
## 390 A_coerulescens_396265  0.8          9816
## 391    A_insularis_334025  0.8          9993
## 392    A_insularis_334029  0.8          9959
## 393    A_insularis_334031  0.8          9986
## 394    A_insularis_334032  0.8         10035
## 395    A_insularis_334033  0.8          9998
## 396    A_insularis_334034  0.8          9989
## 397    A_insularis_334037  0.8          9988
## 398    A_insularis_334038  0.8          9326
## 399  A_sumichrasti_343502  0.8          9932
## 400  A_sumichrasti_343503  0.8          9998
## 401  A_sumichrasti_343510  0.8          9911
## 402  A_sumichrasti_343511  0.8          9690
## 403  A_sumichrasti_343512  0.8         10023
## 404  A_sumichrasti_343513  0.8          5209
## 405  A_sumichrasti_343514  0.8          9909
## 406  A_sumichrasti_343515  0.8          9970
## 407  A_sumichrasti_393633  0.8          2725
## 408  A_sumichrasti_393635  0.8          9978
## 409  A_sumichrasti_393636  0.8          7253
## 410  A_sumichrasti_393637  0.8          1926
## 411  A_sumichrasti_393638  0.8          9701
## 412  A_sumichrasti_393639  0.8           695
## 413  A_sumichrasti_393640  0.8          9981
## 414  A_woodhouseii_334059  0.8            23
## 415  A_woodhouseii_334062  0.8          9787
## 416  A_woodhouseii_334063  0.8          9972
## 417  A_woodhouseii_334086  0.8          9847
## 418  A_woodhouseii_334088  0.8          9924
## 419  A_woodhouseii_334096  0.8          8853
## 420  A_woodhouseii_334132  0.8         10012
## 421  A_woodhouseii_334133  0.8         10012
## 422  A_woodhouseii_334134  0.8          8322
## 423  A_woodhouseii_334142  0.8         10022
## 424  A_woodhouseii_334148  0.8          7026
## 425  A_woodhouseii_334153  0.8          9975
## 426  A_woodhouseii_334156  0.8          7413
## 427  A_woodhouseii_334161  0.8         10013
## 428  A_woodhouseii_334170  0.8          5818
## 429  A_woodhouseii_334172  0.8             6
## 430  A_woodhouseii_334188  0.8          9883
## 431  A_woodhouseii_334190  0.8         10035
## 432  A_woodhouseii_334196  0.8          9963
## 433  A_woodhouseii_334210  0.8         10033
## 434  A_woodhouseii_334211  0.8         10035
## 435  A_woodhouseii_334217  0.8          8699
## 436  A_woodhouseii_334230  0.8          9962
## 437  A_woodhouseii_334240  0.8           319
## 438  A_woodhouseii_334241  0.8          9697
## 439  A_woodhouseii_334242  0.8          9727
## 440  A_woodhouseii_334243  0.8          9831
## 441  A_woodhouseii_334244  0.8          9964
## 442  A_woodhouseii_334247  0.8          9606
## 443  A_woodhouseii_334250  0.8          9988
## 444  A_woodhouseii_343453  0.8          2109
## 445  A_woodhouseii_343458  0.8          9474
## 446  A_woodhouseii_343461  0.8         10028
## 447  A_woodhouseii_343476  0.8          9972
## 448  A_woodhouseii_343480  0.8          9119
## 449  A_woodhouseii_343481  0.8          9865
## 450  A_woodhouseii_343483  0.8         10041
## 451  A_woodhouseii_343497  0.8          9866
## 452  A_woodhouseii_393605  0.8         10017
## 453  A_woodhouseii_393606  0.8          9990
## 454  A_woodhouseii_393697  0.8          9976
## 455  A_woodhouseii_393698  0.8          8532
## 456  A_woodhouseii_393699  0.8          2424
## 457  A_woodhouseii_393702  0.8          9995
## 458  A_woodhouseii_393712  0.8         10025
## 459  A_woodhouseii_393713  0.8          9086
## 460  A_woodhouseii_395768  0.8         10009
## 461  A_californica_333849  0.9           503
## 462  A_californica_333854  0.9           504
## 463  A_californica_333855  0.9           502
## 464  A_californica_333857  0.9           504
## 465  A_californica_333860  0.9           501
## 466  A_californica_333862  0.9           504
## 467  A_californica_333868  0.9           121
## 468  A_californica_333871  0.9           503
## 469  A_californica_333873  0.9           504
## 470  A_californica_333874  0.9           504
## 471  A_californica_333914  0.9           500
## 472  A_californica_333917  0.9           504
## 473  A_californica_333925  0.9             0
## 474  A_californica_333931  0.9           504
## 475  A_californica_333932  0.9           503
## 476  A_californica_333934  0.9           504
## 477  A_californica_334002  0.9           504
## 478  A_californica_334012  0.9           503
## 479  A_californica_334015  0.9           504
## 480  A_californica_334017  0.9           504
## 481  A_californica_334019  0.9           502
## 482  A_californica_334171  0.9           501
## 483  A_californica_342037  0.9           504
## 484  A_californica_342048  0.9           501
## 485  A_californica_342050  0.9           503
## 486  A_californica_342051  0.9            53
## 487  A_californica_342066  0.9           502
## 488  A_californica_342072  0.9           492
## 489  A_californica_343428  0.9           504
## 490  A_californica_343430  0.9           503
## 491  A_californica_343432  0.9           495
## 492  A_californica_343438  0.9           494
## 493  A_californica_343442  0.9           309
## 494  A_californica_343451  0.9           501
## 495  A_californica_393615  0.9           504
## 496  A_californica_393616  0.9           464
## 497  A_californica_393721  0.9           504
## 498 A_coerulescens_396251  0.9           504
## 499 A_coerulescens_396254  0.9           504
## 500 A_coerulescens_396256  0.9           502
## 501 A_coerulescens_396259  0.9           345
## 502 A_coerulescens_396262  0.9             0
## 503 A_coerulescens_396263  0.9           499
## 504 A_coerulescens_396264  0.9           504
## 505 A_coerulescens_396265  0.9           504
## 506    A_insularis_334025  0.9           504
## 507    A_insularis_334029  0.9           504
## 508    A_insularis_334031  0.9           504
## 509    A_insularis_334032  0.9           504
## 510    A_insularis_334033  0.9           504
## 511    A_insularis_334034  0.9           504
## 512    A_insularis_334037  0.9           504
## 513    A_insularis_334038  0.9           504
## 514  A_sumichrasti_343502  0.9           503
## 515  A_sumichrasti_343503  0.9           503
## 516  A_sumichrasti_343510  0.9           504
## 517  A_sumichrasti_343511  0.9           504
## 518  A_sumichrasti_343512  0.9           504
## 519  A_sumichrasti_343513  0.9           390
## 520  A_sumichrasti_343514  0.9           500
## 521  A_sumichrasti_343515  0.9           503
## 522  A_sumichrasti_393633  0.9           339
## 523  A_sumichrasti_393635  0.9           503
## 524  A_sumichrasti_393636  0.9           461
## 525  A_sumichrasti_393637  0.9           281
## 526  A_sumichrasti_393638  0.9           500
## 527  A_sumichrasti_393639  0.9           159
## 528  A_sumichrasti_393640  0.9           504
## 529  A_woodhouseii_334059  0.9             4
## 530  A_woodhouseii_334062  0.9           504
## 531  A_woodhouseii_334063  0.9           504
## 532  A_woodhouseii_334086  0.9           504
## 533  A_woodhouseii_334088  0.9           504
## 534  A_woodhouseii_334096  0.9           503
## 535  A_woodhouseii_334132  0.9           504
## 536  A_woodhouseii_334133  0.9           504
## 537  A_woodhouseii_334134  0.9           488
## 538  A_woodhouseii_334142  0.9           504
## 539  A_woodhouseii_334148  0.9           473
## 540  A_woodhouseii_334153  0.9           503
## 541  A_woodhouseii_334156  0.9           494
## 542  A_woodhouseii_334161  0.9           504
## 543  A_woodhouseii_334170  0.9           469
## 544  A_woodhouseii_334172  0.9             0
## 545  A_woodhouseii_334188  0.9           502
## 546  A_woodhouseii_334190  0.9           504
## 547  A_woodhouseii_334196  0.9           502
## 548  A_woodhouseii_334210  0.9           504
## 549  A_woodhouseii_334211  0.9           504
## 550  A_woodhouseii_334217  0.9           499
## 551  A_woodhouseii_334230  0.9           504
## 552  A_woodhouseii_334240  0.9            59
## 553  A_woodhouseii_334241  0.9           503
## 554  A_woodhouseii_334242  0.9           504
## 555  A_woodhouseii_334243  0.9           504
## 556  A_woodhouseii_334244  0.9           504
## 557  A_woodhouseii_334247  0.9           500
## 558  A_woodhouseii_334250  0.9           504
## 559  A_woodhouseii_343453  0.9           293
## 560  A_woodhouseii_343458  0.9           501
## 561  A_woodhouseii_343461  0.9           504
## 562  A_woodhouseii_343476  0.9           504
## 563  A_woodhouseii_343480  0.9           495
## 564  A_woodhouseii_343481  0.9           504
## 565  A_woodhouseii_343483  0.9           504
## 566  A_woodhouseii_343497  0.9           502
## 567  A_woodhouseii_393605  0.9           504
## 568  A_woodhouseii_393606  0.9           504
## 569  A_woodhouseii_393697  0.9           504
## 570  A_woodhouseii_393698  0.9           493
## 571  A_woodhouseii_393699  0.9           370
## 572  A_woodhouseii_393702  0.9           504
## 573  A_woodhouseii_393712  0.9           504
## 574  A_woodhouseii_393713  0.9           494
## 575  A_woodhouseii_395768  0.9           504
## 576  A_californica_333849  1.0             0
## 577  A_californica_333854  1.0             0
## 578  A_californica_333855  1.0             0
## 579  A_californica_333857  1.0             0
## 580  A_californica_333860  1.0             0
## 581  A_californica_333862  1.0             0
## 582  A_californica_333868  1.0             0
## 583  A_californica_333871  1.0             0
## 584  A_californica_333873  1.0             0
## 585  A_californica_333874  1.0             0
## 586  A_californica_333914  1.0             0
## 587  A_californica_333917  1.0             0
## 588  A_californica_333925  1.0             0
## 589  A_californica_333931  1.0             0
## 590  A_californica_333932  1.0             0
## 591  A_californica_333934  1.0             0
## 592  A_californica_334002  1.0             0
## 593  A_californica_334012  1.0             0
## 594  A_californica_334015  1.0             0
## 595  A_californica_334017  1.0             0
## 596  A_californica_334019  1.0             0
## 597  A_californica_334171  1.0             0
## 598  A_californica_342037  1.0             0
## 599  A_californica_342048  1.0             0
## 600  A_californica_342050  1.0             0
## 601  A_californica_342051  1.0             0
## 602  A_californica_342066  1.0             0
## 603  A_californica_342072  1.0             0
## 604  A_californica_343428  1.0             0
## 605  A_californica_343430  1.0             0
## 606  A_californica_343432  1.0             0
## 607  A_californica_343438  1.0             0
## 608  A_californica_343442  1.0             0
## 609  A_californica_343451  1.0             0
## 610  A_californica_393615  1.0             0
## 611  A_californica_393616  1.0             0
## 612  A_californica_393721  1.0             0
## 613 A_coerulescens_396251  1.0             0
## 614 A_coerulescens_396254  1.0             0
## 615 A_coerulescens_396256  1.0             0
## 616 A_coerulescens_396259  1.0             0
## 617 A_coerulescens_396262  1.0             0
## 618 A_coerulescens_396263  1.0             0
## 619 A_coerulescens_396264  1.0             0
## 620 A_coerulescens_396265  1.0             0
## 621    A_insularis_334025  1.0             0
## 622    A_insularis_334029  1.0             0
## 623    A_insularis_334031  1.0             0
## 624    A_insularis_334032  1.0             0
## 625    A_insularis_334033  1.0             0
## 626    A_insularis_334034  1.0             0
## 627    A_insularis_334037  1.0             0
## 628    A_insularis_334038  1.0             0
## 629  A_sumichrasti_343502  1.0             0
## 630  A_sumichrasti_343503  1.0             0
## 631  A_sumichrasti_343510  1.0             0
## 632  A_sumichrasti_343511  1.0             0
## 633  A_sumichrasti_343512  1.0             0
## 634  A_sumichrasti_343513  1.0             0
## 635  A_sumichrasti_343514  1.0             0
## 636  A_sumichrasti_343515  1.0             0
## 637  A_sumichrasti_393633  1.0             0
## 638  A_sumichrasti_393635  1.0             0
## 639  A_sumichrasti_393636  1.0             0
## 640  A_sumichrasti_393637  1.0             0
## 641  A_sumichrasti_393638  1.0             0
## 642  A_sumichrasti_393639  1.0             0
## 643  A_sumichrasti_393640  1.0             0
## 644  A_woodhouseii_334059  1.0             0
## 645  A_woodhouseii_334062  1.0             0
## 646  A_woodhouseii_334063  1.0             0
## 647  A_woodhouseii_334086  1.0             0
## 648  A_woodhouseii_334088  1.0             0
## 649  A_woodhouseii_334096  1.0             0
## 650  A_woodhouseii_334132  1.0             0
## 651  A_woodhouseii_334133  1.0             0
## 652  A_woodhouseii_334134  1.0             0
## 653  A_woodhouseii_334142  1.0             0
## 654  A_woodhouseii_334148  1.0             0
## 655  A_woodhouseii_334153  1.0             0
## 656  A_woodhouseii_334156  1.0             0
## 657  A_woodhouseii_334161  1.0             0
## 658  A_woodhouseii_334170  1.0             0
## 659  A_woodhouseii_334172  1.0             0
## 660  A_woodhouseii_334188  1.0             0
## 661  A_woodhouseii_334190  1.0             0
## 662  A_woodhouseii_334196  1.0             0
## 663  A_woodhouseii_334210  1.0             0
## 664  A_woodhouseii_334211  1.0             0
## 665  A_woodhouseii_334217  1.0             0
## 666  A_woodhouseii_334230  1.0             0
## 667  A_woodhouseii_334240  1.0             0
## 668  A_woodhouseii_334241  1.0             0
## 669  A_woodhouseii_334242  1.0             0
## 670  A_woodhouseii_334243  1.0             0
## 671  A_woodhouseii_334244  1.0             0
## 672  A_woodhouseii_334247  1.0             0
## 673  A_woodhouseii_334250  1.0             0
## 674  A_woodhouseii_343453  1.0             0
## 675  A_woodhouseii_343458  1.0             0
## 676  A_woodhouseii_343461  1.0             0
## 677  A_woodhouseii_343476  1.0             0
## 678  A_woodhouseii_343480  1.0             0
## 679  A_woodhouseii_343481  1.0             0
## 680  A_woodhouseii_343483  1.0             0
## 681  A_woodhouseii_343497  1.0             0
## 682  A_woodhouseii_393605  1.0             0
## 683  A_woodhouseii_393606  1.0             0
## 684  A_woodhouseii_393697  1.0             0
## 685  A_woodhouseii_393698  1.0             0
## 686  A_woodhouseii_393699  1.0             0
## 687  A_woodhouseii_393702  1.0             0
## 688  A_woodhouseii_393712  1.0             0
## 689  A_woodhouseii_393713  1.0             0
## 690  A_woodhouseii_395768  1.0             0
#run function to drop samples above the threshold we want from the vcf
vcfR<-missing.by.sample(vcfR=vcfR, cutoff = .91)

## [1] "20 samples are above a 0.91 missing data cutoff, and were removed from VCF"
#subset popmap to only include retained individuals
popmap<-popmap[popmap$id %in% colnames(vcfR@gt),]

#alternatively, you can drop individuals from vcfR manually using the following syntax, if a strict cutoff doesn't work for your dataset
#vcfR@gt <- vcfR@gt[,colnames(vcfR@gt) != "E_trichroa_4702" & colnames(vcfR@gt) != "E_trichroa_27882"]

Step 3: Set the arbitrary missing data cutoff

We can visualize the effect that typical missing data cutoffs will have on both the number of SNPs retained and the total missing data in our entire dataset. We want to choose a cutoff that minimizes the overall missing data in the dataset, while maximizing the total number of loci retained. Note: This will also depend on the samples that we decided to include above. A good rule of thumb is that samples shouldn't be above 50% missing data after applying this cutoff. So if we are retaining low data samples out of necessity or project design, we may have to set a more stringent cutoff at the expense of total SNPs retained for downstream analyses.

#visualize missing data by SNP and the effect of various cutoffs on the missingness of each sample
missing.by.snp(vcfR)
## Picking joint bandwidth of 0.0654

##    filt missingness snps.retained
## 1  0.30  0.32226058         54233
## 2  0.50  0.20868664         38731
## 3  0.60  0.16241954         32583
## 4  0.65  0.13794427         29282
## 5  0.70  0.11634976         26265
## 6  0.75  0.09487303         23110
## 7  0.80  0.07845268         20534
## 8  0.85  0.05813809         17086
## 9  0.90  0.03908239         13323
## 10 0.95  0.01882440          8286
## 11 1.00  0.00000000          1809
#choose a value that retains an acceptable amount of missing data in each sample, and maximizes SNPs retained while minimizing overall missing data, and filter vcf
vcfR<-missing.by.snp(vcfR, cutoff = .85)

## [1] "90.68% of SNPs fell below a completeness cutoff of 0.85 and were removed from the VCF"

Step 4: investigate the effect of a minor allele frequency cutoff on our downstream inferences.

MAF cutoffs can be helpful in removing spurious and uninformative loci from the dataset, but also have the potential to bias downstream inferences. Linck and Battey (2019) have an excellent paper on just this topic. From the paper-

"We recommend researchers using model‐based programs to describe population structure observe the following best practices: (a) duplicate analyses with nonparametric methods suchas PCA and DAPC with cross validation (b) exclude singletons (c) compare alignments with multiple assembly parameters When seeking to exclude only singletons in alignments with missing data (a ubiquitous problem for reduced‐representation library preparation methods), it is preferable to filter by the count (rather than frequency) of the minor allele, because variation in the amount of missing data across an alignment will cause a static frequency cutoff to remove different SFS classes at different sites""

Our package contains a convenient wrapper functions that streamline the investigation of different MAF cutoffs. Warning: this is a heavy function if it is run without the 'min.mac' option. It is converting the entire vcf and running 6 unique dapc iterations, which will take time for large datasets. If you set 'min.mac' it will just filter your dataset, and should run quickly.

#use min.mac() to investigate the effect of multiple cutoffs
min_mac(vcfR = vcfR, popmap = popmap)

## [1] "for 1 minimum MAC cutoff, compare k means clustering to popmap assignment"
##            
##              1  2  3  4  5
##   californi 31  0  0  1  0
##   coerulesc  0  0  6  0  0
##   insularis  0  0  0  0  8
##   sumichras  0 10  0  0  0
##   woodhouse  0  0  0 39  0

## [1] "DAPC with min. MAC 1 and 16307 total SNPs, complete"
## [1] "for 2 minimum MAC cutoff, compare k means clustering to popmap assignment"
##            
##              1  2  3  4  5
##   californi  1  0  0  0 31
##   coerulesc  0  0  0  6  0
##   insularis  0  8  0  0  0
##   sumichras  0  0 10  0  0
##   woodhouse 39  0  0  0  0

## [1] "DAPC with min. MAC 2 and 11273 total SNPs, complete"
## [1] "for 3 minimum MAC cutoff, compare k means clustering to popmap assignment"
##            
##              1  2  3  4  5
##   californi  0  1  0  0 31
##   coerulesc  6  0  0  0  0
##   insularis  0  0  8  0  0
##   sumichras  0  0  0 10  0
##   woodhouse  0 39  0  0  0

## [1] "DAPC with min. MAC 3 and 8582 total SNPs, complete"
## [1] "for 4 minimum MAC cutoff, compare k means clustering to popmap assignment"
##            
##              1  2  3  4  5
##   californi  0  0  0 31  1
##   coerulesc  0  6  0  0  0
##   insularis  8  0  0  0  0
##   sumichras  0  0 10  0  0
##   woodhouse  0  0  0  0 39

## [1] "DAPC with min. MAC 4 and 7107 total SNPs, complete"
## [1] "for 5 minimum MAC cutoff, compare k means clustering to popmap assignment"
##            
##              1  2  3  4  5
##   californi  0 31  0  1  0
##   coerulesc  0  0  6  0  0
##   insularis  8  0  0  0  0
##   sumichras  0  0  0  0 10
##   woodhouse  0  0  0 39  0

## [1] "DAPC with min. MAC 5 and 6064 total SNPs, complete"
## [1] "for 10 minimum MAC cutoff, compare k means clustering to popmap assignment"
##            
##              1  2  3  4  5
##   californi 31  1  0  0  0
##   coerulesc  0  0  0  0  6
##   insularis  0  0  0  8  0
##   sumichras  0  0 10  0  0
##   woodhouse  0 39  0  0  0

## [1] "DAPC with min. MAC 10 and 3574 total SNPs, complete"
#based on these visualizations, I will remove singletons from the dataset, as it doesn't affect group inference
vcfR<-min_mac(vcfR, min.mac = 1)

## [1] "4.56% of SNPs fell below a minor allele count of 1 and were removed from the VCF"

check once more to see how many SNPs and individuals remain compared to our original, unfiltered vcf

vcfR
## ***** Object of Class vcfR *****
## 95 samples
## 36 CHROMs
## 16,307 variants
## Object size: 111.2 Mb
## 5.743 percent missing data
## *****        *****         *****
#plot depth per snp and per sample
dp <- extract.gt(vcfR, element = "DP", as.numeric=TRUE)
heatmap.bp(dp, rlabels = FALSE)

#plot genotype quality per snp and per sample
gq <- extract.gt(vcfR, element = "GQ", as.numeric=TRUE)
heatmap.bp(gq, rlabels = FALSE)

We can use the convenient function 'write.vcf' from vcfR to export our filtered vcf file for downstream analyses

#fix mislabeled sample
colnames(vcfR@gt)[colnames(vcfR@gt) == "A_californica_334171"]<-"A_woodhouseii_334171"
write.vcf(vcfR, "~/Desktop/aph.data/filtered.vcf")

If you need physically unlinked loci (which is a requirement of some programs, e.g. structure) this filtering step should always be done last, because it is not quality aware. Introducing a quality-blind linkage filter before doing the quality based filtering shown here would potentially remove quality SNPs while leaving us with only the low quality SNPs in a locus or genomic region. If filtering for linkage is needed, it can be done on our output vcf file with a simple one-liner via vcftools (set thin to whatever bp distance you assume linakge decays at in your study organism) vcftools --vcf vcf.vcf --thin 10000 --recode --out vcf