library(SNPfiltR)
## This is SNPfiltR v.1.0.0
##
## Detailed usage information is available at: devonderaad.github.io/SNPfiltR/
##
## If you use SNPfiltR in your published work, please cite the following papers:
##
## DeRaad, D.A. (2022), SNPfiltR: an R package for interactive and reproducible SNP filtering. Molecular Ecology Resources, 00, 1-15. http://doi.org/10.1111/1755-0998.13618
##
## Knaus, Brian J., and Niklaus J. Grunwald. 2017. VCFR: a package to manipulate and visualize variant call format data in R. Molecular Ecology Resources, 17.1:44-53. http://doi.org/10.1111/1755-0998.12549
library(vcfR)
##
## ***** *** vcfR *** *****
## This is vcfR 1.12.0
## browseVignettes('vcfR') # Documentation
## citation('vcfR') # Citation
## ***** ***** ***** *****
#read in vcf as vcfR
v<- read.vcfR("~/Desktop/phil.dicaeum/n8.vcf")
## Scanning file to determine attributes.
## File attributes:
## meta lines: 14
## header_line: 15
## variant count: 250231
## column count: 73
##
Meta line 14 read in.
## All meta lines processed.
## gt matrix initialized.
## Character matrix gt created.
## Character matrix gt rows: 250231
## Character matrix gt cols: 73
## skip: 0
## nrows: 250231
## row_num: 0
##
Processed variant 1000
Processed variant 2000
Processed variant 3000
Processed variant 4000
Processed variant 5000
Processed variant 6000
Processed variant 7000
Processed variant 8000
Processed variant 9000
Processed variant 10000
Processed variant 11000
Processed variant 12000
Processed variant 13000
Processed variant 14000
Processed variant 15000
Processed variant 16000
Processed variant 17000
Processed variant 18000
Processed variant 19000
Processed variant 20000
Processed variant 21000
Processed variant 22000
Processed variant 23000
Processed variant 24000
Processed variant 25000
Processed variant 26000
Processed variant 27000
Processed variant 28000
Processed variant 29000
Processed variant 30000
Processed variant 31000
Processed variant 32000
Processed variant 33000
Processed variant 34000
Processed variant 35000
Processed variant 36000
Processed variant 37000
Processed variant 38000
Processed variant 39000
Processed variant 40000
Processed variant 41000
Processed variant 42000
Processed variant 43000
Processed variant 44000
Processed variant 45000
Processed variant 46000
Processed variant 47000
Processed variant 48000
Processed variant 49000
Processed variant 50000
Processed variant 51000
Processed variant 52000
Processed variant 53000
Processed variant 54000
Processed variant 55000
Processed variant 56000
Processed variant 57000
Processed variant 58000
Processed variant 59000
Processed variant 60000
Processed variant 61000
Processed variant 62000
Processed variant 63000
Processed variant 64000
Processed variant 65000
Processed variant 66000
Processed variant 67000
Processed variant 68000
Processed variant 69000
Processed variant 70000
Processed variant 71000
Processed variant 72000
Processed variant 73000
Processed variant 74000
Processed variant 75000
Processed variant 76000
Processed variant 77000
Processed variant 78000
Processed variant 79000
Processed variant 80000
Processed variant 81000
Processed variant 82000
Processed variant 83000
Processed variant 84000
Processed variant 85000
Processed variant 86000
Processed variant 87000
Processed variant 88000
Processed variant 89000
Processed variant 90000
Processed variant 91000
Processed variant 92000
Processed variant 93000
Processed variant 94000
Processed variant 95000
Processed variant 96000
Processed variant 97000
Processed variant 98000
Processed variant 99000
Processed variant 100000
Processed variant 101000
Processed variant 102000
Processed variant 103000
Processed variant 104000
Processed variant 105000
Processed variant 106000
Processed variant 107000
Processed variant 108000
Processed variant 109000
Processed variant 110000
Processed variant 111000
Processed variant 112000
Processed variant 113000
Processed variant 114000
Processed variant 115000
Processed variant 116000
Processed variant 117000
Processed variant 118000
Processed variant 119000
Processed variant 120000
Processed variant 121000
Processed variant 122000
Processed variant 123000
Processed variant 124000
Processed variant 125000
Processed variant 126000
Processed variant 127000
Processed variant 128000
Processed variant 129000
Processed variant 130000
Processed variant 131000
Processed variant 132000
Processed variant 133000
Processed variant 134000
Processed variant 135000
Processed variant 136000
Processed variant 137000
Processed variant 138000
Processed variant 139000
Processed variant 140000
Processed variant 141000
Processed variant 142000
Processed variant 143000
Processed variant 144000
Processed variant 145000
Processed variant 146000
Processed variant 147000
Processed variant 148000
Processed variant 149000
Processed variant 150000
Processed variant 151000
Processed variant 152000
Processed variant 153000
Processed variant 154000
Processed variant 155000
Processed variant 156000
Processed variant 157000
Processed variant 158000
Processed variant 159000
Processed variant 160000
Processed variant 161000
Processed variant 162000
Processed variant 163000
Processed variant 164000
Processed variant 165000
Processed variant 166000
Processed variant 167000
Processed variant 168000
Processed variant 169000
Processed variant 170000
Processed variant 171000
Processed variant 172000
Processed variant 173000
Processed variant 174000
Processed variant 175000
Processed variant 176000
Processed variant 177000
Processed variant 178000
Processed variant 179000
Processed variant 180000
Processed variant 181000
Processed variant 182000
Processed variant 183000
Processed variant 184000
Processed variant 185000
Processed variant 186000
Processed variant 187000
Processed variant 188000
Processed variant 189000
Processed variant 190000
Processed variant 191000
Processed variant 192000
Processed variant 193000
Processed variant 194000
Processed variant 195000
Processed variant 196000
Processed variant 197000
Processed variant 198000
Processed variant 199000
Processed variant 200000
Processed variant 201000
Processed variant 202000
Processed variant 203000
Processed variant 204000
Processed variant 205000
Processed variant 206000
Processed variant 207000
Processed variant 208000
Processed variant 209000
Processed variant 210000
Processed variant 211000
Processed variant 212000
Processed variant 213000
Processed variant 214000
Processed variant 215000
Processed variant 216000
Processed variant 217000
Processed variant 218000
Processed variant 219000
Processed variant 220000
Processed variant 221000
Processed variant 222000
Processed variant 223000
Processed variant 224000
Processed variant 225000
Processed variant 226000
Processed variant 227000
Processed variant 228000
Processed variant 229000
Processed variant 230000
Processed variant 231000
Processed variant 232000
Processed variant 233000
Processed variant 234000
Processed variant 235000
Processed variant 236000
Processed variant 237000
Processed variant 238000
Processed variant 239000
Processed variant 240000
Processed variant 241000
Processed variant 242000
Processed variant 243000
Processed variant 244000
Processed variant 245000
Processed variant 246000
Processed variant 247000
Processed variant 248000
Processed variant 249000
Processed variant 250000
Processed variant: 250231
## All variants processed
#make popmap
pops<-read.csv("~/Desktop/phil.dicaeum/dicaeum.sampling.csv")
pops<-pops[pops$ID %in% colnames(v@gt),]
popmap<-pops[,c(1,4)]
#remove obviously contaminated sample from looking at the preliminary splitstree
v<-v[,colnames(v@gt)!= "D_hypoleucum_28663"]
#fix popmap
pops<-pops[pops$ID %in% colnames(v@gt),]
popmap<-pops[,c(1,4)]
#visualize distributions
hard_filter(v)
## no depth cutoff provided, exploratory visualization will be generated.

## no genotype quality cutoff provided, exploratory visualization will be generated.


## ***** Object of Class vcfR *****
## 63 samples
## 47564 CHROMs
## 250,231 variants
## Object size: 625.3 Mb
## 58.08 percent missing data
## ***** ***** *****
#hard filter to minimum depth of 5, and minimum genotype quality of 30
v<-hard_filter(v, depth = 5, gq = 35)
## 0.05% of genotypes fall below a read depth of 5 and were converted to NA
## 2.54% of genotypes fall below a genotype quality of 35 and were converted to NA
#execute allele balance filter
v<-filter_allele_balance(v)
## 16.48% of het genotypes (0.95% of all genotypes) fall outside of 0.25 - 0.75 allele balance ratio and were converted to NA

#visualize and pick appropriate max depth cutoff
max_depth(v)
## cutoff is not specified, exploratory visualization will be generated.

## dashed line indicates a mean depth across all SNPs of 72.8

#filter for max depth
max_depth(v, maxdepth = 150)
## maxdepth cutoff is specified, filtered vcfR object will be returned
## 16.31% of SNPs were above a mean depth of 150 and were removed from the vcf

## ***** Object of Class vcfR *****
## 63 samples
## 41978 CHROMs
## 209,408 variants
## Object size: 450.1 Mb
## 64.05 percent missing data
## ***** ***** *****
#remove invariant sites generated by filtering
v<-min_mac(v, min.mac = 1)
## 17.36% of SNPs fell below a minor allele count of 1 and were removed from the VCF

#check vcfR to see how many SNPs we have left
v
## ***** Object of Class vcfR *****
## 63 samples
## 43982 CHROMs
## 206,800 variants
## Object size: 527.5 Mb
## 58.33 percent missing data
## ***** ***** *****
#investigate patterns of missingness per sample
missing_by_sample(v)
## No popmap provided


## indiv filt snps.retained
## 1 D_hypoleucum_18070 0.5 71578
## 2 D_hypoleucum_18191 0.5 73049
## 3 D_hypoleucum_14037 0.5 74014
## 4 D_hypoleucum_14079 0.5 60840
## 5 D_hypoleucum_14065 0.5 72256
## 6 D_hypoleucum_14120 0.5 46184
## 7 D_hypoleucum_17976 0.5 37744
## 8 D_hypoleucum_18159 0.5 72353
## 9 D_hypoleucum_14075 0.5 70540
## 10 D_hypoleucum_20218 0.5 67004
## 11 D_hypoleucum_19066 0.5 34890
## 12 D_hypoleucum_19177 0.5 72160
## 13 D_hypoleucum_19178 0.5 71258
## 14 D_hypoleucum_14146 0.5 36352
## 15 D_hypoleucum_20213 0.5 52551
## 16 D_hypoleucum_FMNH454949 0.5 55342
## 17 D_hypoleucum_25637 0.5 74455
## 18 D_hypoleucum_1271 0.5 71906
## 19 D_hypoleucum_1273 0.5 74371
## 20 D_hypoleucum_1275 0.5 59957
## 21 D_hypoleucum_25921 0.5 69190
## 22 D_hypoleucum_3274 0.5 68561
## 23 D_hypoleucum_26984 0.5 73388
## 24 D_hypoleucum_3208 0.5 74503
## 25 D_hypoleucum_3158 0.5 74560
## 26 D_hypoleucum_2253 0.5 63952
## 27 D_hypoleucum_3095 0.5 68611
## 28 D_hypoleucum_462070 0.5 73605
## 29 D_hypoleucum_25675 0.5 44555
## 30 D_hypoleucum_454950 0.5 73786
## 31 D_hypoleucum_25880 0.5 68333
## 32 D_hypoleucum_3314 0.5 72117
## 33 D_hypoleucum_357608 0.5 45510
## 34 D_hypoleucum_357615 0.5 75155
## 35 D_hypoleucum_357612 0.5 39822
## 36 D_hypoleucum_19638 0.5 66516
## 37 D_hypoleucum_25868 0.5 74066
## 38 D_hypoleucum_17969 0.5 68845
## 39 D_hypoleucum_25672 0.5 46130
## 40 D_hypoleucum_26975 0.5 64410
## 41 D_hypoleucum_2229 0.5 52940
## 42 D_hypoleucum_2067 0.5 75507
## 43 D_hypoleucum_28329 0.5 72370
## 44 D_hypoleucum_28361 0.5 61609
## 45 D_hypoleucum_28376 0.5 66034
## 46 D_hypoleucum_2208 0.5 75500
## 47 D_hypoleucum_28294 0.5 61791
## 48 D_hypoleucum_1956 0.5 76320
## 49 D_hypoleucum_25670 0.5 58783
## 50 D_hypoleucum_20921 0.5 75733
## 51 D_hypoleucum_20193 0.5 41715
## 52 D_hypoleucum_28416 0.5 70945
## 53 D_hypoleucum_18193 0.5 75608
## 54 D_hypoleucum_27450 0.5 74751
## 55 D_hypoleucum_19046 0.5 67654
## 56 D_hypoleucum_19136 0.5 69792
## 57 D_hypoleucum_28676 0.5 73799
## 58 D_nigrilore_KU28413 0.5 59445
## 59 D_nigrilore_KU28414 0.5 63208
## 60 D_hypoleucum_29945 0.5 74446
## 61 D_hypoleucum_31636 0.5 74750
## 62 D_hypoleucum_29951 0.5 72999
## 63 D_hypoleucum_31644 0.5 71622
## 64 D_hypoleucum_18070 0.6 63616
## 65 D_hypoleucum_18191 0.6 64388
## 66 D_hypoleucum_14037 0.6 64024
## 67 D_hypoleucum_14079 0.6 54176
## 68 D_hypoleucum_14065 0.6 62307
## 69 D_hypoleucum_14120 0.6 41210
## 70 D_hypoleucum_17976 0.6 33983
## 71 D_hypoleucum_18159 0.6 62882
## 72 D_hypoleucum_14075 0.6 60317
## 73 D_hypoleucum_20218 0.6 56835
## 74 D_hypoleucum_19066 0.6 31233
## 75 D_hypoleucum_19177 0.6 63991
## 76 D_hypoleucum_19178 0.6 62982
## 77 D_hypoleucum_14146 0.6 32573
## 78 D_hypoleucum_20213 0.6 47108
## 79 D_hypoleucum_FMNH454949 0.6 49611
## 80 D_hypoleucum_25637 0.6 64130
## 81 D_hypoleucum_1271 0.6 61842
## 82 D_hypoleucum_1273 0.6 65590
## 83 D_hypoleucum_1275 0.6 49600
## 84 D_hypoleucum_25921 0.6 58984
## 85 D_hypoleucum_3274 0.6 57740
## 86 D_hypoleucum_26984 0.6 63989
## 87 D_hypoleucum_3208 0.6 64870
## 88 D_hypoleucum_3158 0.6 65443
## 89 D_hypoleucum_2253 0.6 57370
## 90 D_hypoleucum_3095 0.6 61178
## 91 D_hypoleucum_462070 0.6 63547
## 92 D_hypoleucum_25675 0.6 39939
## 93 D_hypoleucum_454950 0.6 64997
## 94 D_hypoleucum_25880 0.6 57854
## 95 D_hypoleucum_3314 0.6 63991
## 96 D_hypoleucum_357608 0.6 40601
## 97 D_hypoleucum_357615 0.6 65342
## 98 D_hypoleucum_357612 0.6 36032
## 99 D_hypoleucum_19638 0.6 59324
## 100 D_hypoleucum_25868 0.6 65354
## 101 D_hypoleucum_17969 0.6 58527
## 102 D_hypoleucum_25672 0.6 41468
## 103 D_hypoleucum_26975 0.6 57578
## 104 D_hypoleucum_2229 0.6 47430
## 105 D_hypoleucum_2067 0.6 66059
## 106 D_hypoleucum_28329 0.6 63738
## 107 D_hypoleucum_28361 0.6 51416
## 108 D_hypoleucum_28376 0.6 56036
## 109 D_hypoleucum_2208 0.6 65821
## 110 D_hypoleucum_28294 0.6 51968
## 111 D_hypoleucum_1956 0.6 66784
## 112 D_hypoleucum_25670 0.6 52767
## 113 D_hypoleucum_20921 0.6 66196
## 114 D_hypoleucum_20193 0.6 37341
## 115 D_hypoleucum_28416 0.6 61005
## 116 D_hypoleucum_18193 0.6 65329
## 117 D_hypoleucum_27450 0.6 65793
## 118 D_hypoleucum_19046 0.6 57096
## 119 D_hypoleucum_19136 0.6 59218
## 120 D_hypoleucum_28676 0.6 63096
## 121 D_nigrilore_KU28413 0.6 52730
## 122 D_nigrilore_KU28414 0.6 54986
## 123 D_hypoleucum_29945 0.6 65331
## 124 D_hypoleucum_31636 0.6 65145
## 125 D_hypoleucum_29951 0.6 62654
## 126 D_hypoleucum_31644 0.6 63346
## 127 D_hypoleucum_18070 0.7 46250
## 128 D_hypoleucum_18191 0.7 46917
## 129 D_hypoleucum_14037 0.7 45940
## 130 D_hypoleucum_14079 0.7 40450
## 131 D_hypoleucum_14065 0.7 44765
## 132 D_hypoleucum_14120 0.7 30839
## 133 D_hypoleucum_17976 0.7 25388
## 134 D_hypoleucum_18159 0.7 45366
## 135 D_hypoleucum_14075 0.7 43208
## 136 D_hypoleucum_20218 0.7 40280
## 137 D_hypoleucum_19066 0.7 23463
## 138 D_hypoleucum_19177 0.7 46811
## 139 D_hypoleucum_19178 0.7 46325
## 140 D_hypoleucum_14146 0.7 24959
## 141 D_hypoleucum_20213 0.7 35396
## 142 D_hypoleucum_FMNH454949 0.7 37435
## 143 D_hypoleucum_25637 0.7 45900
## 144 D_hypoleucum_1271 0.7 44130
## 145 D_hypoleucum_1273 0.7 47527
## 146 D_hypoleucum_1275 0.7 34457
## 147 D_hypoleucum_25921 0.7 42260
## 148 D_hypoleucum_3274 0.7 40688
## 149 D_hypoleucum_26984 0.7 45975
## 150 D_hypoleucum_3208 0.7 46547
## 151 D_hypoleucum_3158 0.7 47346
## 152 D_hypoleucum_2253 0.7 42719
## 153 D_hypoleucum_3095 0.7 44832
## 154 D_hypoleucum_462070 0.7 45405
## 155 D_hypoleucum_25675 0.7 30064
## 156 D_hypoleucum_454950 0.7 47221
## 157 D_hypoleucum_25880 0.7 40835
## 158 D_hypoleucum_3314 0.7 46664
## 159 D_hypoleucum_357608 0.7 30345
## 160 D_hypoleucum_357615 0.7 46858
## 161 D_hypoleucum_357612 0.7 27495
## 162 D_hypoleucum_19638 0.7 43688
## 163 D_hypoleucum_25868 0.7 47240
## 164 D_hypoleucum_17969 0.7 41687
## 165 D_hypoleucum_25672 0.7 31019
## 166 D_hypoleucum_26975 0.7 42627
## 167 D_hypoleucum_2229 0.7 35465
## 168 D_hypoleucum_2067 0.7 47550
## 169 D_hypoleucum_28329 0.7 46794
## 170 D_hypoleucum_28361 0.7 36628
## 171 D_hypoleucum_28376 0.7 40345
## 172 D_hypoleucum_2208 0.7 47613
## 173 D_hypoleucum_28294 0.7 36843
## 174 D_hypoleucum_1956 0.7 47959
## 175 D_hypoleucum_25670 0.7 39754
## 176 D_hypoleucum_20921 0.7 47391
## 177 D_hypoleucum_20193 0.7 27971
## 178 D_hypoleucum_28416 0.7 44200
## 179 D_hypoleucum_18193 0.7 46632
## 180 D_hypoleucum_27450 0.7 47454
## 181 D_hypoleucum_19046 0.7 40743
## 182 D_hypoleucum_19136 0.7 42205
## 183 D_hypoleucum_28676 0.7 45089
## 184 D_nigrilore_KU28413 0.7 39038
## 185 D_nigrilore_KU28414 0.7 39764
## 186 D_hypoleucum_29945 0.7 47370
## 187 D_hypoleucum_31636 0.7 46995
## 188 D_hypoleucum_29951 0.7 44864
## 189 D_hypoleucum_31644 0.7 46351
## 190 D_hypoleucum_18070 0.8 23008
## 191 D_hypoleucum_18191 0.8 23340
## 192 D_hypoleucum_14037 0.8 22784
## 193 D_hypoleucum_14079 0.8 21227
## 194 D_hypoleucum_14065 0.8 22274
## 195 D_hypoleucum_14120 0.8 16317
## 196 D_hypoleucum_17976 0.8 13692
## 197 D_hypoleucum_18159 0.8 22717
## 198 D_hypoleucum_14075 0.8 21843
## 199 D_hypoleucum_20218 0.8 20799
## 200 D_hypoleucum_19066 0.8 12903
## 201 D_hypoleucum_19177 0.8 23346
## 202 D_hypoleucum_19178 0.8 23085
## 203 D_hypoleucum_14146 0.8 13558
## 204 D_hypoleucum_20213 0.8 18965
## 205 D_hypoleucum_FMNH454949 0.8 19615
## 206 D_hypoleucum_25637 0.8 22793
## 207 D_hypoleucum_1271 0.8 22216
## 208 D_hypoleucum_1273 0.8 23477
## 209 D_hypoleucum_1275 0.8 17998
## 210 D_hypoleucum_25921 0.8 21498
## 211 D_hypoleucum_3274 0.8 20545
## 212 D_hypoleucum_26984 0.8 22952
## 213 D_hypoleucum_3208 0.8 23091
## 214 D_hypoleucum_3158 0.8 23255
## 215 D_hypoleucum_2253 0.8 22029
## 216 D_hypoleucum_3095 0.8 22559
## 217 D_hypoleucum_462070 0.8 22609
## 218 D_hypoleucum_25675 0.8 16111
## 219 D_hypoleucum_454950 0.8 23330
## 220 D_hypoleucum_25880 0.8 20945
## 221 D_hypoleucum_3314 0.8 23270
## 222 D_hypoleucum_357608 0.8 16081
## 223 D_hypoleucum_357615 0.8 23068
## 224 D_hypoleucum_357612 0.8 15103
## 225 D_hypoleucum_19638 0.8 22099
## 226 D_hypoleucum_25868 0.8 23429
## 227 D_hypoleucum_17969 0.8 21357
## 228 D_hypoleucum_25672 0.8 16416
## 229 D_hypoleucum_26975 0.8 21598
## 230 D_hypoleucum_2229 0.8 18202
## 231 D_hypoleucum_2067 0.8 23497
## 232 D_hypoleucum_28329 0.8 23158
## 233 D_hypoleucum_28361 0.8 19196
## 234 D_hypoleucum_28376 0.8 20922
## 235 D_hypoleucum_2208 0.8 23394
## 236 D_hypoleucum_28294 0.8 19190
## 237 D_hypoleucum_1956 0.8 23595
## 238 D_hypoleucum_25670 0.8 20870
## 239 D_hypoleucum_20921 0.8 23380
## 240 D_hypoleucum_20193 0.8 15025
## 241 D_hypoleucum_28416 0.8 22394
## 242 D_hypoleucum_18193 0.8 23043
## 243 D_hypoleucum_27450 0.8 23464
## 244 D_hypoleucum_19046 0.8 20824
## 245 D_hypoleucum_19136 0.8 21414
## 246 D_hypoleucum_28676 0.8 22688
## 247 D_nigrilore_KU28413 0.8 20070
## 248 D_nigrilore_KU28414 0.8 20117
## 249 D_hypoleucum_29945 0.8 23414
## 250 D_hypoleucum_31636 0.8 23227
## 251 D_hypoleucum_29951 0.8 22688
## 252 D_hypoleucum_31644 0.8 23044
## 253 D_hypoleucum_18070 0.9 4692
## 254 D_hypoleucum_18191 0.9 4760
## 255 D_hypoleucum_14037 0.9 4657
## 256 D_hypoleucum_14079 0.9 4482
## 257 D_hypoleucum_14065 0.9 4603
## 258 D_hypoleucum_14120 0.9 3788
## 259 D_hypoleucum_17976 0.9 3325
## 260 D_hypoleucum_18159 0.9 4668
## 261 D_hypoleucum_14075 0.9 4623
## 262 D_hypoleucum_20218 0.9 4525
## 263 D_hypoleucum_19066 0.9 3289
## 264 D_hypoleucum_19177 0.9 4752
## 265 D_hypoleucum_19178 0.9 4691
## 266 D_hypoleucum_14146 0.9 3429
## 267 D_hypoleucum_20213 0.9 4346
## 268 D_hypoleucum_FMNH454949 0.9 4390
## 269 D_hypoleucum_25637 0.9 4710
## 270 D_hypoleucum_1271 0.9 4613
## 271 D_hypoleucum_1273 0.9 4790
## 272 D_hypoleucum_1275 0.9 4118
## 273 D_hypoleucum_25921 0.9 4611
## 274 D_hypoleucum_3274 0.9 4449
## 275 D_hypoleucum_26984 0.9 4662
## 276 D_hypoleucum_3208 0.9 4712
## 277 D_hypoleucum_3158 0.9 4707
## 278 D_hypoleucum_2253 0.9 4638
## 279 D_hypoleucum_3095 0.9 4679
## 280 D_hypoleucum_462070 0.9 4658
## 281 D_hypoleucum_25675 0.9 3893
## 282 D_hypoleucum_454950 0.9 4730
## 283 D_hypoleucum_25880 0.9 4549
## 284 D_hypoleucum_3314 0.9 4727
## 285 D_hypoleucum_357608 0.9 3790
## 286 D_hypoleucum_357615 0.9 4678
## 287 D_hypoleucum_357612 0.9 3629
## 288 D_hypoleucum_19638 0.9 4632
## 289 D_hypoleucum_25868 0.9 4730
## 290 D_hypoleucum_17969 0.9 4542
## 291 D_hypoleucum_25672 0.9 3920
## 292 D_hypoleucum_26975 0.9 4614
## 293 D_hypoleucum_2229 0.9 4134
## 294 D_hypoleucum_2067 0.9 4754
## 295 D_hypoleucum_28329 0.9 4721
## 296 D_hypoleucum_28361 0.9 4306
## 297 D_hypoleucum_28376 0.9 4585
## 298 D_hypoleucum_2208 0.9 4723
## 299 D_hypoleucum_28294 0.9 4408
## 300 D_hypoleucum_1956 0.9 4770
## 301 D_hypoleucum_25670 0.9 4495
## 302 D_hypoleucum_20921 0.9 4789
## 303 D_hypoleucum_20193 0.9 3691
## 304 D_hypoleucum_28416 0.9 4635
## 305 D_hypoleucum_18193 0.9 4701
## 306 D_hypoleucum_27450 0.9 4755
## 307 D_hypoleucum_19046 0.9 4474
## 308 D_hypoleucum_19136 0.9 4624
## 309 D_hypoleucum_28676 0.9 4680
## 310 D_nigrilore_KU28413 0.9 4303
## 311 D_nigrilore_KU28414 0.9 4345
## 312 D_hypoleucum_29945 0.9 4742
## 313 D_hypoleucum_31636 0.9 4734
## 314 D_hypoleucum_29951 0.9 4680
## 315 D_hypoleucum_31644 0.9 4750
## 316 D_hypoleucum_18070 1.0 2
## 317 D_hypoleucum_18191 1.0 2
## 318 D_hypoleucum_14037 1.0 2
## 319 D_hypoleucum_14079 1.0 2
## 320 D_hypoleucum_14065 1.0 2
## 321 D_hypoleucum_14120 1.0 2
## 322 D_hypoleucum_17976 1.0 2
## 323 D_hypoleucum_18159 1.0 2
## 324 D_hypoleucum_14075 1.0 2
## 325 D_hypoleucum_20218 1.0 2
## 326 D_hypoleucum_19066 1.0 2
## 327 D_hypoleucum_19177 1.0 2
## 328 D_hypoleucum_19178 1.0 2
## 329 D_hypoleucum_14146 1.0 2
## 330 D_hypoleucum_20213 1.0 2
## 331 D_hypoleucum_FMNH454949 1.0 2
## 332 D_hypoleucum_25637 1.0 2
## 333 D_hypoleucum_1271 1.0 2
## 334 D_hypoleucum_1273 1.0 2
## 335 D_hypoleucum_1275 1.0 2
## 336 D_hypoleucum_25921 1.0 2
## 337 D_hypoleucum_3274 1.0 2
## 338 D_hypoleucum_26984 1.0 2
## 339 D_hypoleucum_3208 1.0 2
## 340 D_hypoleucum_3158 1.0 2
## 341 D_hypoleucum_2253 1.0 2
## 342 D_hypoleucum_3095 1.0 2
## 343 D_hypoleucum_462070 1.0 2
## 344 D_hypoleucum_25675 1.0 2
## 345 D_hypoleucum_454950 1.0 2
## 346 D_hypoleucum_25880 1.0 2
## 347 D_hypoleucum_3314 1.0 2
## 348 D_hypoleucum_357608 1.0 2
## 349 D_hypoleucum_357615 1.0 2
## 350 D_hypoleucum_357612 1.0 2
## 351 D_hypoleucum_19638 1.0 2
## 352 D_hypoleucum_25868 1.0 2
## 353 D_hypoleucum_17969 1.0 2
## 354 D_hypoleucum_25672 1.0 2
## 355 D_hypoleucum_26975 1.0 2
## 356 D_hypoleucum_2229 1.0 2
## 357 D_hypoleucum_2067 1.0 2
## 358 D_hypoleucum_28329 1.0 2
## 359 D_hypoleucum_28361 1.0 2
## 360 D_hypoleucum_28376 1.0 2
## 361 D_hypoleucum_2208 1.0 2
## 362 D_hypoleucum_28294 1.0 2
## 363 D_hypoleucum_1956 1.0 2
## 364 D_hypoleucum_25670 1.0 2
## 365 D_hypoleucum_20921 1.0 2
## 366 D_hypoleucum_20193 1.0 2
## 367 D_hypoleucum_28416 1.0 2
## 368 D_hypoleucum_18193 1.0 2
## 369 D_hypoleucum_27450 1.0 2
## 370 D_hypoleucum_19046 1.0 2
## 371 D_hypoleucum_19136 1.0 2
## 372 D_hypoleucum_28676 1.0 2
## 373 D_nigrilore_KU28413 1.0 2
## 374 D_nigrilore_KU28414 1.0 2
## 375 D_hypoleucum_29945 1.0 2
## 376 D_hypoleucum_31636 1.0 2
## 377 D_hypoleucum_29951 1.0 2
## 378 D_hypoleucum_31644 1.0 2
#investigate patterns of missingness per SNP
missing_by_snp(v)
## cutoff is not specified, exploratory visualizations will be generated
## Picking joint bandwidth of 0.0671


## filt missingness snps.retained
## 1 0.30 0.35380374 118045
## 2 0.50 0.27764467 90571
## 3 0.60 0.24277876 75418
## 4 0.65 0.22409186 66391
## 5 0.70 0.19504400 51513
## 6 0.75 0.16960576 38350
## 7 0.80 0.13985079 24435
## 8 0.85 0.10807756 12851
## 9 0.90 0.07399186 4827
## 10 0.95 0.03932816 963
## 11 1.00 0.00000000 2
#implement per sample completeness filter
v<-missing_by_sample(v, cutoff = .785)
## 3 samples are above a 0.785 missing data cutoff, and were removed from VCF

#remove invariant sites generated by dropping individuals
v<-min_mac(v, min.mac = 1)
## 0.64% of SNPs fell below a minor allele count of 1 and were removed from the VCF

#fix popmap
pops<-pops[pops$ID %in% colnames(v@gt),]
popmap<-popmap[popmap$ID %in% colnames(v@gt),]
colnames(popmap)<-c('id','pop')
missing_by_snp(v)
## cutoff is not specified, exploratory visualizations will be generated
## Picking joint bandwidth of 0.0636


## filt missingness snps.retained
## 1 0.30 0.34418169 119207
## 2 0.50 0.26937724 93086
## 3 0.60 0.23288863 77879
## 4 0.65 0.21423534 69242
## 5 0.70 0.19260721 58548
## 6 0.75 0.16827150 46121
## 7 0.80 0.14037959 32140
## 8 0.85 0.10845811 18499
## 9 0.90 0.07491050 8101
## 10 0.95 0.03711678 1881
## 11 1.00 0.00000000 62
#reorder popmap to match vcf, this is a bug, you shouldn't have to do this
popmaps<-popmap[order(match(popmap$id,colnames(v@gt)[-1])),]
miss<-assess_missing_data_tsne(v, popmap=popmaps, threshold=c(.85,.9,.95), clustering = FALSE)
## cutoff is specified, filtered vcfR object will be returned
## 91% of SNPs fell below a completeness cutoff of 0.85 and were removed from the VCF
## Loading required namespace: adegenet


## cutoff is specified, filtered vcfR object will be returned

## 97.32% of SNPs fell below a completeness cutoff of 0.9 and were removed from the VCF


## cutoff is specified, filtered vcfR object will be returned

## 99.08% of SNPs fell below a completeness cutoff of 0.95 and were removed from the VCF



#missing data doesn't seem to be driving clustering regardless of threhsold
#implement missing data per SNP filter
v<-missing_by_snp(v, cutoff=.85)
## cutoff is specified, filtered vcfR object will be returned
## 91% of SNPs fell below a completeness cutoff of 0.85 and were removed from the VCF

#investigate clustering patterns with a minor allele cutoff
v.mac<-min_mac(v, min.mac = 2)
## 40.57% of SNPs fell below a minor allele count of 2 and were removed from the VCF

v.mac
## ***** Object of Class vcfR *****
## 60 samples
## 2389 CHROMs
## 10,994 variants
## Object size: 54.9 Mb
## 11.01 percent missing data
## ***** ***** *****
assess_missing_data_tsne(v.mac, popmap=popmaps, clustering = FALSE)


## tsne.ax1 tsne.ax2 popmap.pop
## 1 23.285083 44.283749 Luzon
## 2 52.012789 -29.714065 Zamboanga
## 3 3.045111 -1.630845 Mindanao
## 4 1.823689 -3.004490 Mindanao
## 5 5.069517 -1.928278 Mindanao
## 6 -21.486003 -20.117547 Mindanao
## 7 51.187124 -27.261521 Zamboanga
## 8 3.688963 -3.300658 Mindanao
## 9 36.229384 36.102958 Luzon
## 10 51.127101 -31.785423 Zamboanga
## 11 50.403871 -29.546373 Zamboanga
## 12 26.245166 47.584687 Luzon
## 13 24.933394 48.755476 Luzon
## 14 38.368464 37.057353 Luzon
## 15 -32.112363 -7.306372 Mindanao
## 16 -40.417793 -11.877839 Mindanao
## 17 -30.359613 -8.896437 Mindanao
## 18 34.622797 37.416804 Luzon
## 19 -41.437414 -14.264191 Mindanao
## 20 34.119261 35.824224 Luzon
## 21 -31.498155 -11.099356 Mindanao
## 22 -22.038848 -26.867581 Mindanao
## 23 -21.399663 -31.884239 Mindanao
## 24 -32.210442 -9.779238 Mindanao
## 25 28.078437 42.480649 Luzon
## 26 20.932720 53.772649 Luzon
## 27 35.526005 39.006265 Luzon
## 28 27.468343 44.819879 Luzon
## 29 -36.804177 -8.645378 Mindanao
## 30 -17.671575 -19.275233 Mindanao
## 31 -21.809102 -29.518483 Mindanao
## 32 -18.646999 -20.203929 Mindanao
## 33 24.864162 44.419736 Luzon
## 34 24.022050 39.589388 Luzon
## 35 21.760675 41.772599 Luzon
## 36 22.679416 53.651250 Luzon
## 37 24.143084 46.289444 Luzon
## 38 -22.493396 -32.838849 Mindanao
## 39 -22.188346 -35.655099 Mindanao
## 40 -35.332834 -16.593551 Mindanao
## 41 -24.290429 -19.816074 Mindanao
## 42 -24.114153 -17.526218 Mindanao
## 43 -20.520434 -35.460526 Mindanao
## 44 -36.534331 -15.110493 Mindanao
## 45 -20.740588 -37.575934 Mindanao
## 46 22.013085 45.310476 Luzon
## 47 -42.849258 -11.619304 Mindanao
## 48 21.772676 55.228000 Luzon
## 49 -39.616292 -10.707290 Mindanao
## 50 53.613577 -31.041270 Zamboanga
## 51 -25.534335 -16.229277 Mindanao
## 52 -26.884330 -15.495748 Mindanao
## 53 -26.351991 -20.047901 Mindanao
## 54 -34.322229 -14.904292 Mindanao
## 55 -14.544637 -8.102419 nigrilore
## 56 -14.891110 -6.991050 nigrilore
## 57 50.735209 -33.781664 Zamboanga
## 58 -30.879867 -18.007916 Mindanao
## 59 49.243578 -31.930041 Zamboanga
## 60 -33.034026 -16.023193 Mindanao
#seeing potentially spurious groupings with singletons removed, we will ignore for now
#plot depth per snp and per sample
dp <- extract.gt(v, element = "DP", as.numeric=TRUE)
heatmap.bp(dp, rlabels = FALSE)

#plot genotype quality per snp and per sample
gq <- extract.gt(v, element = "GQ", as.numeric=TRUE)
heatmap.bp(gq, rlabels = FALSE)

#write out vcf with all SNPs
#vcfR::write.vcf(v, "~/Desktop/phil.dicaeum/filtered.85.vcf.gz")
#linkage filter vcf to thin SNPs to one per 500bp
v.thin<-distance_thin(v, min.distance = 500)
##
|
| | 0%
|
| | 1%
|
|= | 1%
|
|= | 2%
|
|== | 2%
|
|== | 3%
|
|== | 4%
|
|=== | 4%
|
|=== | 5%
|
|==== | 5%
|
|==== | 6%
|
|===== | 6%
|
|===== | 7%
|
|===== | 8%
|
|====== | 8%
|
|====== | 9%
|
|======= | 9%
|
|======= | 10%
|
|======= | 11%
|
|======== | 11%
|
|======== | 12%
|
|========= | 12%
|
|========= | 13%
|
|========= | 14%
|
|========== | 14%
|
|========== | 15%
|
|=========== | 15%
|
|=========== | 16%
|
|============ | 16%
|
|============ | 17%
|
|============ | 18%
|
|============= | 18%
|
|============= | 19%
|
|============== | 19%
|
|============== | 20%
|
|============== | 21%
|
|=============== | 21%
|
|=============== | 22%
|
|================ | 22%
|
|================ | 23%
|
|================ | 24%
|
|================= | 24%
|
|================= | 25%
|
|================== | 25%
|
|================== | 26%
|
|=================== | 26%
|
|=================== | 27%
|
|=================== | 28%
|
|==================== | 28%
|
|==================== | 29%
|
|===================== | 29%
|
|===================== | 30%
|
|===================== | 31%
|
|====================== | 31%
|
|====================== | 32%
|
|======================= | 32%
|
|======================= | 33%
|
|======================= | 34%
|
|======================== | 34%
|
|======================== | 35%
|
|========================= | 35%
|
|========================= | 36%
|
|========================== | 36%
|
|========================== | 37%
|
|========================== | 38%
|
|=========================== | 38%
|
|=========================== | 39%
|
|============================ | 39%
|
|============================ | 40%
|
|============================ | 41%
|
|============================= | 41%
|
|============================= | 42%
|
|============================== | 42%
|
|============================== | 43%
|
|============================== | 44%
|
|=============================== | 44%
|
|=============================== | 45%
|
|================================ | 45%
|
|================================ | 46%
|
|================================= | 46%
|
|================================= | 47%
|
|================================= | 48%
|
|================================== | 48%
|
|================================== | 49%
|
|=================================== | 49%
|
|=================================== | 50%
|
|=================================== | 51%
|
|==================================== | 51%
|
|==================================== | 52%
|
|===================================== | 52%
|
|===================================== | 53%
|
|===================================== | 54%
|
|====================================== | 54%
|
|====================================== | 55%
|
|======================================= | 55%
|
|======================================= | 56%
|
|======================================== | 56%
|
|======================================== | 57%
|
|======================================== | 58%
|
|========================================= | 58%
|
|========================================= | 59%
|
|========================================== | 59%
|
|========================================== | 60%
|
|========================================== | 61%
|
|=========================================== | 61%
|
|=========================================== | 62%
|
|============================================ | 62%
|
|============================================ | 63%
|
|============================================ | 64%
|
|============================================= | 64%
|
|============================================= | 65%
|
|============================================== | 65%
|
|============================================== | 66%
|
|=============================================== | 66%
|
|=============================================== | 67%
|
|=============================================== | 68%
|
|================================================ | 68%
|
|================================================ | 69%
|
|================================================= | 69%
|
|================================================= | 70%
|
|================================================= | 71%
|
|================================================== | 71%
|
|================================================== | 72%
|
|=================================================== | 72%
|
|=================================================== | 73%
|
|=================================================== | 74%
|
|==================================================== | 74%
|
|==================================================== | 75%
|
|===================================================== | 75%
|
|===================================================== | 76%
|
|====================================================== | 76%
|
|====================================================== | 77%
|
|====================================================== | 78%
|
|======================================================= | 78%
|
|======================================================= | 79%
|
|======================================================== | 79%
|
|======================================================== | 80%
|
|======================================================== | 81%
|
|========================================================= | 81%
|
|========================================================= | 82%
|
|========================================================== | 82%
|
|========================================================== | 83%
|
|========================================================== | 84%
|
|=========================================================== | 84%
|
|=========================================================== | 85%
|
|============================================================ | 85%
|
|============================================================ | 86%
|
|============================================================= | 86%
|
|============================================================= | 87%
|
|============================================================= | 88%
|
|============================================================== | 88%
|
|============================================================== | 89%
|
|=============================================================== | 89%
|
|=============================================================== | 90%
|
|=============================================================== | 91%
|
|================================================================ | 91%
|
|================================================================ | 92%
|
|================================================================= | 92%
|
|================================================================= | 93%
|
|================================================================= | 94%
|
|================================================================== | 94%
|
|================================================================== | 95%
|
|=================================================================== | 95%
|
|=================================================================== | 96%
|
|==================================================================== | 96%
|
|==================================================================== | 97%
|
|==================================================================== | 98%
|
|===================================================================== | 98%
|
|===================================================================== | 99%
|
|======================================================================| 99%
|
|======================================================================| 100%
## 2590 out of 18499 input SNPs were not located within 500 base-pairs of another SNP and were retained despite filtering
#write out thinned vcf
#vcfR::write.vcf(v.thin, "~/Desktop/phil.dicaeum/filtered.85.unlinked.vcf.gz")