#load packages
library(vcfR)
library(ggplot2)
library(adegenet)
library(SNPfiltR)
library(StAMPP)
#read in vcf as vcfR
#this has been pre-filtered to remove SNPs near indels and SNPs mapping with low confidence
vcfR <- read.vcfR("~/Desktop/mex.towhees/genotyped_X_samples_only_PASS_snp.vcf")
#12,580 SNPs
#check out the details on the unfiltered vcf
vcfR.unfilt <- read.vcfR("~/Desktop/mex.towhees/genotyped_X_samples_snps.vcf")
#16,237 SNPs
#read in sample info csv
sample.info<-read.csv("~/Desktop/mex.towhees/sample.locs.csv")
#make sure sampling file matches vcf
sample.info$ID %in% colnames(vcfR@gt)[-1]
colnames(vcfR@gt)[-1] %in% sample.info$ID
#if needed, subset sampling file to include only samples in vcf
#sample.info<-sample.info[sample.info$id %in% colnames(vcfR@gt)[-1],]
#reorder sampling file to match order of samples in vcf
sample.info<-sample.info[match(colnames(vcfR@gt)[-1], sample.info$ID),]
sample.info$ID == colnames(vcfR@gt)[-1]
#retain only biallelic SNPs
vcfR<-filter_biallelic(vcfR)
## [1] "84 SNPs, 0.007% of all input SNPs, contained more than 2 alleles, and were removed from the VCF"
#execute allele balance filter
vcfR<-filter_allele_balance(vcfR)
## [1] "50.83% of het genotypes (7.8% of all genotypes) fall outside of .25 - .75 allele balance and were converted to NA"
#visualize and pick appropriate max depth cutoff
max_depth(vcfR)
## [1] "cutoff is not specified, exploratory visualization will be generated."
## [1] "dashed line indicates a mean depth across all SNPs of 26.2"
#filter vcf by the max depth cutoff you chose
vcfR<-max_depth(vcfR, maxdepth = 100)
## [1] "maxdepth cutoff is specified, filtered vcfR object will be returned"
## [1] "0.77% of SNPs were above a mean depth of 100 and were removed from the vcf"
#remove invariant SNPs
vcfR<-min_mac(vcfR, min.mac = 1)
## [1] "4.6% of SNPs fell below a minor allele count of 1 and were removed from the VCF"
#check vcfR to see how many SNPs we have left
vcfR
## ***** Object of Class vcfR *****
## 32 samples
## 3925 CHROMs
## 11,830 variants
## Object size: 13.8 Mb
## 5.903 percent missing data
## ***** ***** *****
#run function to visualize samples
miss<-missing_by_sample(vcfR=vcfR)
## [1] "No popmap provided"
#every sample is missing less than 60% of SNPs, which is great
#this means we should be able to retain all samples with a reasonable per-SNP filtering threshold
#visualize missing data by SNP and the effect of various cutoffs on the missingness of each sample
missing_by_snp(vcfR)
## [1] "cutoff is not specified, exploratory visualizations will be generated"
## Picking joint bandwidth of 0.0474
## filt missingness snps.retained
## 1 0.30 0.093612804 11161
## 2 0.50 0.071585015 10701
## 3 0.60 0.054557050 10241
## 4 0.65 0.050501211 10113
## 5 0.70 0.041647536 9801
## 6 0.75 0.036809098 9607
## 7 0.80 0.028097483 9199
## 8 0.85 0.019351419 8667
## 9 0.90 0.014520240 8288
## 10 0.95 0.005079213 7180
## 11 1.00 0.000000000 6013
#make popmap
popmap<-sample.info[sample.info$ID %in% colnames(vcfR@gt)[-1],c(5,1)]
colnames(popmap)<-c("id","pop")
popmap$pop<-as.factor(popmap$pop)
#assess missing data effects on clustering
assess_missing_data_pca(vcfR = vcfR, popmap = popmap, thresholds = c(.75,.85,1), clustering = FALSE)
## [1] "cutoff is specified, filtered vcfR object will be returned"
## [1] "18.79% of SNPs fell below a completeness cutoff of 0.75 and were removed from the VCF"
## [1] "cutoff is specified, filtered vcfR object will be returned"
## [1] "26.74% of SNPs fell below a completeness cutoff of 0.85 and were removed from the VCF"
## [1] "cutoff is specified, filtered vcfR object will be returned"
## [1] "49.17% of SNPs fell below a completeness cutoff of 1 and were removed from the VCF"
## [[1]]
## PC1 PC2 PC3 PC4 PC5 PC6
## MLZ23076 -3.4408945 1.5016270 -2.35242478 -0.9060119 -0.54757361 0.68560557
## MLZ23077 -2.5312853 1.0641920 -2.06490549 -2.6010390 -1.08664967 -0.51033977
## MLZ24113 0.6453732 -1.3806667 3.61106095 -1.2432372 2.91734383 -5.21618261
## MLZ24115 0.6667550 -1.2911924 2.69656079 -1.1333566 2.27836429 -3.30948232
## MLZ24116 0.9960392 -0.8624820 3.53880305 -1.0347133 -1.21515595 3.11350449
## MLZ24118 0.8004817 -1.4152129 4.73754400 -2.0286371 -1.64504306 1.95423351
## MLZ26978 2.7944173 -2.5415003 -0.91278475 -0.3707517 2.69429738 -3.30827216
## MLZ26979 3.3577174 -3.8640117 -4.85923768 0.5185918 3.45175223 3.14722725
## MLZ26980 3.0828073 -3.7644489 -2.49239605 -1.5958699 0.77105390 0.03157791
## MLZ26984 3.5314322 -2.5249725 -3.72432453 0.3980606 2.17716761 1.71632722
## MLZ30405 -2.6320915 1.7434823 -0.17855993 5.7167553 1.38074762 -0.13152295
## MLZ30408 -1.8021263 1.2032514 0.14874709 0.4721152 -0.89520509 -0.37710142
## MLZ30413 -1.7770906 0.5596514 0.09159234 0.5969241 -0.36308270 -0.17732847
## MLZ30415 -2.1575796 0.9828752 -0.19806036 3.2383487 -0.56296525 -1.97127307
## MLZ41012 -2.0732084 0.6534767 -0.64925132 -4.4770531 -0.47059593 1.87086167
## MLZ41050 -1.8702176 0.2961247 -0.50518103 -3.5404503 -1.09260510 -0.18549831
## MLZ41051 -2.6924658 0.9251661 -1.51585508 -3.3135022 -0.09847242 -0.13584982
## MLZ41271 -2.9130433 1.0028442 -0.78248288 0.7694928 -0.34640993 2.05545290
## MLZ46829 -3.4967650 1.8819678 -2.08379080 0.7223026 1.65107132 -0.39541773
## MLZ46851 -3.1444035 1.5822846 -1.52075885 -1.7973585 -0.78719357 -1.28326524
## MLZ55360 -0.1420718 -1.4795700 3.82240740 0.7814072 1.68554710 0.76761071
## MLZ55361 0.3112816 -1.5363408 3.97389919 1.6130831 1.71631136 4.69249548
## MLZ55363 0.7227579 -0.7919691 1.65649599 -0.0232489 -0.54299147 0.68583497
## MLZ55366 0.1476713 -0.8057730 1.45176389 -1.3149368 0.48354766 -0.46061558
## MLZ57701 -1.8249294 0.9740361 -0.68120711 1.8277736 1.46321387 -1.76989992
## MLZ57703 -2.1058177 0.7283156 0.15530202 2.4853046 -2.72484071 1.37741565
## MLZ57707 -1.5563299 0.4960458 0.48012631 1.1831584 1.06857218 0.16524956
## MLZ57719 -1.6818622 0.8481359 0.50010255 1.7162611 0.35184505 -0.01488890
## MVZ122216 9.5067437 11.4541280 0.23597477 -0.7288874 1.14205227 0.47310138
## MVZ122217 4.1270776 -2.3022831 -0.84310263 1.5306698 -5.14729653 -3.27055708
## MVZ122219 3.1754278 -1.6247092 -0.56797269 1.0107068 -5.88827058 -0.78643791
## MVZ122223 3.9761991 -1.7124722 -1.16808440 1.5280980 -1.81853609 0.56743500
## PC7 PC8 PC9 PC10 pop
## MLZ23076 -2.40541976 0.1207203 3.20576347 -2.0778858 El Venerable
## MLZ23077 -0.84698742 -2.6115741 -0.93273690 -1.1931067 El Venerable
## MLZ24113 0.82638797 2.3949536 0.57728566 -2.4406858 Rancho La Cofradia
## MLZ24115 1.62906009 1.4801747 -0.14081106 -1.2148973 Rancho La Cofradia
## MLZ24116 0.52274358 -1.4008361 1.91887145 1.4950702 Rancho La Cofradia
## MLZ24118 -0.17929094 -2.0525227 3.47848266 3.9851433 Rancho La Cofradia
## MLZ26978 -0.73196243 -0.4168690 -1.23462004 3.1534882 Tapalpa
## MLZ26979 1.56118453 1.3742507 1.38710840 -0.2508291 Tapalpa
## MLZ26980 0.43794277 -5.3283120 -3.61839817 0.8514866 Tapalpa
## MLZ26984 1.53800889 0.5933887 1.41151648 -0.1148743 Tapalpa
## MLZ30405 -0.92897309 0.5668339 1.57559697 3.5087113 Puerto Lengua de Vaca
## MLZ30408 1.90363770 0.1258526 -0.33212326 2.3990802 Puerto Lengua de Vaca
## MLZ30413 1.46107135 -0.6620272 -0.28255192 1.4765155 Puerto Lengua de Vaca
## MLZ30415 2.03593954 2.0655463 0.12385008 2.7765230 Puerto Lengua de Vaca
## MLZ41012 -0.73397017 3.0640379 0.70497978 1.1424606 Lerma
## MLZ41050 1.85598927 0.8665997 0.53396918 1.3256709 Lerma
## MLZ41051 0.61862620 1.6409468 -1.54656715 0.6903126 Lerma
## MLZ41271 -1.06547393 3.6631206 -4.78463484 1.1777819 Lerma
## MLZ46829 -4.14289007 -1.5979406 2.20828572 -2.3903870 El Venerable
## MLZ46851 -0.86239655 -0.1128000 -0.07905488 -1.1527436 El Venerable
## MLZ55360 -2.18956670 -1.9128176 -0.23844637 -1.6793211 Lagunita
## MLZ55361 -2.85341556 1.8399877 -3.66998181 -2.2462688 Lagunita
## MLZ55363 0.08647145 0.3258425 0.33775817 -0.3372391 Lagunita
## MLZ55366 0.63226944 -0.1848745 0.25290137 -1.6089418 Lagunita
## MLZ57701 -1.73844398 -3.4395377 -0.40504878 1.5609510 Puerto Morillos
## MLZ57703 6.87807376 -1.6788343 -0.57769899 -4.1007586 Puerto Morillos
## MLZ57707 0.79176713 0.2245275 -2.27812051 -0.4627647 Puerto Morillos
## MLZ57719 0.79033536 -2.4591786 1.56533876 -1.8138643 Puerto Morillos
## MVZ122216 0.21885919 -0.3140935 -0.63845158 -0.1579450 Manzamitla
## MVZ122217 -2.14792839 1.4326783 0.06911554 -1.2524034 Manzamitla
## MVZ122219 -2.18075736 -0.2722789 -2.43000302 0.1287506 Manzamitla
## MVZ122223 -0.78089188 2.6650349 3.83842560 -1.1770295 Manzamitla
## missing
## MLZ23076 0.11107354
## MLZ23077 0.10109890
## MLZ24113 0.21580727
## MLZ24115 0.23102282
## MLZ24116 0.13093829
## MLZ24118 0.13169907
## MLZ26978 0.11715976
## MLZ26979 0.12992392
## MLZ26980 0.12062553
## MLZ26984 0.07142857
## MLZ30405 0.09611158
## MLZ30408 0.17185123
## MLZ30413 0.22045647
## MLZ30415 0.11327134
## MLZ41012 0.10498732
## MLZ41050 0.25300085
## MLZ41051 0.13778529
## MLZ41271 0.16466610
## MLZ46829 0.15604396
## MLZ46851 0.13178360
## MLZ55360 0.23034658
## MLZ55361 0.14201183
## MLZ55363 0.15131023
## MLZ55366 0.13786982
## MLZ57701 0.14201183
## MLZ57703 0.08968724
## MLZ57707 0.11031276
## MLZ57719 0.12510566
## MVZ122216 0.03685545
## MVZ122217 0.07641589
## MVZ122219 0.08892646
## MVZ122223 0.07768385
#super weird that the sample we used as a reference pops out but only above a certain filtering threshold
#may be a good idea to remove singletons (degraded samples) see how this affects the number of SNPs retained
vcfR.mac2<-min_mac(vcfR, min.mac = 2)
## [1] "41.5% of SNPs fell below a minor allele count of 2 and were removed from the VCF"
missing_by_snp(vcfR.mac2)
## [1] "cutoff is not specified, exploratory visualizations will be generated"
## Picking joint bandwidth of 0.0497
## filt missingness snps.retained
## 1 0.30 0.133315260 6340
## 2 0.50 0.100704982 5931
## 3 0.60 0.075763049 5537
## 4 0.65 0.070091093 5434
## 5 0.70 0.057696480 5185
## 6 0.75 0.050715990 5028
## 7 0.80 0.038274113 4707
## 8 0.85 0.025748547 4300
## 9 0.90 0.019098945 4030
## 10 0.95 0.006557004 3298
## 11 1.00 0.000000000 2606
assess_missing_data_pca(vcfR = vcfR.mac2, popmap = popmap, thresholds = c(.70,.75,.80,1), clustering = FALSE)
## [1] "cutoff is specified, filtered vcfR object will be returned"
## [1] "25.07% of SNPs fell below a completeness cutoff of 0.7 and were removed from the VCF"
## [1] "cutoff is specified, filtered vcfR object will be returned"
## [1] "27.34% of SNPs fell below a completeness cutoff of 0.75 and were removed from the VCF"
## [1] "cutoff is specified, filtered vcfR object will be returned"
## [1] "31.98% of SNPs fell below a completeness cutoff of 0.8 and were removed from the VCF"
## [1] "cutoff is specified, filtered vcfR object will be returned"
## [1] "62.34% of SNPs fell below a completeness cutoff of 1 and were removed from the VCF"
## [[1]]
## PC1 PC2 PC3 PC4 PC5 PC6
## MLZ23076 -3.2942668 0.57578990 1.76898954 -0.18288791 -0.6455864 0.32994892
## MLZ23077 -2.4448982 0.41292758 1.61288714 0.19714496 -1.8616307 -0.17277867
## MLZ24113 0.9423257 -1.43110890 -3.95956282 5.81471689 0.1099429 -1.66652567
## MLZ24115 0.9812272 -1.37810038 -3.14394622 5.01993354 -0.2825499 -1.27489223
## MLZ24116 1.1544168 -0.50107052 -3.05756337 -2.79669038 -1.4051970 1.63827072
## MLZ24118 1.0449022 -1.17328450 -4.03737672 -2.61888374 -2.4552978 1.02600353
## MLZ26978 3.0592156 -1.48743997 0.67209490 1.94364025 0.5494955 -0.15281577
## MLZ26979 4.0980697 -2.95451617 4.82897935 1.09585037 1.5375207 3.53211360
## MLZ26980 3.7387102 -2.96291864 2.39980852 0.31204513 -1.0115112 0.46755584
## MLZ26984 3.8573000 -1.22240178 3.28131407 0.76675159 0.8048244 1.75938389
## MLZ30405 -2.7710595 1.23578526 0.34056583 -0.45581691 4.6574602 -0.63875281
## MLZ30408 -1.9586398 0.92847938 0.01715874 -0.17037373 -0.2610504 -1.10276798
## MLZ30413 -1.8839394 0.16636599 0.05588067 -0.37369109 0.3574970 -0.69634176
## MLZ30415 -2.2054466 0.55494864 0.29436003 0.04388828 2.1150433 -2.31687328
## MLZ41012 -2.1266419 0.06698752 0.74288542 -0.09247191 -4.2839188 2.39401090
## MLZ41050 -1.8806836 -0.19070397 0.57089620 0.39926739 -3.9120630 -0.02545807
## MLZ41051 -2.6316581 0.17464350 1.28151045 0.95874403 -2.4576924 0.41995546
## MLZ41271 -3.0233544 0.25179905 1.03430320 -1.14162199 0.5706904 0.54281377
## MLZ46829 -3.5478021 0.90794884 1.63792877 0.82644022 1.2517014 0.84263065
## MLZ46851 -3.1917576 0.74965912 1.21765527 0.83301606 -1.5472264 -0.73508160
## MLZ55360 0.1491273 -1.90639726 -3.76046135 -1.51418847 2.2675996 2.59093858
## MLZ55361 0.5754835 -1.52101894 -2.94616169 -2.01344548 2.0458038 2.87510625
## MLZ55363 0.8763117 -0.42545643 -1.35867144 -1.00898606 -0.2249254 0.05903291
## MLZ55366 0.3106687 -0.73516189 -1.26131868 0.58761989 -0.8570816 0.38811311
## MLZ57701 -1.8575950 0.57504928 0.58034934 0.72342269 2.0362192 -0.32229555
## MLZ57703 -2.0639895 0.27450232 0.11745141 -1.77175331 0.8776728 -1.76692060
## MLZ57707 -1.5564811 0.09737942 -0.29024386 0.09305635 1.4536445 0.46785912
## MLZ57719 -1.7307039 0.52951247 -0.29606497 -0.52348233 1.6541550 -0.04768159
## MVZ122216 5.4820434 10.96324446 -0.72634954 1.06185196 -0.2035199 1.97738666
## MVZ122217 4.3239346 -0.47463791 0.67214826 -1.66938024 -0.3335997 -4.75753793
## MVZ122219 3.3335934 -0.11447007 0.59679362 -3.00039727 -1.0428496 -4.27186902
## MVZ122223 4.2415873 0.01366461 1.11375991 -1.34331878 0.4964295 -1.36253140
## PC7 PC8 PC9 PC10 pop
## MLZ23076 -0.49495796 -0.68393072 2.34543404 0.31989879 El Venerable
## MLZ23077 -1.60349318 0.41360721 -0.04685268 0.45238044 El Venerable
## MLZ24113 0.81725363 -1.18227580 0.76123074 -0.41475089 Rancho La Cofradia
## MLZ24115 1.64181452 0.40494907 -1.35377628 1.05484303 Rancho La Cofradia
## MLZ24116 1.44476308 2.24970613 -0.03486463 1.97394486 Rancho La Cofradia
## MLZ24118 0.64413636 2.66605578 1.72071759 3.18008827 Rancho La Cofradia
## MLZ26978 -2.07531177 -0.40164048 0.18552622 0.32338945 Tapalpa
## MLZ26979 2.87741876 0.58664411 0.67406447 -0.29351188 Tapalpa
## MLZ26980 -4.42415581 2.59327576 -3.82047207 1.11749705 Tapalpa
## MLZ26984 1.35545706 0.97668290 0.24454660 -0.35358964 Tapalpa
## MLZ30405 1.21220163 0.54001544 0.99740567 3.16250943 Puerto Lengua de Vaca
## MLZ30408 1.35534786 1.40405041 -1.29675120 0.90000270 Puerto Lengua de Vaca
## MLZ30413 0.26102898 1.87908877 -0.62912955 -0.04287724 Puerto Lengua de Vaca
## MLZ30415 1.32442301 0.64845014 0.18041853 -0.50981586 Puerto Lengua de Vaca
## MLZ41012 0.77413132 -2.71120302 1.70103844 -0.91670237 Lerma
## MLZ41050 0.86488620 0.88644622 0.12547099 -1.09518287 Lerma
## MLZ41051 0.07956763 -0.79928716 -0.67457541 -0.27169254 Lerma
## MLZ41271 1.58075103 -4.45540853 -4.51598041 0.52704660 Lerma
## MLZ46829 -2.26984954 -0.69960060 2.86170558 1.87096156 El Venerable
## MLZ46851 -0.94675488 -0.25514312 0.19005450 0.52196888 El Venerable
## MLZ55360 -4.35620269 -1.31110094 1.84858899 -3.64447336 Lagunita
## MLZ55361 1.12309841 -3.11336492 -2.52566779 1.20097984 Lagunita
## MLZ55363 0.83427790 -0.33266722 0.04196921 -0.69571357 Lagunita
## MLZ55366 -0.50133478 0.07434165 0.07799107 -1.89531401 Lagunita
## MLZ57701 -2.38173119 1.06534472 0.27857053 2.26819647 Puerto Morillos
## MLZ57703 1.88101581 3.49370736 -1.78401212 -4.92201166 Puerto Morillos
## MLZ57707 -0.41295168 -0.49304093 -1.50461936 -1.65225725 Puerto Morillos
## MLZ57719 -0.48248678 1.79617735 1.00463034 -1.87982956 Puerto Morillos
## MVZ122216 -0.66718472 0.18928607 -0.94259717 -0.43954151 Manzamitla
## MVZ122217 -0.93785669 -2.29361237 1.26749158 -0.59546151 Manzamitla
## MVZ122219 -1.14691870 -1.83700305 -1.11110950 1.21695780 Manzamitla
## MVZ122223 2.62961719 -1.29855025 3.73355305 -0.46793945 Manzamitla
## missing
## MLZ23076 0.15823699
## MLZ23077 0.14768786
## MLZ24113 0.29465318
## MLZ24115 0.31387283
## MLZ24116 0.18583815
## MLZ24118 0.18656069
## MLZ26978 0.16864162
## MLZ26979 0.18453757
## MLZ26980 0.17167630
## MLZ26984 0.10447977
## MLZ30405 0.14248555
## MLZ30408 0.23294798
## MLZ30413 0.30722543
## MLZ30415 0.16271676
## MLZ41012 0.15404624
## MLZ41050 0.33786127
## MLZ41051 0.19797688
## MLZ41271 0.23063584
## MLZ46829 0.22369942
## MLZ46851 0.19089595
## MLZ55360 0.30924855
## MLZ55361 0.20317919
## MLZ55363 0.21473988
## MLZ55366 0.19335260
## MLZ57701 0.20476879
## MLZ57703 0.13135838
## MLZ57707 0.16069364
## MLZ57719 0.18208092
## MVZ122216 0.04985549
## MVZ122217 0.11242775
## MVZ122219 0.13236994
## MVZ122223 0.11372832
#again, the sample we used as a reference pops out but only above a certain filtering threshold
#definitely something to watch for going forward
#set 75% cutoff without minor allele filter
vcfRa<-missing_by_snp(vcfR, cutoff = .75)
## [1] "cutoff is specified, filtered vcfR object will be returned"
## [1] "18.79% of SNPs fell below a completeness cutoff of 0.75 and were removed from the VCF"
#convert each to genlight
gena<-vcfR2genlight(vcfRa)
pop(gena)<-gena@ind.names
sample.div.75 <- stamppNeisD(gena, pop = FALSE)
#export for splitstree
#stamppPhylip(distance.mat=sample.div.75, file="~/Desktop/mex.towhees/towhee.75.splits.txt")
#75% completeness cutoff splitstree
knitr::include_graphics(c("/Users/devder/Desktop/mex.towhees/towhee.75.splitstree.png"))
#set 75% cutoff with mac = 2 filter
vcfRb<-missing_by_snp(vcfR.mac2, cutoff = .75)
## [1] "cutoff is specified, filtered vcfR object will be returned"
## [1] "27.34% of SNPs fell below a completeness cutoff of 0.75 and were removed from the VCF"
#convert each to genlight
genb<-vcfR2genlight(vcfRb)
pop(genb)<-genb@ind.names
sample.div.mac.75 <- stamppNeisD(genb, pop = FALSE)
#export for splitstree
#stamppPhylip(distance.mat=sample.div.mac.75, file="~/Desktop/mex.towhees/towhee.75.mac.splits.txt")
#75% completeness cutoff with MAC splitstree
knitr::include_graphics(c("/Users/devder/Desktop/mex.towhees/towhee.75.mac.splitstree.png"))
#set 85% cutoff with mac = 2 filter
vcfRc<-missing_by_snp(vcfR.mac2, cutoff = .85)
## [1] "cutoff is specified, filtered vcfR object will be returned"
## [1] "37.86% of SNPs fell below a completeness cutoff of 0.85 and were removed from the VCF"
#convert each to genlight
genc<-vcfR2genlight(vcfRc)
pop(genc)<-genc@ind.names
sample.div.mac.85 <- stamppNeisD(genc, pop = FALSE)
#export for splitstree
#stamppPhylip(distance.mat=sample.div.mac.85, file="~/Desktop/mex.towhees/towhee.85.mac.splits.txt")
#85% completeness cutoff with MAC splitstree
knitr::include_graphics(c("/Users/devder/Desktop/mex.towhees/towhee.85.mac.splitstree.png"))
#for whatever reason, MVZ12226, our pseudo-reference sample, pops out on a long branch at a certain filtering threshold.
#still not sure why that is occurring
#good news is that other than this sample, missing data proportion is not affecting sample clustering patterns in any discernable way
#plot depth per snp and per sample
dp <- extract.gt(vcfRb, element = "DP", as.numeric=TRUE)
heatmap.bp(dp, rlabels = FALSE)
#plot genotype quality per snp and per sample
gq <- extract.gt(vcfRb, element = "GQ", as.numeric=TRUE)
heatmap.bp(gq, rlabels = FALSE)
###We can use the convenient function ‘write.vcf’ from vcfR to export our filtered vcf file for downstream analyses
#write out vcf
#vcfR::write.vcf(vcfRb, file = "~/Desktop/mex.towhees/towhee.75.mac2.vcf.gz")
#perform linkage filtering to get a reduced vcf with only unlinked (one per UCE) SNPs
vcfR.thin<-distance_thin(vcfRb, min.distance = 10000)
##
|
| | 0%
|
| | 1%
|
|= | 1%
|
|= | 2%
|
|== | 2%
|
|== | 3%
|
|== | 4%
|
|=== | 4%
|
|=== | 5%
|
|==== | 5%
|
|==== | 6%
|
|===== | 6%
|
|===== | 7%
|
|===== | 8%
|
|====== | 8%
|
|====== | 9%
|
|======= | 9%
|
|======= | 10%
|
|======= | 11%
|
|======== | 11%
|
|======== | 12%
|
|========= | 12%
|
|========= | 13%
|
|========= | 14%
|
|========== | 14%
|
|========== | 15%
|
|=========== | 15%
|
|=========== | 16%
|
|============ | 16%
|
|============ | 17%
|
|============ | 18%
|
|============= | 18%
|
|============= | 19%
|
|============== | 19%
|
|============== | 20%
|
|============== | 21%
|
|=============== | 21%
|
|=============== | 22%
|
|================ | 22%
|
|================ | 23%
|
|================ | 24%
|
|================= | 24%
|
|================= | 25%
|
|================== | 25%
|
|================== | 26%
|
|=================== | 26%
|
|=================== | 27%
|
|=================== | 28%
|
|==================== | 28%
|
|==================== | 29%
|
|===================== | 29%
|
|===================== | 30%
|
|===================== | 31%
|
|====================== | 31%
|
|====================== | 32%
|
|======================= | 32%
|
|======================= | 33%
|
|======================= | 34%
|
|======================== | 34%
|
|======================== | 35%
|
|========================= | 35%
|
|========================= | 36%
|
|========================== | 36%
|
|========================== | 37%
|
|========================== | 38%
|
|=========================== | 38%
|
|=========================== | 39%
|
|============================ | 39%
|
|============================ | 40%
|
|============================ | 41%
|
|============================= | 41%
|
|============================= | 42%
|
|============================== | 42%
|
|============================== | 43%
|
|============================== | 44%
|
|=============================== | 44%
|
|=============================== | 45%
|
|================================ | 45%
|
|================================ | 46%
|
|================================= | 46%
|
|================================= | 47%
|
|================================= | 48%
|
|================================== | 48%
|
|================================== | 49%
|
|=================================== | 49%
|
|=================================== | 50%
|
|=================================== | 51%
|
|==================================== | 51%
|
|==================================== | 52%
|
|===================================== | 52%
|
|===================================== | 53%
|
|===================================== | 54%
|
|====================================== | 54%
|
|====================================== | 55%
|
|======================================= | 55%
|
|======================================= | 56%
|
|======================================== | 56%
|
|======================================== | 57%
|
|======================================== | 58%
|
|========================================= | 58%
|
|========================================= | 59%
|
|========================================== | 59%
|
|========================================== | 60%
|
|========================================== | 61%
|
|=========================================== | 61%
|
|=========================================== | 62%
|
|============================================ | 62%
|
|============================================ | 63%
|
|============================================ | 64%
|
|============================================= | 64%
|
|============================================= | 65%
|
|============================================== | 65%
|
|============================================== | 66%
|
|=============================================== | 66%
|
|=============================================== | 67%
|
|=============================================== | 68%
|
|================================================ | 68%
|
|================================================ | 69%
|
|================================================= | 69%
|
|================================================= | 70%
|
|================================================= | 71%
|
|================================================== | 71%
|
|================================================== | 72%
|
|=================================================== | 72%
|
|=================================================== | 73%
|
|=================================================== | 74%
|
|==================================================== | 74%
|
|==================================================== | 75%
|
|===================================================== | 75%
|
|===================================================== | 76%
|
|====================================================== | 76%
|
|====================================================== | 77%
|
|====================================================== | 78%
|
|======================================================= | 78%
|
|======================================================= | 79%
|
|======================================================== | 79%
|
|======================================================== | 80%
|
|======================================================== | 81%
|
|========================================================= | 81%
|
|========================================================= | 82%
|
|========================================================== | 82%
|
|========================================================== | 83%
|
|========================================================== | 84%
|
|=========================================================== | 84%
|
|=========================================================== | 85%
|
|============================================================ | 85%
|
|============================================================ | 86%
|
|============================================================= | 86%
|
|============================================================= | 87%
|
|============================================================= | 88%
|
|============================================================== | 88%
|
|============================================================== | 89%
|
|=============================================================== | 89%
|
|=============================================================== | 90%
|
|=============================================================== | 91%
|
|================================================================ | 91%
|
|================================================================ | 92%
|
|================================================================= | 92%
|
|================================================================= | 93%
|
|================================================================= | 94%
|
|================================================================== | 94%
|
|================================================================== | 95%
|
|=================================================================== | 95%
|
|=================================================================== | 96%
|
|==================================================================== | 96%
|
|==================================================================== | 97%
|
|==================================================================== | 98%
|
|===================================================================== | 98%
|
|===================================================================== | 99%
|
|======================================================================| 99%
|
|======================================================================| 100%
## [1] "2669 out of 5028 input SNPs were not located within 10000 base-pairs of another SNP and were retained despite filtering"
vcfR.thin #retains 2,669 SNPs
## ***** Object of Class vcfR *****
## 32 samples
## 2669 CHROMs
## 2,669 variants
## Object size: 4.7 Mb
## 2.879 percent missing data
## ***** ***** *****
#vcfR::write.vcf(vcfR.thin, file = "~/Desktop/mex.towhees/towhee.75.mac2.unlinked.vcf.gz")