read in your filtered vcf file
library(vcfR)
##
## ***** *** vcfR *** *****
## This is vcfR 1.12.0
## browseVignettes('vcfR') # Documentation
## citation('vcfR') # Citation
## ***** ***** ***** *****
library(adegenet)
## Loading required package: ade4
##
## /// adegenet 2.1.5 is loaded ////////////
##
## > overview: '?adegenet'
## > tutorials/doc/questions: 'adegenetWeb()'
## > bug reports/feature requests: adegenetIssues()
library(StAMPP)
## Loading required package: pegas
## Loading required package: ape
## Registered S3 method overwritten by 'pegas':
## method from
## print.amova ade4
##
## Attaching package: 'pegas'
## The following object is masked from 'package:ape':
##
## mst
## The following object is masked from 'package:ade4':
##
## amova
## The following objects are masked from 'package:vcfR':
##
## getINFO, write.vcf
library(ggplot2)
#read vcf
v<-read.vcfR("~/Desktop/phil.dicaeum/filtered.85.vcf")
## Scanning file to determine attributes.
## File attributes:
## meta lines: 14
## header_line: 15
## variant count: 18499
## column count: 69
##
Meta line 14 read in.
## All meta lines processed.
## gt matrix initialized.
## Character matrix gt created.
## Character matrix gt rows: 18499
## Character matrix gt cols: 69
## skip: 0
## nrows: 18499
## row_num: 0
##
Processed variant 1000
Processed variant 2000
Processed variant 3000
Processed variant 4000
Processed variant 5000
Processed variant 6000
Processed variant 7000
Processed variant 8000
Processed variant 9000
Processed variant 10000
Processed variant 11000
Processed variant 12000
Processed variant 13000
Processed variant 14000
Processed variant 15000
Processed variant 16000
Processed variant 17000
Processed variant 18000
Processed variant: 18499
## All variants processed
#read in your sample info file
pops<-read.csv("~/Desktop/phil.dicaeum/dicaeum.sampling.csv")
#it should look roughly like this:
head(pops)
## ID Tissue Species Taxa State
## 1 D_hypoleucum_18070 18070 hypoleucum Luzon Camarines Norte
## 2 D_hypoleucum_18191 18191 hypoleucum Zamboanga
## 3 D_hypoleucum_14061 14061 hypoleucum Mindanao Dinagat
## 4 D_hypoleucum_14037 14037 hypoleucum Mindanao Dinagat
## 5 D_hypoleucum_14079 14079 hypoleucum Mindanao Dinagat
## 6 D_hypoleucum_14065 14065 hypoleucum Mindanao Dinagat
## Specificlocality
## 1 Luzon Island; Municipality Labo; Barangay Tulay na Lupa, Mt. Labo
## 2 Mindanao Island, City of Zamboanga; Brgy. Pasonanca, Pasonanca Nat. Park
## 3 Dinagat Island, Muncipality of Loreto, Brgy. Santiago, Sitio Cambinliw, Sudlon
## 4 Dinagat Island, Muncipality of Loreto, Brgy. Santiago, Sitio Cambinliw, Sudlon
## 5 Dinagat Island, Muncipality of Loreto, Brgy. Santiago, Sitio Cambinliw, Sudlon
## 6 Dinagat Island, Municipality Loreto, Brgy. Santiago, Sitio Cambinliu, Sudlon
## Latitude Longitude Altitude
## 1 14.039000 122.7860 580
## 2 7.018483 122.0273 750
## 3 10.343680 125.6181 200
## 4 10.343680 125.6181 200
## 5 10.343680 125.6181 200
## 6 10.343680 125.6181 190
#retain only samples that passed all filtering protocols (assumes that the 'ID' column is identical to sample names in the vcf)
pops<-pops[pops$ID %in% colnames(v@gt),]
#reorder the sample info file to match the order of samples in the vcf
pops<-pops[order(match(pops$ID,colnames(v@gt)[-1])),]
Remove the outgroup and repeat the procedure
#remove two nigrilore samples from the vcf
v.sub<-v[,colnames(v@gt) != "D_nigrilore_KU28413" & colnames(v@gt) != "D_nigrilore_KU28414"]
#remove the two samples from the sample info file
pops.sub<-pops[pops$ID != "D_nigrilore_KU28413" & pops$ID != "D_nigrilore_KU28414",]
#convert vcfR to genlight
gen<-vcfR2genlight(v.sub)
#perform PCA
di.pca<-glPca(gen, nf=6)
#isolate PCA scores as a dataframe
di.pca.scores<-as.data.frame(di.pca$scores)
#make sure your sample info file is identical in order to the resulting PCA output
rownames(di.pca.scores) == pops.sub$ID #all should return true
## [1] TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE
## [16] TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE
## [31] TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE
## [46] TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE
#add in the relevant population identifier you would like to color-code by
di.pca.scores$pop<-pops.sub$Taxa
#ggplot PCA colored by your identifier
ggplot(di.pca.scores, aes(x=PC1, y=PC2, color=pop)) +
geom_point(cex = 5, alpha=.5)+
theme_classic()

#implement a custom color scheme
ggplot(di.pca.scores, aes(x=PC1, y=PC2)) +
geom_point(aes(fill=pop), pch=21, size=5)+
scale_fill_manual(values=c("black", "grey", "white"))+
theme_classic()

#if you want to add the proportion of variance explained by each principal component axis, you can calculate it like this:
#porportion of variance explained by PC1
di.pca[["eig"]][1]/sum(di.pca[["eig"]])
## [1] 0.1124937
#PC2
di.pca[["eig"]][2]/sum(di.pca[["eig"]])
## [1] 0.05286486
#replot the PCA with that information included
ggplot(di.pca.scores, aes(x=PC1, y=PC2)) +
geom_point(aes(fill=pop), pch=21, size=5)+
scale_fill_manual(values=c("black", "grey", "white"))+
xlab("PC1, 11.2% variance explained")+
ylab("PC2, 5.3% variance explained")+
theme_classic()

library(ggview)
ggview(units="in", width=4.5, height=3)
#ggsave(file="~/Desktop/phil.dicaeum/dicaeum.pca.pdf", units="in",width=4.5,height=3) #saves g