make PCA

read in your filtered vcf file

library(vcfR)

## 
##    *****       ***   vcfR   ***       *****
##    This is vcfR 1.12.0 
##      browseVignettes('vcfR') # Documentation
##      citation('vcfR') # Citation
##    *****       *****      *****       *****

library(adegenet)

## Loading required package: ade4

## 
##    /// adegenet 2.1.5 is loaded ////////////
## 
##    > overview: '?adegenet'
##    > tutorials/doc/questions: 'adegenetWeb()' 
##    > bug reports/feature requests: adegenetIssues()

library(StAMPP)

## Loading required package: pegas

## Loading required package: ape

## Registered S3 method overwritten by 'pegas':
##   method      from
##   print.amova ade4

## 
## Attaching package: 'pegas'

## The following object is masked from 'package:ape':
## 
##     mst

## The following object is masked from 'package:ade4':
## 
##     amova

## The following objects are masked from 'package:vcfR':
## 
##     getINFO, write.vcf

library(ggplot2)
#read vcf
v<-read.vcfR("~/Desktop/phil.dicaeum/filtered.85.vcf")

## Scanning file to determine attributes.
## File attributes:
##   meta lines: 14
##   header_line: 15
##   variant count: 18499
##   column count: 69
## 
Meta line 14 read in.
## All meta lines processed.
## gt matrix initialized.
## Character matrix gt created.
##   Character matrix gt rows: 18499
##   Character matrix gt cols: 69
##   skip: 0
##   nrows: 18499
##   row_num: 0
## 
Processed variant 1000
Processed variant 2000
Processed variant 3000
Processed variant 4000
Processed variant 5000
Processed variant 6000
Processed variant 7000
Processed variant 8000
Processed variant 9000
Processed variant 10000
Processed variant 11000
Processed variant 12000
Processed variant 13000
Processed variant 14000
Processed variant 15000
Processed variant 16000
Processed variant 17000
Processed variant 18000
Processed variant: 18499
## All variants processed

#read in your sample info file
pops<-read.csv("~/Desktop/phil.dicaeum/dicaeum.sampling.csv")
#it should look roughly like this:
head(pops)

##                   ID Tissue    Species      Taxa           State
## 1 D_hypoleucum_18070  18070 hypoleucum     Luzon Camarines Norte
## 2 D_hypoleucum_18191  18191 hypoleucum Zamboanga                
## 3 D_hypoleucum_14061  14061 hypoleucum  Mindanao         Dinagat
## 4 D_hypoleucum_14037  14037 hypoleucum  Mindanao         Dinagat
## 5 D_hypoleucum_14079  14079 hypoleucum  Mindanao         Dinagat
## 6 D_hypoleucum_14065  14065 hypoleucum  Mindanao         Dinagat
##                                                                 Specificlocality
## 1              Luzon Island; Municipality Labo; Barangay Tulay na Lupa, Mt. Labo
## 2       Mindanao Island, City of Zamboanga; Brgy. Pasonanca, Pasonanca Nat. Park
## 3 Dinagat Island, Muncipality of Loreto, Brgy. Santiago, Sitio Cambinliw, Sudlon
## 4 Dinagat Island, Muncipality of Loreto, Brgy. Santiago, Sitio Cambinliw, Sudlon
## 5 Dinagat Island, Muncipality of Loreto, Brgy. Santiago, Sitio Cambinliw, Sudlon
## 6   Dinagat Island, Municipality Loreto, Brgy. Santiago, Sitio Cambinliu, Sudlon
##    Latitude Longitude Altitude
## 1 14.039000  122.7860      580
## 2  7.018483  122.0273      750
## 3 10.343680  125.6181      200
## 4 10.343680  125.6181      200
## 5 10.343680  125.6181      200
## 6 10.343680  125.6181      190

#retain only samples that passed all filtering protocols (assumes that the 'ID' column is identical to sample names in the vcf)
pops<-pops[pops$ID %in% colnames(v@gt),]
#reorder the sample info file to match the order of samples in the vcf
pops<-pops[order(match(pops$ID,colnames(v@gt)[-1])),]

use the adegenet package to perform PCA on a genlight object

#convert vcfR to genlight
gen<-vcfR2genlight(v)
#perform PCA
di.pca<-glPca(gen, nf=6)
#isolate PCA scores as a dataframe
di.pca.scores<-as.data.frame(di.pca$scores)
#make sure your sample info file is identical in order to the resulting PCA output
rownames(di.pca.scores) == pops$ID #all should return true

##  [1] TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE
## [16] TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE
## [31] TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE
## [46] TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE

#add in the relevant population identifier you would like to color-code by
di.pca.scores$pop<-pops$Taxa

#ggplot PCA colored by your identifier
ggplot(di.pca.scores, aes(x=PC1, y=PC2, color=pop)) +
  geom_point(cex = 5, alpha=.5)+
  theme_classic()

Remove the outgroup and repeat the procedure

#remove two nigrilore samples from the vcf
v.sub<-v[,colnames(v@gt) != "D_nigrilore_KU28413" & colnames(v@gt) != "D_nigrilore_KU28414"]
#remove the two samples from the sample info file
pops.sub<-pops[pops$ID != "D_nigrilore_KU28413" & pops$ID != "D_nigrilore_KU28414",]

#convert vcfR to genlight
gen<-vcfR2genlight(v.sub)
#perform PCA
di.pca<-glPca(gen, nf=6)
#isolate PCA scores as a dataframe
di.pca.scores<-as.data.frame(di.pca$scores)
#make sure your sample info file is identical in order to the resulting PCA output
rownames(di.pca.scores) == pops.sub$ID #all should return true

##  [1] TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE
## [16] TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE
## [31] TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE
## [46] TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE

#add in the relevant population identifier you would like to color-code by
di.pca.scores$pop<-pops.sub$Taxa

#ggplot PCA colored by your identifier
ggplot(di.pca.scores, aes(x=PC1, y=PC2, color=pop)) +
  geom_point(cex = 5, alpha=.5)+
  theme_classic()

#implement a custom color scheme
ggplot(di.pca.scores, aes(x=PC1, y=PC2)) +
  geom_point(aes(fill=pop), pch=21, size=5)+
  scale_fill_manual(values=c("black", "grey", "white"))+
  theme_classic()

#if you want to add the proportion of variance explained by each principal component axis, you can calculate it like this:
#porportion of variance explained by PC1
di.pca[["eig"]][1]/sum(di.pca[["eig"]])

## [1] 0.1124937

#PC2
di.pca[["eig"]][2]/sum(di.pca[["eig"]])

## [1] 0.05286486

#replot the PCA with that information included
ggplot(di.pca.scores, aes(x=PC1, y=PC2)) +
  geom_point(aes(fill=pop), pch=21, size=5)+
  scale_fill_manual(values=c("black", "grey", "white"))+
  xlab("PC1, 11.2% variance explained")+
  ylab("PC2, 5.3% variance explained")+
  theme_classic()

library(ggview)
ggview(units="in", width=4.5, height=3)
#ggsave(file="~/Desktop/phil.dicaeum/dicaeum.pca.pdf", units="in",width=4.5,height=3) #saves g

make PCA

Devon DeRaad

2022-11-30

read in your filtered vcf file

use the adegenet package to perform PCA on a genlight object

Remove the outgroup and repeat the procedure