perform species delimitation

library(adegenet)

## Loading required package: ade4

## Warning: package 'ade4' was built under R version 3.5.2

## 
##    /// adegenet 2.1.3 is loaded ////////////
## 
##    > overview: '?adegenet'
##    > tutorials/doc/questions: 'adegenetWeb()' 
##    > bug reports/feature requests: adegenetIssues()

library(randomForest)

## randomForest 4.6-14

## Type rfNews() to see new features/changes/bug fixes.

library(PCDimension)

## Warning: package 'PCDimension' was built under R version 3.5.2

## Loading required package: ClassDiscovery

## Loading required package: cluster

## Warning: package 'cluster' was built under R version 3.5.2

## Loading required package: oompaBase

## Warning: package 'oompaBase' was built under R version 3.5.2

library(mclust)

## Package 'mclust' version 5.4.7
## Type 'citation("mclust")' for citing this R package in publications.

library(cluster)
library(MASS)
library(factoextra)

## Loading required package: ggplot2

## 
## Attaching package: 'ggplot2'

## The following object is masked from 'package:randomForest':
## 
##     margin

## Welcome! Want to learn more? See two factoextra-related books at https://goo.gl/ve3WBa

library(tsne)
library(vcfR)

## 
##    *****       ***   vcfR   ***       *****
##    This is vcfR 1.12.0 
##      browseVignettes('vcfR') # Documentation
##      citation('vcfR') # Citation
##    *****       *****      *****       *****

library(ggforce)
library(dplyr)

## 
## Attaching package: 'dplyr'

## The following object is masked from 'package:MASS':
## 
##     select

## The following object is masked from 'package:randomForest':
## 
##     combine

## The following objects are masked from 'package:stats':
## 
##     filter, lag

## The following objects are masked from 'package:base':
## 
##     intersect, setdiff, setequal, union

library(gridExtra)

## 
## Attaching package: 'gridExtra'

## The following object is masked from 'package:dplyr':
## 
##     combine

## The following object is masked from 'package:randomForest':
## 
##     combine

#read in vcf as vcfR
vcfR <- read.vcfR("~/Desktop/aph.data/unzipped.filtered.vcf")

## Scanning file to determine attributes.
## File attributes:
##   meta lines: 14
##   header_line: 15
##   variant count: 16307
##   column count: 104
## 
Meta line 14 read in.
## All meta lines processed.
## gt matrix initialized.
## Character matrix gt created.
##   Character matrix gt rows: 16307
##   Character matrix gt cols: 104
##   skip: 0
##   nrows: 16307
##   row_num: 0
## 
Processed variant 1000
Processed variant 2000
Processed variant 3000
Processed variant 4000
Processed variant 5000
Processed variant 6000
Processed variant 7000
Processed variant 8000
Processed variant 9000
Processed variant 10000
Processed variant 11000
Processed variant 12000
Processed variant 13000
Processed variant 14000
Processed variant 15000
Processed variant 16000
Processed variant: 16307
## All variants processed

dim(vcfR@gt)

## [1] 16307    96

#filter out SNPs with anymissing data
vcfR@fix<-vcfR@fix[rowSums(is.na(vcfR@gt)) == 0,]
vcfR@gt<-vcfR@gt[rowSums(is.na(vcfR@gt)) == 0,]
vcfR #95 samples and ~1779 SNPs with no missing data

## ***** Object of Class vcfR *****
## 95 samples
## 28 CHROMs
## 1,779 variants
## Object size: 14.1 Mb
## 0 percent missing data
## *****        *****         *****

#write out vcf with no missing data for input into popVAE
#write.vcf(vcfR, "~/Downloads/aphelocoma.no.missing.vcf.gz")

#convert vcfR into a 'genind' object
data<-vcfR2genind(vcfR)
#convert to genlight
gen<-vcfR2genlight(vcfR)

#scale the genind
data_scaled <- scaleGen(data, center=FALSE, scale=FALSE, NA.method=c("mean"), nf)
data_scaled <- scaleGen(data, center=FALSE, scale=FALSE)

#bring in locality info
locs<-read.csv("~/Desktop/aph.data/rad.sampling.csv")
#subset locs to include only samples that passed filtering, and have been retained in the vcf
locs<-locs[locs$id %in% colnames(vcfR@gt),]
locs[20,5]<-"woodhouseii"
#order by species assingments
locs<-locs[c(33:46,1:19,21:32,57:73,81:95,20,74:80,47:56),]
rownames(locs)<-1:95
#combine lat/longs for nearby sampling localities 
locs[20,c(2,3)]<-locs[21,c(2,3)]
locs[34,c(2,3)]<-locs[37,c(2,3)]
locs[43,c(2,3)]<-locs[44,c(2,3)]
locs[79,c(2,3)]<-locs[80,c(2,3)]
locs[70,c(2,3)]<-locs[68,c(2,3)]
locs[71,c(2,3)]<-locs[68,c(2,3)]
locs[74,c(2,3)]<-locs[64,c(2,3)]

locs$species<-as.character(locs$species)
locs[c(79:85),4]<-c(rep("texana", times=7))
locs$species<-as.factor(locs$species)

#add sampling locality to locs df
locs$number<-c(rep(26, times=6),rep(11, times=8),rep(10, times=5),rep(8, times=4),rep(7, times=2),
               rep(9, times=3),rep(6, times=5),4,5,5,4,4,rep(3, times=3),2,1,1,2,14,14,13,13,13,
               12,12,12,15,15,18,17,17,17,16,16,16,23,23,20,21,21,21,22,21,21,22,22,23,20,20,22,
               18,rep(19, times=7),25,25,rep(24, times=5),25,25,25)
loc.frame<-locs[,c(1,10)]

table(locs$species)

## 
##  californica coerulescens    insularis  sumichrasti       texana  woodhouseii 
##           31            6            8           10            7           33

#split df by species
spec.dfs<-split(locs, locs$species)

#init sampling.df which will be a df of samples grouped by unique lat/long
sampling.df<-data.frame(NULL)
for (i in names(spec.dfs)){
  samps<-spec.dfs[[i]] %>% dplyr::group_by(decimallatitude, decimallongitude) %>% dplyr::summarize(count=n())
  df<-cbind(rep(i, times=nrow(samps)), samps)
  sampling.df<-as.data.frame(rbind(sampling.df, df))
}

## `summarise()` has grouped output by 'decimallatitude'. You can override using the `.groups` argument.

## New names:
## * NA -> ...1

## `summarise()` has grouped output by 'decimallatitude'. You can override using the `.groups` argument.

## New names:
## * NA -> ...1

## `summarise()` has grouped output by 'decimallatitude'. You can override using the `.groups` argument.

## New names:
## * NA -> ...1

## `summarise()` has grouped output by 'decimallatitude'. You can override using the `.groups` argument.

## New names:
## * NA -> ...1

## `summarise()` has grouped output by 'decimallatitude'. You can override using the `.groups` argument.

## New names:
## * NA -> ...1

## `summarise()` has grouped output by 'decimallatitude'. You can override using the `.groups` argument.

## New names:
## * NA -> ...1

#fix colnames
colnames(sampling.df)<-c("species","lat","long","count")
sampling.df$grps<-sampling.df$species

#use only currently recognized species
sampling.df$species[13:15]<-c("woodhouseii","woodhouseii","woodhouseii")
sampling.df$grps[16:19]<-"mex"

#make full map
pac<-map_data("world")
#
ggplot()+
  geom_polygon(data = pac, aes(x=long, y = lat, group = group), fill="grey90", col="black", cex=.1)+
  coord_sf(xlim = c(-123, -80), ylim = c(16, 45)) + 
  geom_point(data = sampling.df, aes(x = long, y = lat, color=grps, shape=species, size=count), alpha =.8, show.legend=TRUE) +
  scale_shape_manual(values = c(15,18,17,16))+
  scale_size_continuous(breaks = c(2,4,6,8), range = c(4,8))+
  theme_classic()+
  geom_text(data = sampling.df, aes(x = long, y = lat, label = c(1:10,26,11,25,24,19,23,22,21,20,18,17,14,13,12,16,15)),size = 3)+
  guides(shape = guide_legend(override.aes = list(size = 4), order=1, label.theme = element_text(face = "italic"), title=NULL),
         size = guide_legend(nrow = 1, order = 2), color=FALSE)+
  theme(legend.position = c(0.01, 0.01), legend.justification = c(0.01, 0.01),
        legend.background = element_blank())+
  xlab("longitude")+
  ylab("latitude")

#####
#Perform DAPC
#####
#Do dapc and choose groups
#assign samples to the number of groups present in popmap, retain all PCAs
#grp<-find.clusters(gen, max.n.clust=20)
#set manually the values I chose based on looking at the scree plots. DAPC can't discriminate significantly between 5 and 6 species models based on BIC.
grp<-find.clusters(gen, n.pca=79, n.clust=5)
#run dapc, retain all discriminant axes, and enough PC axes to explain 75% of variance
#dapc1<-dapc(gen, grp$grp)
#set manually the values I chose from scree plots
dapc1<-dapc(gen, grp$grp, n.pca = 30, n.da = 5)

plot.df<-as.data.frame(dapc1$ind.coord)
ggplot(data=plot.df, aes(x=LD3, y=LD4, color=grp$grp))+
  geom_point(cex=3)+
  theme_classic()+
  labs(x="Linear discriminant 3",y="Linear discriminant 4")

#  scale_color_manual(name="Cluster",
#                     limits = c(1,6,3,4,2,5),
#                     values=c("red","blue","green","orange","purple","pink"),
#                     labels = c("Island","California","Woodhouse","Texas","Sumichrast","Florida"))+
  #theme(legend.position = "none")

###############################################
###############################################
# into the Random Forest, unsupervised
###############################################
###############################################

# convert genind scaled data to factors for randomForest
data_conv <- as.data.frame(data_scaled)
data_conv[is.na(data_conv)] <- ""
data_conv[sapply(data_conv, is.integer)] <- lapply(data_conv[sapply(data_conv, is.integer)], as.factor)
data_conv[sapply(data_conv, is.character)] <- lapply(data_conv[sapply(data_conv, is.character)], as.factor)
nsamp <- nrow(data_conv)

# unsupervised random forest
set.seed(69)
rftest <- randomForest(data_conv, ntree=5000)
#rftest <- randomForest(pca1$tab, ntree=500)
#rftest <- randomForest(data_scaled, ntree=500)

###############
# classic MDS
###############
# cMDS with optimal number of components to retain using broken-stick
cmdsplot1 <- MDSplot(rf=rftest, fac=plot.df$clust.pc, k=10) # may need to adjust number of dimensions if given error

## Warning in RColorBrewer::brewer.pal(nlevs, "Set1"): minimal value for n is 3, returning requested palette with 3 different levels

cmdsplot_bstick <- PCDimension::bsDimension(cmdsplot1$eig)
cmdsplot2 <- MDSplot(rftest, plot.df$clust.pc, cmdsplot_bstick)

## Warning in RColorBrewer::brewer.pal(nlevs, "Set1"): minimal value for n is 3, returning requested palette with 3 different levels

#cMDS plot from random forest run with the 10 groups identified by dapc labeled
cmds<-as.data.frame(cmdsplot2$points)
plot.df$rf.cmds1<-cmdsplot2$points[,1]
plot.df$rf.cmds2<-cmdsplot2$points[,2]
plot.df$rf.cmds3<-cmdsplot2$points[,3]
plot.df$rf.cmds4<-cmdsplot2$points[,4]

pop<-c()
# pam clustering
for (i in 2:10){
  pop[i]<-mean(silhouette(pam(cmdsplot2$points, i))[, "sil_width"])
}
plot(pop,type = "o", xlab = "K", ylab = "PAM silhouette", main="random forest PAM")

#prefers 5 groups, matching dapc
DAPC_pam_clust_prox <- pam(cmdsplot2$points, 5)
plot.df$rf.cmds.pam<-as.factor(DAPC_pam_clust_prox$clustering)

#plot with color = island and circles showing pam PCA clustering
ggplot(data=plot.df, aes(x=rf.cmds1, y=rf.cmds4, col=rf.cmds.pam))+
  geom_point(cex=3)+
  theme_classic()+
  labs(x="Dimension 1",y="Dimension 4")

  #+theme(legend.position = "none")

# determine optimal k from cMDS via hierarchical clustering with BIC
# adjust G option to reasonable potential cluster values, e.g. for up to 12 clusters, G=1:12
cmdsplot_clust <- Mclust(cmdsplot2$points)
cmdsplot_clust$G

## [1] 6

#hierarchical clustering of random forest identifies 6 groups 

# cMDS with optimal k and clusters of RF via hierarchical clustering
plot.df$rf.cmds.mclust<-as.factor(cmdsplot_clust$classification)
ggplot(data=plot.df, aes(x=rf.cmds1, y=rf.cmds4, col=rf.cmds.mclust))+
  geom_point(cex=3)+
  theme_classic()+
  labs(x="Dimension 1",y="Dimension 4")

  #+theme(legend.position = "none")

###############################################
###############################################
# t-SNE
###############################################
###############################################

#perform PCA
pca1 <- dudi.pca(data_scaled, center=FALSE, scale=FALSE, scannf=FALSE, nf=10)

# t-SNE on principal components of scaled data
# adjust perplexity, initial_dims
# can do k=3 for 3D plot
# should do only <50 variables
# can do it on pca1$li (if you reduce the number of components), or on cmdsplot2$points
# PCA, can adjust nf to include more components
set.seed(689)
tsne_p5<-tsne(pca1$li, max_iter=5000, perplexity=5, initial_dims=5)

## sigma summary: Min. : 0.135687046791248 |1st Qu. : 0.196874003775216 |Median : 0.236041063421948 |Mean : 0.291447030256798 |3rd Qu. : 0.307988560158939 |Max. : 0.688919195471719 |

## Epoch: Iteration #100 error is: 24.7457133991451

## Epoch: Iteration #200 error is: 3.20336168814578

## Epoch: Iteration #300 error is: 2.43059395528619

## Epoch: Iteration #400 error is: 2.14210748851023

## Epoch: Iteration #500 error is: 1.73195760899423

## Epoch: Iteration #600 error is: 1.31117062472133

## Epoch: Iteration #700 error is: 1.00085810058363

## Epoch: Iteration #800 error is: 1.41393790353311

## Epoch: Iteration #900 error is: 1.24725423119464

## Epoch: Iteration #1000 error is: 1.07037731154596

## Epoch: Iteration #1100 error is: 0.926760895227284

## Epoch: Iteration #1200 error is: 0.783225938153033

## Epoch: Iteration #1300 error is: 0.693972276527729

## Epoch: Iteration #1400 error is: 0.591786945473336

## Epoch: Iteration #1500 error is: 0.513167097947575

## Epoch: Iteration #1600 error is: 0.421325739294948

## Epoch: Iteration #1700 error is: 0.602504143150802

## Epoch: Iteration #1800 error is: 0.408985223493635

## Epoch: Iteration #1900 error is: 0.366406579484385

## Epoch: Iteration #2000 error is: 0.414252054618815

## Epoch: Iteration #2100 error is: 0.377309824757537

## Epoch: Iteration #2200 error is: 0.255165328715125

## Epoch: Iteration #2300 error is: 0.231292007204337

## Epoch: Iteration #2400 error is: 0.205024562664131

## Epoch: Iteration #2500 error is: 0.144856768148042

## Epoch: Iteration #2600 error is: 0.137165701069451

## Epoch: Iteration #2700 error is: 0.132907336086374

## Epoch: Iteration #2800 error is: 0.130902708064943

## Epoch: Iteration #2900 error is: 0.129679624928636

## Epoch: Iteration #3000 error is: 0.128836161199226

## Epoch: Iteration #3100 error is: 0.128248956653936

## Epoch: Iteration #3200 error is: 0.127797093907453

## Epoch: Iteration #3300 error is: 0.12746667577204

## Epoch: Iteration #3400 error is: 0.12720212257459

## Epoch: Iteration #3500 error is: 0.127002740067447

## Epoch: Iteration #3600 error is: 0.126841084246458

## Epoch: Iteration #3700 error is: 0.126708865073659

## Epoch: Iteration #3800 error is: 0.126600282046506

## Epoch: Iteration #3900 error is: 0.126509814109857

## Epoch: Iteration #4000 error is: 0.126437535713752

## Epoch: Iteration #4100 error is: 0.126376318640734

## Epoch: Iteration #4200 error is: 0.126324745868129

## Epoch: Iteration #4300 error is: 0.126282261688541

## Epoch: Iteration #4400 error is: 0.126246977344483

## Epoch: Iteration #4500 error is: 0.126217314634362

## Epoch: Iteration #4600 error is: 0.126191381978424

## Epoch: Iteration #4700 error is: 0.126170100459412

## Epoch: Iteration #4800 error is: 0.126151553554613

## Epoch: Iteration #4900 error is: 0.126135869489738

## Epoch: Iteration #5000 error is: 0.126122662296132

#add tsne coordinates to plotting df
plot.df$tsne.1<-tsne_p5[,1]
plot.df$tsne.2<-tsne_p5[,2]

pop<-c()
# pam clustering
for (i in 2:10){
  pop[i]<-mean(silhouette(pam(tsne_p5, i))[, "sil_width"])
}
plot(pop,type = "o", xlab = "K", ylab = "PAM silhouette", main="t-SNE PAM")

#pam prefers 6 groups
tsne.pam<-pam(tsne_p5, 6)

# tsne with optimal k and clustering identified via PAM
plot.df$tsne.pam<-as.factor(tsne.pam$clustering)
ggplot(data=plot.df, aes(x=tsne.1, y=tsne.2, col=tsne.pam))+
  geom_point(cex=3)+
  theme_classic()+
  labs(x="Dimension 1",y="Dimension 2")

  #+theme(legend.position = "none")

# determine optimal k of tSNE via hierarchical clustering with BIC
# adjust G option to reasonable potential cluster values, e.g. for up to 12 clusters, G=1:12
tsne_p5_clust <- Mclust(tsne_p5)
tsne_p5_clust$G # of clusters preferred

## [1] 6

# tsne with optimal k and clustering identified via PAM
plot.df$tsne.mclust<-as.factor(tsne_p5_clust$classification)
ggplot(data=plot.df, aes(x=tsne.1, y=tsne.2, col=tsne.mclust))+
  geom_point(cex=3)+
  theme_classic()+
  labs(x="Dimension 1",y="Dimension 2")

  #+theme(legend.position = "none")

#####
#popVAE
#####

#run the following code in order to execute popVAE from the command line 10 times
#for i in {1..10}; do mkdir out.${i}; popvae.py --infile /Users/devder/Downloads/aphelocoma.no.missing.vcf.gz --out out.${i}/scrub.rad --patience 500 --seed 99; done

#bring in latent space coordinates from 10 popVAE runs and do clustering analysis here:
popvae<-list()
for (i in 1:10){
  popvae[[i]]<-read.table(paste0("/Users/devder/popvae/out.",{i},"/scrub.rad_latent_coords.txt"), header = T)
}
#check that the order matches the order of our existing data frame
rownames(plot.df) == popvae[[i]]$sampleID

##  [1] TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE
## [16] TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE
## [31] TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE
## [46] TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE
## [61] TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE
## [76] TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE
## [91] TRUE TRUE TRUE TRUE TRUE

## pam clustering on each of the 10 iterations
pop<-c()
pardefault <- par()
par(mfrow=c(2,2))
for (j in 1:10){
  for (i in 2:10){
    pop[i]<-mean(silhouette(pam(popvae[[j]][,c(1,2)], i))[, "sil_width"])
  }
plot(pop,type = "o", xlab = "K", ylab = "PAM silhouette", main = paste0("iteration ",j))
}

par(pardefault)

## Warning in par(pardefault): graphical parameter "cin" cannot be set

## Warning in par(pardefault): graphical parameter "cra" cannot be set

## Warning in par(pardefault): graphical parameter "csi" cannot be set

## Warning in par(pardefault): graphical parameter "cxy" cannot be set

## Warning in par(pardefault): graphical parameter "din" cannot be set

## Warning in par(pardefault): graphical parameter "page" cannot be set

##pam prefers 6 groups twice, 8 groups once, and 7 groups 7 times.
#I will pick one of the 7 grouped iterations to present
vae.pam<-pam(popvae[[1]][,c(1,2)], 7)
plot.df$vae.ld1<-popvae[[1]]$mean1
plot.df$vae.ld2<-popvae[[1]]$mean2

## vae with optimal k and clustering identified via PAM
plot.df$vae.pam<-as.factor(vae.pam$clustering)
ggplot(data=plot.df, aes(x=vae.ld1, y=vae.ld2, col=vae.pam))+
  geom_point(cex=3)+
  theme_classic()+
  labs(x="LD 1",y="LD 2")

  #+theme(legend.position = "none")

#
## determine optimal k of vae via hierarchical clustering with BIC
## adjust G option to reasonable potential cluster values, e.g. for up to 12 clusters, G=1:12
vae_clust <- Mclust(popvae[[1]][,c(1,2)])
#optimal number of clusters
vae_clust$G

## [1] 7

#
## popvae with optimal k and clustering identified via PAM
plot.df$vae.mclust<-as.factor(vae_clust$classification)
ggplot(data=plot.df, aes(x=vae.ld1, y=vae.ld2, col=as.factor(vae_clust$classification)))+
  geom_point(cex=3)+
  theme_classic()+
  labs(x="LD 1",y="LD 2")

  #+theme(legend.position = "none")

sampling.df[16,3]<- -102.3
sampling.df[17,3]<- -100.5
map<-ggplot()+
  geom_polygon(data = pac, aes(x=long, y = lat, group = group), fill="grey90", col="black", cex=.1)+
  coord_sf(xlim = c(-123, -80), ylim = c(16, 45)) + 
  geom_point(data = sampling.df, aes(x = long, y = lat, color=grps, shape=species, size=count), alpha =.8, show.legend=TRUE) +
  scale_shape_manual(values = c(15,18,17,16))+
  scale_size_continuous(breaks = c(2,5,8), range = c(4,8))+
  theme_classic()+
  geom_text(data = sampling.df, aes(x = long, y = lat, label = c(1:10,26,11,25,24,19,23,22,21,20,18,17,14,13,12,16,15)),size = 3)+
  #guides(shape = guide_legend(override.aes = list(size = 4), order=1, label.theme = element_text(face = "italic"),
  #                            title=NULL),size = guide_legend(nrow = 1, order = 2), color=FALSE)+
  guides(shape = FALSE,size = guide_legend(nrow = 1, order = 2), color=FALSE)+
  theme(legend.position = c(0.01, 0.01), legend.justification = c(0.01, 0.01),
        legend.background = element_blank())+
  xlab("longitude")+
  ylab("latitude")

d.pc1<-ggplot(data=plot.df, aes(x=LD3, y=LD4, color=grp$grp))+
  geom_point(cex=3)+
  theme_classic()+
  labs(x="LD 3",y="LD 4")+
  theme(legend.position = "none", axis.text = element_blank(),plot.title = element_text(hjust = 0.5))+
  ggtitle("DAPC, K=5")

new.grp<-find.clusters(gen, n.pca=79, n.clust=6)
new.dapc1<-dapc(gen, new.grp$grp, n.pca = 30, n.da = 5)
plot.df$dpcld3<-new.dapc1$ind.coord[,3]
plot.df$dpcld4<-new.dapc1$ind.coord[,4]

d.pc2<-ggplot(data=plot.df, aes(x=dpcld3, y=dpcld4, color=new.grp$grp))+
  geom_point(cex=3)+
  theme_classic()+
  labs(x="LD 3",y="LD 4")+
  theme(legend.position = "none", axis.text = element_blank(),plot.title = element_text(hjust = 0.5))+
  ggtitle("DAPC, K=6")

#plot with color = island and circles showing pam PCA clustering
rf.pam<-ggplot(data=plot.df, aes(x=rf.cmds1, y=rf.cmds4, col=rf.cmds.pam))+
  geom_point(cex=3)+
  theme_classic()+
  labs(x="Dimension 1",y="Dimension 4")+
  theme(legend.position = "none", axis.text = element_blank(),plot.title = element_text(hjust = 0.5))+
  ggtitle("RF + PAM")

# cMDS with optimal k and clusters of RF via hierarchical clustering
rf.mclust<-ggplot(data=plot.df, aes(x=rf.cmds1, y=rf.cmds4, col=rf.cmds.mclust))+
  geom_point(cex=3)+
  theme_classic()+
  labs(x="Dimension 1",y="Dimension 4")+
  theme(legend.position = "none", axis.text = element_blank(),plot.title = element_text(hjust = 0.5))+
  ggtitle("RF + HAC")

# tsne with optimal k and clustering identified via PAM
tsne.pam<-ggplot(data=plot.df, aes(x=tsne.1, y=tsne.2, col=tsne.pam))+
  geom_point(cex=3)+
  theme_classic()+
  labs(x="Dimension 1",y="Dimension 2")+
  theme(legend.position = "none", axis.text = element_blank(),plot.title = element_text(hjust = 0.5))+
  ggtitle("t-SNE + PAM")

# tsne with optimal k and clustering identified via mclust
tsne.mclust<-ggplot(data=plot.df, aes(x=tsne.1, y=tsne.2, col=tsne.mclust))+
  geom_point(cex=3)+
  theme_classic()+
  labs(x="Dimension 1",y="Dimension 2")+
  theme(legend.position = "none", axis.text = element_blank(),plot.title = element_text(hjust = 0.5))+
  ggtitle("t-SNE + HAC")

#popvae
vae.pam<-ggplot(data=plot.df, aes(x=vae.ld1, y=vae.ld2, col=vae.pam))+
  geom_point(cex=3)+
  theme_classic()+
  labs(x="LD 1",y="LD 2")+
  theme(legend.position = "none", axis.text = element_blank(),plot.title = element_text(hjust = 0.5))+
  ggtitle("VAE + PAM")

## popvae with optimal k and clustering identified via mclust
vae.mclust<-ggplot(data=plot.df, aes(x=vae.ld1, y=vae.ld2, col=as.factor(vae_clust$classification)))+
  geom_point(cex=3)+
  theme_classic()+
  labs(x="LD 1",y="LD 2")+
  theme(legend.position = "none", axis.text = element_blank(),plot.title = element_text(hjust = 0.5))+
  ggtitle("VAE + HAC")


#make same figure and add compoplot
lay <- rbind(c(1,1),
             c(2,3),
             c(4,5),
             c(6,7),
             c(8,9))
g1<-arrangeGrob(map,d.pc1,d.pc2,rf.pam,rf.mclust,tsne.pam,tsne.mclust,vae.pam,vae.mclust,
                layout_matrix = lay, heights=c(2,1,1,1,1))

#reorder compoplot and dotchart as Island, Cali, Woodhouse, Tex, Sumi, Florida
#ggsave(filename = "~/Downloads/specs.pdf", plot = g1, width = 4.25, height=11, units="in")

grid.arrange(map,d.pc1,d.pc2,rf.pam,rf.mclust,tsne.pam,tsne.mclust,vae.pam,vae.mclust,
                layout_matrix = lay, heights=c(2,1,1,1,1))

Scrub-Jay species delimitation

Devon DeRaad

3/27/2021

perform species delimitation