run.eems

#from https://github.com/dipetkov/eems/blob/master/README.md
#points outer from http://www.birdtheme.org/useful/v3tool.html
#devtools::install_github("dipetkov/eems/plotting/rEEMSplots")
#install.packages("rworldxtra")
#install.packages("rgeos")
#setwd("/Users/devder/Downloads/eems-master/plotting")
#install.packages("rEEMSplots", repos=NULL, type="source")
library("rEEMSplots")

## rEEMSplots uses DarkOrange to Blue color scheme, with 'white' as the midpoint color.
## It combines two color schemes from the 'dichromat' package, which itself is based on
## a collection of color schemes for scientific data graphics:
##  Light A and Bartlein PJ (2004).
##  The End of the Rainbow? Color Schemes for Improved Data Graphics.
##  EOS Transactions of the American Geophysical Union, 85(40), 385.
## See also http://geog.uoregon.edu/datagraphics/color_scales.htm

library("rgdal")

## Loading required package: sp

## Please note that rgdal will be retired by the end of 2023,
## plan transition to sf/stars/terra functions using GDAL and PROJ
## at your earliest convenience.
## 
## rgdal: version: 1.5-28, (SVN revision 1158)
## Geospatial Data Abstraction Library extensions to R successfully loaded
## Loaded GDAL runtime: GDAL 3.2.1, released 2020/12/29
## Path to GDAL shared files: /Library/Frameworks/R.framework/Versions/4.1/Resources/library/rgdal/gdal
## GDAL binary built with GEOS: TRUE 
## Loaded PROJ runtime: Rel. 7.2.1, January 1st, 2021, [PJ_VERSION: 721]
## Path to PROJ shared files: /Library/Frameworks/R.framework/Versions/4.1/Resources/library/rgdal/proj
## PROJ CDN enabled: FALSE
## Linking to sp version:1.4-6
## To mute warnings of possible GDAL/OSR exportToProj4() degradation,
## use options("rgdal_show_exportToProj4_warnings"="none") before loading sp or rgdal.
## Overwritten PROJ_LIB was /Library/Frameworks/R.framework/Versions/4.1/Resources/library/rgdal/proj

library("rworldmap")

## ### Welcome to rworldmap ###

## For a short introduction type :   vignette('rworldmap')

library("rworldxtra")
library("ggplot2")
library("dplyr")

## 
## Attaching package: 'dplyr'

## The following objects are masked from 'package:stats':
## 
##     filter, lag

## The following objects are masked from 'package:base':
## 
##     intersect, setdiff, setequal, union

library("vcfR")

## 
##    *****       ***   vcfR   ***       *****
##    This is vcfR 1.12.0 
##      browseVignettes('vcfR') # Documentation
##      citation('vcfR') # Citation
##    *****       *****      *****       *****

library("SNPfiltR")

#bring in and subset files
#read in vcf as vcfR
vcfR <- read.vcfR("~/Desktop/aph.data/unzipped.filtered.vcf")

## Scanning file to determine attributes.
## File attributes:
##   meta lines: 14
##   header_line: 15
##   variant count: 16307
##   column count: 104
## 
Meta line 14 read in.
## All meta lines processed.
## gt matrix initialized.
## Character matrix gt created.
##   Character matrix gt rows: 16307
##   Character matrix gt cols: 104
##   skip: 0
##   nrows: 16307
##   row_num: 0
## 
Processed variant 1000
Processed variant 2000
Processed variant 3000
Processed variant 4000
Processed variant 5000
Processed variant 6000
Processed variant 7000
Processed variant 8000
Processed variant 9000
Processed variant 10000
Processed variant 11000
Processed variant 12000
Processed variant 13000
Processed variant 14000
Processed variant 15000
Processed variant 16000
Processed variant: 16307
## All variants processed

#bring in locs
locs<-read.csv("~/Desktop/aph.data/rad.sampling.csv")
#subset locs to include only samples that passed filtering, and have been retained in the vcf
locs<-locs[locs$id %in% colnames(vcfR@gt),]
locs[20,5]<-"woodhouseii"
rownames(locs)<-1:95
locs$species<-as.character(locs$species)
locs[74:80,4]<-"texana"
locs[81:95,4]<-"mex_wood"
locs[c(20,57:73),4]<-"wood_us"
#locs[33:38,4]<-"zfla"
locs$species<-as.factor(locs$species)
table(locs$species)

## 
##  californica coerulescens    insularis     mex_wood  sumichrasti       texana 
##           31            6            8           15           10            7 
##      wood_us 
##           18

#subset only populations from western us, tex and northern mex
locs.wood<-locs[c(20,57:95),]
rownames(locs.wood)<-1:40
vcf.wood<-vcfR[,c(1,21,58:96)]
dim(vcf.wood)

## variants fix_cols  gt_cols 
##    16307        8       41

vcf.wood@gt[1,]

##                                  FORMAT                    A_woodhouseii_334171 
##                        "GT:DP:AD:GQ:GL"     "0/0:22:22,0:40:-0.00,-8.62,-86.50" 
##                    A_woodhouseii_334062                    A_woodhouseii_334063 
##     "0/0:14:14,0:40:-0.00,-6.21,-55.57"     "0/0:18:18,0:40:-0.00,-7.42,-71.04" 
##                    A_woodhouseii_334086                    A_woodhouseii_334088 
##     "0/0:12:12,0:40:-0.00,-5.61,-47.84"     "0/0:18:18,0:40:-0.00,-7.42,-71.04" 
##                    A_woodhouseii_334096                    A_woodhouseii_334132 
##       "0/0:6:6,0:40:-0.00,-3.81,-24.65"    "0/0:57:57,0:40:0.00,-19.16,-221.81" 
##                    A_woodhouseii_334133                    A_woodhouseii_334134 
##    "0/0:84:84,0:40:0.00,-27.28,-326.19"     "0/0:13:13,0:40:-0.00,-5.91,-51.71" 
##                    A_woodhouseii_334142                    A_woodhouseii_334153 
##     "0/0:25:25,0:40:-0.00,-9.52,-98.10"     "0/0:18:18,0:40:-0.00,-7.42,-71.04" 
##                    A_woodhouseii_334161                    A_woodhouseii_334188 
##   "0/0:72:72,0:40:-0.00,-23.67,-279.80"     "0/0:13:13,0:40:-0.00,-5.91,-51.71" 
##                    A_woodhouseii_334190                    A_woodhouseii_334196 
##   "0/0:52:52,0:40:-0.00,-17.65,-202.48"   "0/0:49:49,0:40:-0.00,-16.75,-190.88" 
##                    A_woodhouseii_334210                    A_woodhouseii_334211 
##   "0/0:30:30,0:40:-0.00,-11.03,-117.43"   "0/0:41:41,0:40:-0.00,-14.34,-159.95" 
##                    A_woodhouseii_334217                    A_woodhouseii_334230 
##       "0/0:5:5,0:40:-0.00,-3.51,-20.78"     "0/0:14:14,0:40:-0.00,-6.21,-55.57" 
##                    A_woodhouseii_334241                    A_woodhouseii_334242 
##       "0/0:5:5,0:40:-0.00,-3.51,-20.78"                                      NA 
##                    A_woodhouseii_334243                    A_woodhouseii_334244 
##                                      NA   "0/0:49:49,0:40:-0.00,-16.75,-190.88" 
##                    A_woodhouseii_334247                    A_woodhouseii_334250 
##     "0/0:10:10,0:40:-0.00,-5.01,-40.11"   "0/0:50:50,0:40:-0.00,-17.05,-194.75" 
##                    A_woodhouseii_343458                    A_woodhouseii_343461 
##       "0/0:7:7,0:40:-0.00,-4.11,-28.51"   "0/0:43:43,0:40:-0.00,-14.94,-167.69" 
##                    A_woodhouseii_343476                    A_woodhouseii_343480 
##     "0/0:17:17,0:40:-0.00,-7.12,-67.17"   "0/0:43:43,0:40:-0.00,-14.94,-167.69" 
##                    A_woodhouseii_343481                    A_woodhouseii_343483 
##   "0/0:29:29,0:40:-0.00,-10.73,-113.56"  "0/0:171:171,0:40:0.00,-53.47,-662.53" 
##                    A_woodhouseii_343497                    A_woodhouseii_393605 
##       "0/0:7:7,0:40:-0.00,-4.11,-28.51"  "0/0:148:148,0:40:0.00,-46.54,-573.61" 
##                    A_woodhouseii_393606                    A_woodhouseii_393697 
##   "0/0:31:31,0:40:-0.00,-11.33,-121.30"    "0/0:81:81,0:40:0.00,-26.38,-314.59" 
##                    A_woodhouseii_393698                    A_woodhouseii_393702 
##     "0/0:18:18,0:40:-0.00,-7.42,-71.04"    "0/0:70:70,0:40:0.00,-23.07,-272.07" 
##                    A_woodhouseii_393712                    A_woodhouseii_393713 
## "0/0:106:106,0:40:-0.00,-33.90,-411.24"   "0/0:28:28,0:40:-0.00,-10.43,-109.70" 
##                    A_woodhouseii_395768 
##  "0/0:252:252,0:40:0.00,-77.84,-975.67"

#remove SNPs with no data using SNPfiltR function
vcf.wood<-SNPfiltR::min_mac(vcf.wood, min.mac = 1)

## [1] "52.15% of SNPs fell below a minor allele count of 1 and were removed from the VCF"

vcf.wood #7803 SNPs

## ***** Object of Class vcfR *****
## 40 samples
## 34 CHROMs
## 7,803 variants
## Object size: 26.6 Mb
## 7.009 percent missing data
## *****        *****         *****

#prepare input files for eems
###1### make 'coord' input file
#make sure vcf order matches sample order
locs.wood$id == colnames(vcf.wood@gt)[-1]

##  [1] TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE
## [16] TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE
## [31] TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE

#write out the lat/long for each of the 40 W US, N Mex, and Tex woodhouse's samples (tab separated)
#write.table(locs.wood[,c(2,3)], "~/Downloads/eems/datapath.coord.txt", col.names = F, row.names = F, quote = F, sep='\t')

###2### make genetic input file
#define function to convert a genotype matrix into a pairwise difference matrix
bed2diffs_v2 <- function(genotypes) {
  nIndiv <- nrow(genotypes)
  nSites <- ncol(genotypes)
  missing <- is.na(genotypes)
  ## Impute NAs with the column means (= twice the allele frequencies)
  geno_means <- colMeans(genotypes, na.rm = TRUE)
  # nIndiv rows of genotype means
  geno_means <- matrix(geno_means, nrow = nIndiv, ncol = nSites, byrow = TRUE)
  ## Set the means which correspond to observed genotypes to 0
  geno_means[missing == FALSE] <- 0
  ## Set the missing genotypes to 0 (used to be NA)
  genotypes[missing == TRUE] <- 0
  genotypes <- genotypes + geno_means
  similarities <- genotypes %*% t(genotypes) / nSites
  self_similarities <- diag(similarities)
  vector1s <- rep(1, nIndiv)
  diffs <- self_similarities %*% t(vector1s) + vector1s %*% t(self_similarities) - 2 * similarities
  diffs
}

#make genotype matrix
vcf.wood<-min_mac(vcf.wood, min.mac = 1) #make sure all invariant sites are removed

## [1] "0% of SNPs fell below a minor allele count of 1 and were removed from the VCF"

#7,803 variants
gen.wood<-vcfR2genlight(vcf.wood)

## Loading required namespace: adegenet

geno.mat<-as.matrix(gen.wood)
wood.diff.mat<-bed2diffs_v2(genotypes = geno.mat)
#make sure order matches
rownames(geno.mat) == locs.wood$id

##  [1] TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE
## [16] TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE
## [31] TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE

#write out the pairwise difference matrix to file
#write.table(wood.diff.mat, "~/Downloads/eems/datapath.diffs.txt", col.names = F, row.names = F, quote = F, sep='\t')

###3### make 'outer' boundaries input file
#make datapath.outer by making a counter-clockwise circle path here: http://www.birdtheme.org/useful/v3tool.html

use this code to run three eems iterations on the KU HPCC

#!/bin/sh
#
#SBATCH --job-name=aph.eems           # Job Name
#SBATCH --nodes=1             # nodes
#SBATCH --cpus-per-task=1               # CPU allocation per Task
#SBATCH --partition=bi            # Name of the Slurm partition used
#SBATCH --chdir=/home/d669d153/work/aph.rad/eems        # Set working d$
#SBATCH --mem-per-cpu=3gb            # memory requested
#SBATCH --time=10000

module load eems

runeems_snps --params /home/d669d153/work/aph.rad/eems/params.ndemes100.txt
runeems_snps --params /home/d669d153/work/aph.rad/eems/params.ndemes200.txt
runeems_snps --params /home/d669d153/work/aph.rad/eems/params.ndemes400.txt

example ‘params.ndemes100.txt’ file below

datapath = /home/d669d153/work/aph.rad/eems/data/datapath
mcmcpath = /home/d669d153/work/aph.rad/eems/rep100
nIndiv = 40
nSites = 7803
nDemes = 100
diploid = true
numMCMCIter = 2000000
numBurnIter = 1000000
numThinIter = 9999

vis eems results run on cluster using R functions

mcmcpath = "~/Downloads/eems/rep100"
plotpath = "~/Downloads/eems/rep100/plots"

projection_none <- "+proj=longlat +datum=WGS84"
projection_mercator <- "+proj=merc +datum=WGS84"

eemsResults <- rEEMSplots::eems.plots(mcmcpath, plotpath, longlat = T,out.png = FALSE, projection.in = projection_none, projection.out = projection_mercator)

## Input projection: +proj=longlat +datum=WGS84
## Output projection: +proj=merc +datum=WGS84

## Loading rgdal (required by projection.in)

## Using 'euclidean' distance to assign interpolation points to Voronoi tiles.

## Processing the following EEMS output directories :

## ~/Downloads/eems/rep100

## Plotting effective migration surface (posterior mean of m rates)

## ~/Downloads/eems/rep100

## Plotting effective diversity surface (posterior mean of q rates)

## ~/Downloads/eems/rep100

## Plotting posterior probability trace

## ~/Downloads/eems/rep100

## Plotting average dissimilarities within and between demes

## ~/Downloads/eems/rep100

#The function eems.plots generates several figures automatically (to encourage looking at all the figures). There are examples in EEMS-doc.pdf, with captions that explain each figure. eems.plots also saves several objects to an RData file, which can be read back from the file with load.
load("/Users/devder/Downloads/eems/rep100/plots-rdist.RData")

## Reproduce plotpath-rdist01.png,
## which plots observed vs fitted dissimilarities between demes
ggplot(B.component %>% filter(size > 1),
       aes(fitted, obsrvd)) +
  geom_point()

#The data frame B.component contains information about the dissimilarities between observed pairs of demes (alpha, beta). "Observed" means that each deme has at least one sampled individual assigned to it. Furthermore, each deme has two coordinates (x and y, longitude and latitude) and these are labeled alpha.x, alpha.y for deme alpha and beta.x, beta.y for deme beta.
## Reproduce plotpath-rdist02.png,
## which plots observed vs fitted dissimilarities within demes
ggplot(W.component %>% filter(size > 1),
       aes(fitted, obsrvd)) +
  geom_point()

## Reproduce plotpath-rdist03.png,
## which plots observed dissimilarities against great circle distances between demes
ggplot(W.component %>% filter(size > 1),
       aes(fitted, obsrvd)) +
  geom_point()

##from https://github.com/OKersten/PuffPopGen/blob/master/Nuclear/11.EEMS.md
#eems_results <- grep("Puffin_EEMS_output_chain", list.dirs(path = ".", full.names = TRUE, recursive = TRUE), value = TRUE)
#name_figures <- file.path("Puffin_EEMS_plot_results/Combined")

#plot eems custom
#eems.plots(mcmcpath, 
#           plotpath, 
#           longlat = T, 
#           out.png = F,
#           plot.height = 5, plot.width = 5,
#           add.outline = FALSE,  
#           projection.in = projection_none,
#           projection.out = projection_mercator,
#           add.map = TRUE, col.map = "black", lwd.map = 1,
#           add.title = FALSE,
#           add.demes=TRUE, max.cex.demes = 3, min.cex.demes = 1.5,
#           add.abline = TRUE, add.r.squared = TRUE)

Vis replicate with 200 demes

mcmcpath = "~/Downloads/eems/rep200"
plotpath = "~/Downloads/eems/rep200/plots"

eemsResults <- rEEMSplots::eems.plots(mcmcpath, plotpath, longlat = T,out.png = FALSE, projection.in = projection_none, projection.out = projection_mercator)

## Input projection: +proj=longlat +datum=WGS84
## Output projection: +proj=merc +datum=WGS84

## Loading rgdal (required by projection.in)

## Using 'euclidean' distance to assign interpolation points to Voronoi tiles.

## Processing the following EEMS output directories :

## ~/Downloads/eems/rep200

## Plotting effective migration surface (posterior mean of m rates)

## ~/Downloads/eems/rep200

## Plotting effective diversity surface (posterior mean of q rates)

## ~/Downloads/eems/rep200

## Plotting posterior probability trace

## ~/Downloads/eems/rep200

## Plotting average dissimilarities within and between demes

## ~/Downloads/eems/rep200

##from https://github.com/OKersten/PuffPopGen/blob/master/Nuclear/11.EEMS.md

##plot eems custom
#eems.plots(mcmcpath, 
#           plotpath, 
#           longlat = T, 
#           out.png = F,
#           plot.height = 7, plot.width = 7,
#           add.outline = FALSE,  
#           projection.in = projection_none,
#           projection.out = projection_mercator,
#           add.map = TRUE, col.map = "black", lwd.map = 1,
#           add.title = FALSE,
#           add.demes=TRUE, max.cex.demes = 3, min.cex.demes = 1.5,
#           add.abline = TRUE, add.r.squared = TRUE)

vis replicate with 400 demes

mcmcpath = "~/Downloads/eems/rep400"
plotpath = "~/Downloads/eems/rep400/plots"

eemsResults <- rEEMSplots::eems.plots(mcmcpath, plotpath, longlat = T,out.png = FALSE, projection.in = projection_none, projection.out = projection_mercator)

## Input projection: +proj=longlat +datum=WGS84
## Output projection: +proj=merc +datum=WGS84

## Loading rgdal (required by projection.in)

## Using 'euclidean' distance to assign interpolation points to Voronoi tiles.

## Processing the following EEMS output directories :

## ~/Downloads/eems/rep400

## Plotting effective migration surface (posterior mean of m rates)

## ~/Downloads/eems/rep400

## Plotting effective diversity surface (posterior mean of q rates)

## ~/Downloads/eems/rep400

## Plotting posterior probability trace

## ~/Downloads/eems/rep400

## Plotting average dissimilarities within and between demes

## ~/Downloads/eems/rep400

##from https://github.com/OKersten/PuffPopGen/blob/master/Nuclear/11.EEMS.md

#plot eems custom
#eems.plots(mcmcpath, 
#           plotpath, 
#           longlat = T, 
#           out.png = F,
#           plot.height = 5, plot.width = 5,
#           add.outline = FALSE,  
#           projection.in = projection_none,
#           projection.out = projection_mercator,
#           add.map = TRUE, col.map = "black", lwd.map = 1,
#           add.title = FALSE,
#           add.demes=TRUE, max.cex.demes = 3, min.cex.demes = 1.5,
#           add.abline = TRUE, add.r.squared = TRUE)

run.eems

Devon DeRaad

1/21/2022

use this code to run three eems iterations on the KU HPCC

example ‘params.ndemes100.txt’ file below

vis eems results run on cluster using R functions

Vis replicate with 200 demes

vis replicate with 400 demes