library(delimitR)
#force R to find compatible python installation
Sys.setenv(PATH = paste("/panfs/pfs.local/work/bi/bin/conda/bin/", Sys.getenv("PATH"), sep=":"))
#setwd
setwd("/home/d669d153/work/aph.rad/delimitr/ca.wood")
Set priors here:
#location of our observed sfs (cannot be full path, must be in wd, drop .obs file extension)
observedSFS <- 'ca_MSFS'
#location of our traits file (2 column file which maps alleles to species, must be in wd)
traitsfile <- 'ca.wood.traits.file.txt'
#guide tree
observedtree <- '(0,1);'
#migration matrix (must be symmetrical)
migmatrix <- matrix(c(FALSE, TRUE,
TRUE, FALSE),
nrow = 2, ncol = 2, byrow = TRUE)
#test divergence with gene flow?
divwgeneflow <- TRUE
#test secondary contact?
seccontact <- TRUE
#what is the maximum number of migration events to consider on your guide tree?
maxedges <- 1
#how many species are in your guide tree?
obsspecies<- 2
#the number of "alleles" retained after downsampling SFS
obssamplesize <- c(52,28)
#The user must specify the number of linkage blocks to simulate
#For unlinked SNPs, this is equal to the number of SNPs used to build your observed SFS
obssnps <- 1663
#The user must also provide a prefix
#This will be used to name the fsc2 input files, as well as other output files
#This should be unique for all guide tree + prior combinations in a folder, or files will be overwritten
obsprefix <- 'ca.wood.guidetree'
#The user must specify priors on population sizes
#The first vector is for population 0, the second for population 1, and the third for population 2
#Note that these are in terms of the number of haploid individuals (as specified in the fsc2 documentation)
obspopsizeprior <- list(c(100000,1000000),c(100000,1000000))
#priors for divergence times given in terms of the number of generations and must be supplied as a list of vectors
#Divergence time priors should be provided in order of coalescent interval
obsdivtimeprior <- list(c(100000,1000000))
#prior on migration rates, program only allows one prior for all migration rates in the default model sets
obsmigrateprior <- list(c(0.000005,0.0005))
Set up fastsimcoal models and sim 10K reps of each model:
#set up your prior models for fastsimcoal2
setup_fsc2(tree=observedtree,
nspec=obsspecies,
samplesizes=obssamplesize,
nsnps=obssnps,
prefix=obsprefix,
migmatrix=migmatrix,
popsizeprior=obspopsizeprior,
divtimeprior=obsdivtimeprior,
migrateprior=obsmigrateprior,
secondarycontact= seccontact,
divwgeneflow= divwgeneflow,
maxmigrations = maxedges)
# fastsimcoalsims() requires the prefix used to generate the model files, the path to fastsimcoal2, and # of reps
#Generally, a minimum of 10,000 replicates under each model should be simulated.
fastsimcoalsims(prefix=obsprefix,
pathtofsc='/panfs/pfs.local/work/bi/bin/fsc26_linux64/fsc26',
nreps=10000)
## [1] "/panfs/pfs.local/work/bi/bin/fsc26_linux64/fsc26 -t ca.wood.guidetree_1.tpl -e ca.wood.guidetree_1.est -n 1 --msfs -q --multiSFS -x -E10000"
## [1] "/panfs/pfs.local/work/bi/bin/fsc26_linux64/fsc26 -t ca.wood.guidetree_2.tpl -e ca.wood.guidetree_2.est -n 1 --msfs -q --multiSFS -x -E10000"
## [1] "/panfs/pfs.local/work/bi/bin/fsc26_linux64/fsc26 -t ca.wood.guidetree_3.tpl -e ca.wood.guidetree_3.est -n 1 --msfs -q --multiSFS -x -E10000"
## [1] "/panfs/pfs.local/work/bi/bin/fsc26_linux64/fsc26 -t ca.wood.guidetree_4.tpl -e ca.wood.guidetree_4.est -n 1 --msfs -q --multiSFS -x -E10000"
Build prior and reduced prior:
#need to turn our mSFS into a binned (bSFS) specify bin number here
#number should not be greater than the sample size of the population with the fewest samples, as this results in sparse sampling of the SFS
#Large values lead to a more complete summary of the data, but also lead to a more sparsely sampled SFS and increased computation times
nclasses <- 12
#to make prior, provide:
#prefix used to name the model files, the number of species, the number of classes to be included in the SFS,
#a path to the working directory, the name of the traits file, the threshold,
#the name of the folder to store the prior in, and the number of cores to use.
FullPrior <- makeprior(prefix=obsprefix,
nspec=obsspecies,
nclasses=nclasses,
getwd(),
traitsfile = traitsfile,
threshold=100,
thefolder = 'Prior',
ncores = 4)
#remove extraneous files
#clean_working(prefix=obsprefix)
#remove invariant data
ReducedPrior <- Prior_reduced(FullPrior)
build random forest classifier:
#build random forest
myRF <- RF_build_abcrf(ReducedPrior,FullPrior,5000)
## Warning in lda.default(x, grouping, ...): variables are collinear
## Growing trees.. Progress: 5%. Estimated remaining time: 9 minutes, 27 seconds.
## Growing trees.. Progress: 11%. Estimated remaining time: 8 minutes, 37 seconds.
## Growing trees.. Progress: 16%. Estimated remaining time: 7 minutes, 59 seconds.
## Growing trees.. Progress: 22%. Estimated remaining time: 7 minutes, 24 seconds.
## Growing trees.. Progress: 27%. Estimated remaining time: 6 minutes, 55 seconds.
## Growing trees.. Progress: 33%. Estimated remaining time: 6 minutes, 20 seconds.
## Growing trees.. Progress: 38%. Estimated remaining time: 5 minutes, 50 seconds.
## Growing trees.. Progress: 44%. Estimated remaining time: 5 minutes, 16 seconds.
## Growing trees.. Progress: 49%. Estimated remaining time: 4 minutes, 46 seconds.
## Growing trees.. Progress: 55%. Estimated remaining time: 4 minutes, 15 seconds.
## Growing trees.. Progress: 61%. Estimated remaining time: 3 minutes, 42 seconds.
## Growing trees.. Progress: 66%. Estimated remaining time: 3 minutes, 11 seconds.
## Growing trees.. Progress: 72%. Estimated remaining time: 2 minutes, 40 seconds.
## Growing trees.. Progress: 77%. Estimated remaining time: 2 minutes, 9 seconds.
## Growing trees.. Progress: 83%. Estimated remaining time: 1 minute, 38 seconds.
## Growing trees.. Progress: 88%. Estimated remaining time: 1 minute, 7 seconds.
## Growing trees.. Progress: 94%. Estimated remaining time: 35 seconds.
## Growing trees.. Progress: 99%. Estimated remaining time: 3 seconds.
#view RF
myRF
##
## Call:
## abcrf(formula = Models ~ ., data = Trainingdata, ntree = ntrees, paral = TRUE)
## includes the axes of a preliminary LDA
##
## Number of simulations: 40000
## Out-of-bag prior error rate: 4.7075%
##
## Confusion matrix:
## ca.wood.guidetree_1_MSFS ca.wood.guidetree_2_MSFS
## ca.wood.guidetree_1_MSFS 9968 0
## ca.wood.guidetree_2_MSFS 0 8966
## ca.wood.guidetree_3_MSFS 304 0
## ca.wood.guidetree_4_MSFS 0 485
## ca.wood.guidetree_3_MSFS ca.wood.guidetree_4_MSFS
## ca.wood.guidetree_1_MSFS 32 0
## ca.wood.guidetree_2_MSFS 0 1034
## ca.wood.guidetree_3_MSFS 9674 22
## ca.wood.guidetree_4_MSFS 6 9509
## class.error
## ca.wood.guidetree_1_MSFS 0.0032
## ca.wood.guidetree_2_MSFS 0.1034
## ca.wood.guidetree_3_MSFS 0.0326
## ca.wood.guidetree_4_MSFS 0.0491
#view misclassification rate for simulated data under our four models
classification.error<-myRF[["model.rf"]]$confusion.matrix
classification.error
## ca.wood.guidetree_1_MSFS ca.wood.guidetree_2_MSFS
## ca.wood.guidetree_1_MSFS 9968 0
## ca.wood.guidetree_2_MSFS 0 8966
## ca.wood.guidetree_3_MSFS 304 0
## ca.wood.guidetree_4_MSFS 0 485
## ca.wood.guidetree_3_MSFS ca.wood.guidetree_4_MSFS
## ca.wood.guidetree_1_MSFS 32 0
## ca.wood.guidetree_2_MSFS 0 1034
## ca.wood.guidetree_3_MSFS 9674 22
## ca.wood.guidetree_4_MSFS 6 9509
## class.error
## ca.wood.guidetree_1_MSFS 0.0032
## ca.wood.guidetree_2_MSFS 0.1034
## ca.wood.guidetree_3_MSFS 0.0326
## ca.wood.guidetree_4_MSFS 0.0491
#write error rates to file
write.csv(classification.error, file = "classification.error.csv")
plot(myRF, training = ReducedPrior)
## Press <ENTER> to Continue
#prep data
myobserved <- prepobserved(
observedSFS,
FullPrior,
ReducedPrior,
nclasses,
obsspecies,
traitsfile=traitsfile,
threshold = 100)
apply RF classifier to our observed data
#Now, we're ready to apply the RF classifier to the observed data.
#we use the function RF_predict_abcrf(), which requires the RF object, the observed dataset,
#the Reduced Prior, the Full Prior, and the number of trees, which should match that used to construct the classifier
prediction <- RF_predict_abcrf(myRF, myobserved, ReducedPrior, FullPrior, 5000)
## Growing trees.. Progress: 6%. Estimated remaining time: 7 minutes, 58 seconds.
## Growing trees.. Progress: 13%. Estimated remaining time: 7 minutes, 8 seconds.
## Growing trees.. Progress: 19%. Estimated remaining time: 6 minutes, 30 seconds.
## Growing trees.. Progress: 26%. Estimated remaining time: 5 minutes, 58 seconds.
## Growing trees.. Progress: 32%. Estimated remaining time: 5 minutes, 28 seconds.
## Growing trees.. Progress: 39%. Estimated remaining time: 4 minutes, 55 seconds.
## Growing trees.. Progress: 45%. Estimated remaining time: 4 minutes, 22 seconds.
## Growing trees.. Progress: 52%. Estimated remaining time: 3 minutes, 51 seconds.
## Growing trees.. Progress: 58%. Estimated remaining time: 3 minutes, 21 seconds.
## Growing trees.. Progress: 65%. Estimated remaining time: 2 minutes, 49 seconds.
## Growing trees.. Progress: 71%. Estimated remaining time: 2 minutes, 18 seconds.
## Growing trees.. Progress: 78%. Estimated remaining time: 1 minute, 46 seconds.
## Growing trees.. Progress: 84%. Estimated remaining time: 1 minute, 15 seconds.
## Growing trees.. Progress: 91%. Estimated remaining time: 44 seconds.
## Growing trees.. Progress: 97%. Estimated remaining time: 12 seconds.
#show results
prediction
## selected model votes model1 votes model2 votes model3 votes model4
## 1 ca.wood.guidetree_3_MSFS 327 1448 1948 1277
## 2 ca.wood.guidetree_3_MSFS 327 1448 1948 1277
## 3 ca.wood.guidetree_3_MSFS 327 1448 1948 1277
## 4 ca.wood.guidetree_3_MSFS 327 1448 1948 1277
## post.proba
## 1 0.8448502
## 2 0.8448502
## 3 0.8448502
## 4 0.8448502
#write results to file
write.csv(as.matrix(prediction), file="prediction.csv")
#write out model info to file
cat("model 1",
readLines(paste0(obsprefix,"_1.tpl")),"",
"model 2",
readLines(paste0(obsprefix,"_2.tpl"))[15:16],"",
"model 3",
readLines(paste0(obsprefix,"_3.tpl"))[21:24],"",
"model 4",
readLines(paste0(obsprefix,"_4.tpl"))[21:24],
sep="\n", file ="models.txt")
#write model info to screen
cat("model 1",
readLines(paste0(obsprefix,"_1.tpl")),"",
"model 2",
readLines(paste0(obsprefix,"_2.tpl")),"",
"model 3",
readLines(paste0(obsprefix,"_3.tpl")),"",
"model 4",
readLines(paste0(obsprefix,"_4.tpl")),
sep="\n")
## model 1
## //Number of population samples (demes)
## 2
## //Population effective sizes (number of genes)
## N0$
## N1$
## //Sample sizes
## 52
## 28
## //Growth rates: negative growth implies population expansion
## 0
## 0
## //Number of migration matrices : 0 implies no migration between demes
## 0
## //historical event: time, source, sink, migrants, new size, new growth rate, migr. matrix
## 1 historical event
## 0 0 1 1 1 0 keep
## //Number of independent loci [chromosome]
## 1663
## //Per chromosome: Number of linkage blocks
## 1
## //per Block: data type, num loc. rec. rate and mut rate + optional paramters
## SNP 1 0 0 0.01
##
## model 2
## //Number of population samples (demes)
## 2
## //Population effective sizes (number of genes)
## N0$
## N1$
## //Sample sizes
## 52
## 28
## //Growth rates: negative growth implies population expansion
## 0
## 0
## //Number of migration matrices : 0 implies no migration between demes
## 0
## //historical event: time, source, sink, migrants, new size, new growth rate, migr. matrix
## 1 historical event
## Tdiv1$ 0 1 1 1 0 keep
## //Number of independent loci [chromosome]
## 1663
## //Per chromosome: Number of linkage blocks
## 1
## //per Block: data type, num loc. rec. rate and mut rate + optional paramters
## SNP 1 0 0 0.01
##
## model 3
## //Number of population samples (demes)
## 2
## //Population effective sizes (number of genes)
## N0$
## N1$
## //Sample sizes
## 52
## 28
## //Growth rates: negative growth implies population expansion
## 0
## 0
## //Number of migration matrices : 0 implies no migration between demes
## 2
## //migrationmatrix
## 0 0
## 0 0
## //migrationmatrix
## 0 MIG1$
## MIG1$ 0
## //historical event: time, source, sink, migrants, new size, new growth rate, migr. matrix
## 3 historical event
## Tdiv1$ 0 1 1 1 0 keep
## 0 0 0 0 1 0 1
## Tmig1end$ 0 0 0 1 0 0
## //Number of independent loci [chromosome]
## 1663
## //Per chromosome: Number of linkage blocks
## 1
## //per Block: data type, num loc. rec. rate and mut rate + optional paramters
## SNP 1 0 0 0.01
##
## model 4
## //Number of population samples (demes)
## 2
## //Population effective sizes (number of genes)
## N0$
## N1$
## //Sample sizes
## 52
## 28
## //Growth rates: negative growth implies population expansion
## 0
## 0
## //Number of migration matrices : 0 implies no migration between demes
## 2
## //migrationmatrix
## 0 0
## 0 0
## //migrationmatrix
## 0 MIG1$
## MIG1$ 0
## //historical event: time, source, sink, migrants, new size, new growth rate, migr. matrix
## 3 historical event
## Tdiv1$ 0 1 1 1 0 keep
## Tmig1init$ 0 0 0 1 0 1
## Tmig1end$ 0 0 0 1 0 0
## //Number of independent loci [chromosome]
## 1663
## //Per chromosome: Number of linkage blocks
## 1
## //per Block: data type, num loc. rec. rate and mut rate + optional paramters
## SNP 1 0 0 0.01