library(delimitR)

#force R to find compatible python installation
Sys.setenv(PATH = paste("/panfs/pfs.local/work/bi/bin/conda/bin/", Sys.getenv("PATH"), sep=":"))

#setwd
setwd("/home/d669d153/work/aph.rad/delimitr/ca.wood")

Set priors here:

#location of our observed sfs (cannot be full path, must be in wd, drop .obs file extension)
observedSFS <- 'ca_MSFS'

#location of our traits file (2 column file which maps alleles to species, must be in wd)
traitsfile <- 'ca.wood.traits.file.txt'

#guide tree
observedtree <- '(0,1);'

#migration matrix (must be symmetrical)
migmatrix <- matrix(c(FALSE, TRUE,
                      TRUE, FALSE),
                    nrow = 2, ncol = 2, byrow = TRUE)

#test divergence with gene flow?
divwgeneflow <- TRUE

#test secondary contact?
seccontact <- TRUE

#what is the maximum number of migration events to consider on your guide tree?
maxedges <- 1

#how many species are in your guide tree?
obsspecies<- 2

#the number of "alleles" retained after downsampling SFS
obssamplesize <- c(52,28)

#The user must specify the number of linkage blocks to simulate
#For unlinked SNPs, this is equal to the number of SNPs used to build your observed SFS
obssnps <- 1663

#The user must also provide a prefix
#This will be used to name the fsc2 input files, as well as other output files
#This should be unique for all guide tree + prior combinations in a folder, or files will be overwritten
obsprefix <- 'ca.wood.guidetree'

#The user must specify priors on population sizes
#The first vector is for population 0, the second for population 1, and the third for population 2
#Note that these are in terms of the number of haploid individuals (as specified in the fsc2 documentation)
obspopsizeprior <- list(c(100000,1000000),c(100000,1000000))

#priors for divergence times given in terms of the number of generations and must be supplied as a list of vectors
#Divergence time priors should be provided in order of coalescent interval
obsdivtimeprior <- list(c(100000,1000000))

#prior on migration rates, program only allows one prior for all migration rates in the default model sets
obsmigrateprior <- list(c(0.000005,0.0005))

Set up fastsimcoal models and sim 10K reps of each model:

#set up your prior models for fastsimcoal2
setup_fsc2(tree=observedtree,
           nspec=obsspecies,
           samplesizes=obssamplesize,
           nsnps=obssnps,
           prefix=obsprefix,
           migmatrix=migmatrix,
           popsizeprior=obspopsizeprior,
           divtimeprior=obsdivtimeprior,
           migrateprior=obsmigrateprior,
           secondarycontact= seccontact,
           divwgeneflow= divwgeneflow,
           maxmigrations = maxedges)

# fastsimcoalsims() requires the prefix used to generate the model files, the path to fastsimcoal2, and # of reps 
#Generally, a minimum of 10,000 replicates under each model should be simulated.
fastsimcoalsims(prefix=obsprefix,
                pathtofsc='/panfs/pfs.local/work/bi/bin/fsc26_linux64/fsc26',
                nreps=10000)
## [1] "/panfs/pfs.local/work/bi/bin/fsc26_linux64/fsc26 -t ca.wood.guidetree_1.tpl -e ca.wood.guidetree_1.est -n 1 --msfs -q --multiSFS -x -E10000"
## [1] "/panfs/pfs.local/work/bi/bin/fsc26_linux64/fsc26 -t ca.wood.guidetree_2.tpl -e ca.wood.guidetree_2.est -n 1 --msfs -q --multiSFS -x -E10000"
## [1] "/panfs/pfs.local/work/bi/bin/fsc26_linux64/fsc26 -t ca.wood.guidetree_3.tpl -e ca.wood.guidetree_3.est -n 1 --msfs -q --multiSFS -x -E10000"
## [1] "/panfs/pfs.local/work/bi/bin/fsc26_linux64/fsc26 -t ca.wood.guidetree_4.tpl -e ca.wood.guidetree_4.est -n 1 --msfs -q --multiSFS -x -E10000"

Build prior and reduced prior:

#need to turn our mSFS into a binned (bSFS) specify bin number here
#number should not be greater than the sample size of the population with the fewest samples, as this results in sparse sampling of the SFS
#Large values lead to a more complete summary of the data, but also lead to a more sparsely sampled SFS and increased computation times
nclasses <- 12

#to make prior, provide:
#prefix used to name the model files, the number of species, the number of classes to be included in the SFS,
#a path to the working directory, the name of the traits file, the threshold,
#the name of the folder to store the prior in, and the number of cores to use.

FullPrior <- makeprior(prefix=obsprefix,
                       nspec=obsspecies,
                       nclasses=nclasses,
                       getwd(),
                       traitsfile = traitsfile,
                       threshold=100, 
                       thefolder = 'Prior',
                       ncores = 4)

#remove extraneous files
#clean_working(prefix=obsprefix)

#remove invariant data
ReducedPrior <- Prior_reduced(FullPrior)

build random forest classifier:

#build random forest
myRF <- RF_build_abcrf(ReducedPrior,FullPrior,5000)
## Warning in lda.default(x, grouping, ...): variables are collinear
## Growing trees.. Progress: 5%. Estimated remaining time: 9 minutes, 27 seconds.
## Growing trees.. Progress: 11%. Estimated remaining time: 8 minutes, 37 seconds.
## Growing trees.. Progress: 16%. Estimated remaining time: 7 minutes, 59 seconds.
## Growing trees.. Progress: 22%. Estimated remaining time: 7 minutes, 24 seconds.
## Growing trees.. Progress: 27%. Estimated remaining time: 6 minutes, 55 seconds.
## Growing trees.. Progress: 33%. Estimated remaining time: 6 minutes, 20 seconds.
## Growing trees.. Progress: 38%. Estimated remaining time: 5 minutes, 50 seconds.
## Growing trees.. Progress: 44%. Estimated remaining time: 5 minutes, 16 seconds.
## Growing trees.. Progress: 49%. Estimated remaining time: 4 minutes, 46 seconds.
## Growing trees.. Progress: 55%. Estimated remaining time: 4 minutes, 15 seconds.
## Growing trees.. Progress: 61%. Estimated remaining time: 3 minutes, 42 seconds.
## Growing trees.. Progress: 66%. Estimated remaining time: 3 minutes, 11 seconds.
## Growing trees.. Progress: 72%. Estimated remaining time: 2 minutes, 40 seconds.
## Growing trees.. Progress: 77%. Estimated remaining time: 2 minutes, 9 seconds.
## Growing trees.. Progress: 83%. Estimated remaining time: 1 minute, 38 seconds.
## Growing trees.. Progress: 88%. Estimated remaining time: 1 minute, 7 seconds.
## Growing trees.. Progress: 94%. Estimated remaining time: 35 seconds.
## Growing trees.. Progress: 99%. Estimated remaining time: 3 seconds.
#view RF
myRF
## 
## Call:
##  abcrf(formula = Models ~ ., data = Trainingdata, ntree = ntrees, paral = TRUE) 
## includes the axes of a preliminary LDA
## 
## Number of simulations: 40000
## Out-of-bag prior error rate: 4.7075%
## 
## Confusion matrix:
##                          ca.wood.guidetree_1_MSFS ca.wood.guidetree_2_MSFS
## ca.wood.guidetree_1_MSFS                     9968                        0
## ca.wood.guidetree_2_MSFS                        0                     8966
## ca.wood.guidetree_3_MSFS                      304                        0
## ca.wood.guidetree_4_MSFS                        0                      485
##                          ca.wood.guidetree_3_MSFS ca.wood.guidetree_4_MSFS
## ca.wood.guidetree_1_MSFS                       32                        0
## ca.wood.guidetree_2_MSFS                        0                     1034
## ca.wood.guidetree_3_MSFS                     9674                       22
## ca.wood.guidetree_4_MSFS                        6                     9509
##                          class.error
## ca.wood.guidetree_1_MSFS      0.0032
## ca.wood.guidetree_2_MSFS      0.1034
## ca.wood.guidetree_3_MSFS      0.0326
## ca.wood.guidetree_4_MSFS      0.0491
#view misclassification rate for simulated data under our four models
classification.error<-myRF[["model.rf"]]$confusion.matrix
classification.error
##                          ca.wood.guidetree_1_MSFS ca.wood.guidetree_2_MSFS
## ca.wood.guidetree_1_MSFS                     9968                        0
## ca.wood.guidetree_2_MSFS                        0                     8966
## ca.wood.guidetree_3_MSFS                      304                        0
## ca.wood.guidetree_4_MSFS                        0                      485
##                          ca.wood.guidetree_3_MSFS ca.wood.guidetree_4_MSFS
## ca.wood.guidetree_1_MSFS                       32                        0
## ca.wood.guidetree_2_MSFS                        0                     1034
## ca.wood.guidetree_3_MSFS                     9674                       22
## ca.wood.guidetree_4_MSFS                        6                     9509
##                          class.error
## ca.wood.guidetree_1_MSFS      0.0032
## ca.wood.guidetree_2_MSFS      0.1034
## ca.wood.guidetree_3_MSFS      0.0326
## ca.wood.guidetree_4_MSFS      0.0491
#write error rates to file
write.csv(classification.error, file = "classification.error.csv")
plot(myRF, training = ReducedPrior)

## Press <ENTER> to Continue

#prep data
myobserved <- prepobserved(
  observedSFS,
  FullPrior,
  ReducedPrior,
  nclasses,
  obsspecies,
  traitsfile=traitsfile,
  threshold = 100)

apply RF classifier to our observed data

#Now, we're ready to apply the RF classifier to the observed data.
#we use the function RF_predict_abcrf(), which requires the RF object, the observed dataset,
#the Reduced Prior, the Full Prior, and the number of trees, which should match that used to construct the classifier
prediction <- RF_predict_abcrf(myRF, myobserved, ReducedPrior, FullPrior, 5000)
## Growing trees.. Progress: 6%. Estimated remaining time: 7 minutes, 58 seconds.
## Growing trees.. Progress: 13%. Estimated remaining time: 7 minutes, 8 seconds.
## Growing trees.. Progress: 19%. Estimated remaining time: 6 minutes, 30 seconds.
## Growing trees.. Progress: 26%. Estimated remaining time: 5 minutes, 58 seconds.
## Growing trees.. Progress: 32%. Estimated remaining time: 5 minutes, 28 seconds.
## Growing trees.. Progress: 39%. Estimated remaining time: 4 minutes, 55 seconds.
## Growing trees.. Progress: 45%. Estimated remaining time: 4 minutes, 22 seconds.
## Growing trees.. Progress: 52%. Estimated remaining time: 3 minutes, 51 seconds.
## Growing trees.. Progress: 58%. Estimated remaining time: 3 minutes, 21 seconds.
## Growing trees.. Progress: 65%. Estimated remaining time: 2 minutes, 49 seconds.
## Growing trees.. Progress: 71%. Estimated remaining time: 2 minutes, 18 seconds.
## Growing trees.. Progress: 78%. Estimated remaining time: 1 minute, 46 seconds.
## Growing trees.. Progress: 84%. Estimated remaining time: 1 minute, 15 seconds.
## Growing trees.. Progress: 91%. Estimated remaining time: 44 seconds.
## Growing trees.. Progress: 97%. Estimated remaining time: 12 seconds.
#show results
prediction
##             selected model votes model1 votes model2 votes model3 votes model4
## 1 ca.wood.guidetree_3_MSFS          327         1448         1948         1277
## 2 ca.wood.guidetree_3_MSFS          327         1448         1948         1277
## 3 ca.wood.guidetree_3_MSFS          327         1448         1948         1277
## 4 ca.wood.guidetree_3_MSFS          327         1448         1948         1277
##   post.proba
## 1  0.8448502
## 2  0.8448502
## 3  0.8448502
## 4  0.8448502
#write results to file
write.csv(as.matrix(prediction), file="prediction.csv")

#write out model info to file
cat("model 1",
    readLines(paste0(obsprefix,"_1.tpl")),"",
    "model 2",
    readLines(paste0(obsprefix,"_2.tpl"))[15:16],"",
    "model 3",
    readLines(paste0(obsprefix,"_3.tpl"))[21:24],"",
    "model 4",
    readLines(paste0(obsprefix,"_4.tpl"))[21:24],
    sep="\n", file ="models.txt")

#write model info to screen
cat("model 1",
    readLines(paste0(obsprefix,"_1.tpl")),"",
    "model 2",
    readLines(paste0(obsprefix,"_2.tpl")),"",
    "model 3",
    readLines(paste0(obsprefix,"_3.tpl")),"",
    "model 4",
    readLines(paste0(obsprefix,"_4.tpl")),
    sep="\n")
## model 1
## //Number of population samples (demes)
## 2
## //Population effective sizes (number of genes)
## N0$
## N1$
## //Sample sizes
## 52
## 28
## //Growth rates: negative growth implies population expansion
## 0
## 0
## //Number of migration matrices : 0 implies no migration between demes
## 0
## //historical event: time, source, sink, migrants, new size, new growth rate, migr. matrix
## 1 historical event
## 0 0 1 1 1 0 keep
## //Number of independent loci [chromosome]
## 1663
## //Per chromosome: Number of linkage blocks
## 1
## //per Block: data type, num loc. rec. rate and mut rate + optional paramters
## SNP 1 0 0 0.01
## 
## model 2
## //Number of population samples (demes)
## 2
## //Population effective sizes (number of genes)
## N0$
## N1$
## //Sample sizes
## 52
## 28
## //Growth rates: negative growth implies population expansion
## 0
## 0
## //Number of migration matrices : 0 implies no migration between demes
## 0
## //historical event: time, source, sink, migrants, new size, new growth rate, migr. matrix
## 1 historical event
## Tdiv1$ 0 1 1 1 0 keep
## //Number of independent loci [chromosome]
## 1663
## //Per chromosome: Number of linkage blocks
## 1
## //per Block: data type, num loc. rec. rate and mut rate + optional paramters
## SNP 1 0 0 0.01
## 
## model 3
## //Number of population samples (demes)
## 2
## //Population effective sizes (number of genes)
## N0$
## N1$
## //Sample sizes
## 52
## 28
## //Growth rates: negative growth implies population expansion
## 0
## 0
## //Number of migration matrices : 0 implies no migration between demes
## 2
## //migrationmatrix
## 0 0
## 0 0
## //migrationmatrix
## 0 MIG1$
## MIG1$ 0
## //historical event: time, source, sink, migrants, new size, new growth rate, migr. matrix
## 3 historical event
## Tdiv1$ 0 1 1 1 0 keep
## 0 0 0 0 1 0 1
## Tmig1end$ 0 0 0 1 0 0
## //Number of independent loci [chromosome]
## 1663
## //Per chromosome: Number of linkage blocks
## 1
## //per Block: data type, num loc. rec. rate and mut rate + optional paramters
## SNP 1 0 0 0.01
## 
## model 4
## //Number of population samples (demes)
## 2
## //Population effective sizes (number of genes)
## N0$
## N1$
## //Sample sizes
## 52
## 28
## //Growth rates: negative growth implies population expansion
## 0
## 0
## //Number of migration matrices : 0 implies no migration between demes
## 2
## //migrationmatrix
## 0 0
## 0 0
## //migrationmatrix
## 0 MIG1$
## MIG1$ 0
## //historical event: time, source, sink, migrants, new size, new growth rate, migr. matrix
## 3 historical event
## Tdiv1$ 0 1 1 1 0 keep
## Tmig1init$ 0 0 0 1 0 1
## Tmig1end$ 0 0 0 1 0 0
## //Number of independent loci [chromosome]
## 1663
## //Per chromosome: Number of linkage blocks
## 1
## //per Block: data type, num loc. rec. rate and mut rate + optional paramters
## SNP 1 0 0 0.01