invisible('

ot2 : 

match sample proportion in outbreak and reservoir 

Constant size reservoir with exponential demes: 
Small migration from reservoir to growing demes.

phydelity & clmp to be done separately
the latter seems to have convergence issues on some sims 
')


library(ape)
library(phydynR)
library( treestructure )
library( fastbaps ) 
library( phyclust) 
library( treedater ) 
library( clmp )
library( ggtree )


NCPU <- 32


demes <- c('reservoir', 'a')
m <- length(demes)
B <- matrix( '0', nrow =m ,ncol=m)
rownames(B) = colnames(B) <- demes 
M <- matrix( '0', nrow =m ,ncol=m)
rownames(M) = colnames(M) <- demes 
D <- setNames( rep('0', m ), demes )

B['reservoir','reservoir'] <- 'breservoir * reservoir' 
B['a', 'a'] <- '((T>otime)?ba:0.) * a'


#~ M['reservoir', 'a'] <- ' ((T<otime)?perLineageMigration*a:0.) '
#~ M[ 'a','reservoir'] <- ' ((T<otime)?perLineageMigration*a:0.) '
M['reservoir', 'a'] <- ' perLineageMigration*a '
M[ 'a','reservoir'] <- ' perLineageMigration*a '

D['reservoir'] <- 'breservoir * reservoir' 
D['a'] <- '((T>otime)?a:0.)'

tfin <- 200
phi = .7
nres = 800
na = 200
n <- nres + na 

theta0 <- c( a0 = 1,  breservoir = 1, reservoir0 = nres / phi, ba =1.5, perLineageMigration =1/20 , otime = 300, phi = phi )

rtheta <- function()
{
	.phi <- runif( 1, .05, .75)
	x = c( phi = .phi 
		 , ba = runif( 1, 1.05, 2)
		 #, perLineageMigration = runif( 1, .01, .2 )
		 , reservoir0 = nres / .phi 
		)
	theta <- theta0
	theta[ names(x) ] <- unname( x  )
	theta
}

if ( !('dm' %in% ls() ) )
	dm = build.demographic.process( B, migrations = M, deaths=D, parameterNames = names(theta0), rcpp = T)


#' simulate a tree from coalescent and apply partitioning/clusterign algorithms
sim.replicate <- function(seed)
{
	set.seed(seed)
	st0 <- Sys.time() 
	
	theta <- rtheta() 
	Na <- na / theta['phi']
	#~ 	exp( (tfin - theta['otime']) * (theta['ba'] - 1)) - Na
	ST = tfin <- theta['otime'] + log(Na) / ( theta['ba']-1 ) 

	sts =  setNames( rep( ST, n ), 1:n )
	# distribute the first few tips through time to facilitate molecular dating 
	sts[ 1:50 ] <- runif( 50, 0, ST )
	
	ssts <- matrix( 0, nrow = n, ncol = m )
	colnames(ssts ) <- demes 
	ssts[1:nres,'reservoir'] <- 1
	ssts[(1+nres):(na+nres),'a'] <- 1

	Y0 = c(reservoir = unname(theta['reservoir0']),  a = 1)
	tr = sim.co.tree(theta, dm, Y0, t0=0, sampleTimes = sts, sampleStates = ssts, res = 5000, integrationMethod = "lsoda")
	class(tr) <- 'phylo'
	
	# sim sequences (phcylust/seq-gen)
	seqgen( opts = '-mHKY -l1000 -s0.001', rooted.tree = tr ) -> seqs
	paste( seqs, collapse = '\n' ) -> seqstr 
	con <- textConnection( seqstr )
	d <- read.dna( con, format= 'interleaved')
	close(con)

	# nj + treedater
	njtr <- nj( dist.dna(d, model = 'F84' ) )
	td <- dater( unroot(njtr), s = 1e3
	  , sts = setNames( node.depth.edgelength( tr ) [1:Ntip(tr)] , tr$tip.label ) 
	  , strict = TRUE 
	  , ncpu = 1
	  , quiet = FALSE 
	)
	
	tr2 <- td
	class( tr2 ) <- 'phylo'
	# treestructure 
	sp2 <- trestruct( tr2, debugLevel = 1 )
	dtdf2 <- as.data.frame( sp2 )
	dtdf2$truth <- sapply( as.character( dtdf2$taxon) , function(x) which( tr$sampleStates[x, ]==1)  )
	dtdf2 <- cbind( taxa = as.character( dtdf2$taxon),  dtdf2 )
	
	# baps
	#~ snps <- fastbaps::import_fasta_sparse_nt(d) 
	tmpfn <- paste0('tmp', seed, '.fasta' )
	write.dna( d, file = tmpfn, format = 'fasta' )
	spd <- import_fasta_sparse_nt(tmpfn )
	spd <- optimise_prior(spd, type = "optimise.symmetric")
	b <- best_baps_partition( spd ,  tr2 ) 
	bdf <- data.frame(id = colnames(spd$snp.matrix), fastbaps = b, stringsAsFactors = FALSE)
	dtdf2$baps = bdf$fastbaps[ match( as.character(bdf$id), dtdf2$taxa ) ]
	file.remove( tmpfn )
	
	# clmp 
#~ 	m <- clmp( tr2 )
#~ 	dtdf2$clmp <- m$clusters[as.character( dtdf2$taxa ) ]
	
	st1 <- Sys.time() 
	
	pltr2 <- ggtree(tr2)
	
	rv = list( theta = theta 
	 , cotree = tr
	 , phylo = tr2
	 , df = dtdf2 
	 , debugdf = sp2$debugdf # contains z score for each node 
	 , seqs = d
	 , systime = st1 - st0 
	 , pl.ts = plot( sp2 ) 
	 , pl.truth = ggtree( tr2  ) %<+% dtdf2 + geom_tippoint( aes(color = truth ))
#~ 	 , pl.clmp = ggtree( tr2  ) %<+% dtdf2 + geom_tippoint( aes(color = clmp ))
	 , pl.baps = facet_plot(pltr2, panel = "fastbaps", data = dtdf2, geom = geom_tile, aes(x = baps), color = "blue")
	)
	suppressWarnings( dir.create( 'ot2.1' ) )
	write.tree( tr2, file = paste0( 'ot2.1/', seed, '.nwk' ) )
	saveRDS( rv, file = paste0( 'ot2.1/', seed, '.rds' ) )
	
	rv
}

library( parallel )
o = mclapply( 1:200, function(i) {
	tryCatch( sim.replicate( i  + 1111)
	 , error = function(e) NULL )
}, mc.cores = NCPU )

#~ with (dtdf2,  cocluster_accuracy( truth , partition ))
#~ with (bdf,  cocluster_accuracy( truth , fastbaps))



# inspect outputs 
if (FALSE){
	library( treestructure)
#~ 	o = sapply( list.files( path = 'ot2.1', pattern = 'rds', full.names=TRUE), readRDS)
	o = sapply( list.files( path = 'ot2.2', pattern = 'rds', full.names=TRUE), readRDS)
	nms <- c( 'ba', 'phi')#, 'perLineageMigration' )
	#print( (theta <- sapply( o, '[[', 'theta' )[nms, ]) )
	theta <- sapply( o, '[[', 'theta' )[nms, ]
	cca = sapply( o, function ( oo ) {
		with (oo$df,  c(
		 ts = cocluster_accuracy( truth , partition )
		 , baps = cocluster_accuracy( truth , baps)
		 )
		)
	})
	X <- as.data.frame( t( rbind( theta, cca ) ))
	print( 
	 summary( lm(ts~phi + ba, data = X ) )
	)
	print( 
	 summary( lm(baps~phi + ba, data = X ) )
	)
	
	library( aricode )
	nmi.ts <- sapply( o, function(oo) with( oo$df, NMI( truth, partition)))
	nmi.baps <- sapply( o, function(oo) with( oo$df, NMI( truth, baps)))
	nmi.tscl <- sapply( o, function(oo) with( oo$df, NMI( truth, cluster)))
	
	summary( nmi.ts <- sapply( o, function(oo) with( oo$df, NMI( truth, partition, 'sum'))) )
	summary( nmi.baps <- sapply( o, function(oo) with( oo$df, NMI( truth, baps, 'sum'))) )
	summary( nmi.tscl <- sapply( o, function(oo) with( oo$df, NMI( truth, cluster, 'sum'))) )
	
	cc.ts <- sapply( o, function(oo) with( oo$df, clustComp( truth, partition)))
	cc.baps <- sapply( o, function(oo) with( oo$df, clustComp( truth, baps)))
	cc.baps2 <- sapply( o, function(oo) with( oo$df, clustComp( truth, baps2)))
	cc.tscl <- sapply( o, function(oo) with( oo$df, clustComp( truth, cluster)))

	for (nm in rownames( cc.ts )){
		cat( nm )
		cat ( '\n' )
		print(
		c( mean( unlist( cc.ts[nm,]))
		 ,  mean( unlist( cc.baps[nm,]))
		 ,  mean( unlist( cc.baps2[nm,]))
		 )
		)
	}
}

invisible('
> print( 
  summary( lm(ts~phi + ba, data = X ) )
)

Call:
lm(formula = ts ~ phi + ba, data = X)

Residuals:
      Min        1Q    Median        3Q       Max 
-0.212349 -0.073139 -0.003443  0.069077  0.262654 

Coefficients:
            Estimate Std. Error t value Pr(>|t|)    
(Intercept)  0.58636    0.05716  10.258  < 2e-16 ***
phi         -0.22025    0.04349  -5.065 1.59e-06 ***
ba           0.13861    0.03452   4.015 0.000107 ***
---
Signif. codes:  0 ‘***’ 0.001 ‘**’ 0.01 ‘*’ 0.05 ‘.’ 0.1 ‘ ’ 1

Residual standard error: 0.1019 on 114 degrees of freedom
Multiple R-squared:  0.2806,	Adjusted R-squared:  0.2679 
F-statistic: 22.23 on 2 and 114 DF,  p-value: 7.061e-09

> print( 
  summary( lm(baps~phi + ba, data = X ) )
)

Call:
lm(formula = baps ~ phi + ba, data = X)

Residuals:
       Min         1Q     Median         3Q        Max 
-0.0207353 -0.0045465  0.0002145  0.0050824  0.0221243 

Coefficients:
            Estimate Std. Error t value Pr(>|t|)    
(Intercept) 0.295464   0.004317  68.443  < 2e-16 ***
phi         0.026559   0.003284   8.087 7.27e-13 ***
ba          0.035375   0.002607  13.568  < 2e-16 ***
---
Signif. codes:  0 ‘***’ 0.001 ‘**’ 0.01 ‘*’ 0.05 ‘.’ 0.1 ‘ ’ 1

Residual standard error: 0.007693 on 114 degrees of freedom
Multiple R-squared:  0.6751,	Adjusted R-squared:  0.6694 
F-statistic: 118.4 on 2 and 114 DF,  p-value: < 2.2e-16


')

invisible('
RI
[1] 0.7140748 0.3597887
ARI
[1] 0.40656959 0.02318766
MI
[1] 0.2313616 0.3320168
VI
[1] 0.817614 3.124794
NVI
[1] 0.7696036 0.9026286
ID
[1] 0.5705611 2.9564084
NID
[1] 0.7121371 0.8981564
NMI
[1] 0.2878629 0.1018436

')

invisible('
RI rand index
W. M. Rand (1971). "Objective criteria for the evaluation of clustering methods". Journal of the American Statistical Association. American Statistical Association. 66 (336): 846–850. arXiv:1704.01036. doi:10.2307/2284239. JSTOR 2284239.
	probably same as cocluster_accuracy

ARI adjusted rand index 

MI is mutual information 
NMI normalized mutual information
Witten, Ian H. & Frank, Eibe (2005). Data Mining: Practical Machine Learning Tools and Techniques. Morgan Kaufmann, Amsterdam. ISBN 978-0-12-374856-0.

NVI is variance of infomration
	sort of the opposite of NMI 

ID and NID is information distance 
')
