library(ggplot2)
library(dplyr)
library(tidyr)
library(ggpubr)
library(rstatix)

# Simulation results table (tab-separated); the first column supplies the
# row names.
dat <- read.delim("data/tricou_simulations_table.tsv", row.names = 1)

# Sampled counts from 100 down to 10 in steps of 10, and the matching
# percentage labels ("10%", "9%", ..., "1%") obtained by dividing by 1000.
# `percsampled` is used below to fix the order of the `prop` factor levels.
nbsampled <- seq(100, 10, by = -10)
percsampled <- paste0((nbsampled / 1000) * 100, "%")

# Figure S1: boxplots of `err` for each sampling percentage (`prop`, ordered
# by `percsampled`), filled by extinction rate and with the line type set by
# `type`. A dashed red reference line is drawn at y = 50.
pdf('figures/Fig_S1.pdf', width = 9.5, height = 6)
# The plot is printed explicitly: ggplot objects are only auto-printed at the
# interactive top level or under Rscript, so without print() the PDF would be
# empty when this file is run through source().
print(
  ggplot(dat, aes(x = factor(prop, levels = percsampled), y = err,
                  fill = factor(ext_rate), linetype = type)) +
    geom_boxplot() +
    xlab("Percentage of the total number of species considered") +
    ylab("Percentage of the stem-length-based\npredictions of the order of events") +
    geom_hline(yintercept = 50, linetype = "dashed", color = "red") +
    # annotate() draws each label exactly once. A geom_label() layer with
    # constant aesthetics would inherit `dat` and draw one copy of the label
    # per data row, all stacked on top of each other.
    annotate("label", x = 9, y = 90, label = "correct predictions",
             fill = "grey", color = "white") +
    annotate("label", x = 9, y = 10, label = "erroneous predictions",
             fill = "grey", color = "white") +
    theme(legend.position = "bottom")
)
dev.off()

# Mean `err` for every (ext_rate, prop, type) combination, reshaped so that
# each `type` value becomes its own column, plus the ratio of the
# 'correct-pred' mean to the 'erroneous-pred' mean (`correct_fold`).
sumdat <- dat %>%
  group_by(ext_rate, prop, type) %>%
  summarise(error_mean = mean(err)) %>%
  # summarise() only peels off the last grouping level; drop the remaining
  # grouping so later steps operate on a plain, ungrouped table.
  ungroup() %>%
  # pivot_wider() is the maintained replacement for the superseded spread().
  pivot_wider(names_from = type, values_from = error_mean)

sumdat$correct_fold <- sumdat$`correct-pred` / sumdat$`erroneous-pred`

# Figure S2: `correct_fold` against extinction rate, one coloured line per
# sampling percentage (`prop`, legend ordered by `percsampled`).
pdf('figures/Fig_S2.pdf', width = 6.6 * 0.9, height = 4 * 0.9)
# Printed explicitly so the figure is also written when the script is run
# through source(), where top-level ggplot objects are not auto-printed.
print(
  # NOTE(review): reorder() places the extinction rates on the x axis in
  # order of their average `correct_fold`, not in numeric order -- confirm
  # this is the intended ordering.
  ggplot(sumdat, aes(reorder(ext_rate, correct_fold), correct_fold,
                     colour = factor(prop, percsampled), group = prop)) +
    geom_point() +
    geom_line() +
    xlab('Extinction rate') +
    ylab('Correct prediction ratio') +
    labs(colour = 'Sampled tips')
)
dev.off()

# Per-simulation means table (tab-separated), prepared for the faceted
# boxplot and the pairwise tests below.
means <- read.csv('data/Mean24fold.tsv', sep = '\t')
means$ext_rate <- factor(means$ext_rate)
# Reuse the `percsampled` labels defined at the top of the script instead of
# recomputing them, so the level order cannot drift from the other figures.
means$prop <- factor(means$prop, levels = percsampled)
# Human-readable facet labels; every value other than 'correct-pred' is
# labelled as an erroneous prediction.
means$type <- ifelse(means$type == 'correct-pred', 'Correct prediction', 'Erroneous prediction')

# Wilcoxon tests of `mean` between extinction rates ("0" vs "0.5" and
# "0.5" vs "0.9"), run separately for each (type, prop) group, followed by
# significance symbols and bracket coordinates for stat_pvalue_manual().
# NOTE(review): the figure built from this table uses a free y axis per
# facet row; add_xy_position() may need a matching `scales` setting for the
# brackets to follow each panel's range -- confirm against the rstatix docs.
stat.test <- means %>%
  group_by(type, prop) %>%
  wilcox_test(
    mean ~ ext_rate,
    comparisons = list(c("0", "0.5"), c("0.5", "0.9"))
  ) %>%
  add_significance() %>%
  add_xy_position(x = "ext_rate", fun = "mean", step.increase = 0.03)

# Figure S3: boxplots of `mean` per extinction rate, faceted by `type` (rows)
# and `prop` (columns), with the significant pairwise comparisons from
# `stat.test` drawn on top and a dashed grey reference line at y = 50.
pdf('figures/Fig_S3.pdf', width = 10, height = 5)
# Printed explicitly so the figure is also written when the script is run
# through source(), where top-level ggplot objects are not auto-printed.
print(
  ggplot(means, aes(x = ext_rate, y = mean)) +
    geom_boxplot(aes(fill = ext_rate, color = ext_rate), alpha = 0.6) +
    # NOTE(review): x is mapped to `ext_rate`, while this label describes the
    # facet columns (`prop`) -- confirm the axis title is intentional.
    xlab("Percentage of the total number of species considered") +
    ylab("Mean percentage of the stem-length-based predictions\nof the order of events, per simulation") +
    geom_hline(yintercept = 50, linetype = "dashed", color = "darkgrey") +
    facet_grid(type ~ prop, scales = "free_y") +
    # Only significant comparisons are shown (hide.ns), without bracket tips.
    stat_pvalue_manual(stat.test, hide.ns = TRUE, tip.length = 0)
)
dev.off()

# Branch-length simulation table. This replaces the `dat` loaded earlier in
# the script. `death` is turned into a factor so it can be used as a
# discrete colour below.
dat <- read.csv("data/branch_lengths_sim.csv")
dat[["death"]] <- factor(dat[["death"]])

# Figure 2A: density of `tip_nlens_mean`, one curve per `death` level, with
# fixed colours for the levels '0.5' and '0.9'.
pdf('figures/Fig_2A.pdf', width = 5.42, height = 3.15)
# Printed explicitly so the figure is also written when the script is run
# through source(), where top-level ggplot objects are not auto-printed.
print(
  ggplot(dat, aes(x = tip_nlens_mean, colour = death)) +
    # NOTE(review): recent ggplot2 releases prefer `linewidth` over `size`
    # for line thickness; `size` is kept here for compatibility with the
    # ggplot2 version this script was written against -- confirm.
    geom_density(size = 1) +
    xlab('Tip length / branch length mean') +
    ylab('Density') +
    labs(colour = 'Extinction\nrate') +
    scale_colour_manual(values = c('0.5' = 'darkorange3', '0.9' = 'steelblue'))
)
dev.off()
