# ----------------------------------------------------------------------------------------------------------
# global options

options(stringsAsFactors = FALSE)

# ----------------------------------------------------------------------------------------------------------
# load and install binary packages

ipak <- function(pkg){
    new.pkg <- pkg[!(pkg %in% installed.packages()[, "Package"])]
    if(length(new.pkg)) install.packages(new.pkg, dependencies = TRUE)
    sapply(pkg, require, character.only = TRUE)
}

packages <- c("data.table", "lme4", "car", "effects", "lsmeans", "ResourceSelection",
                       "ggplot2", "ggthemes", "scales", "reshape2", "plyr")
ipak(packages)

# ----------------------------------------------------------------------------------------------------------
# set working directory

data_path <- "my_data_filepath"
results_path <- "my_results_filepath"
setwd(results_path)

# ------------------------------------------------------------------------------------
# set global ggplot theme

theme_publication <- function(base_size = 12, base_family = "Helvetica", ...) {
      require(grid)
      require(ggthemes)
      (theme_foundation(base_size = base_size, base_family = base_family)
       + theme(plot.title = element_text(face = "bold", size = rel(1.2), hjust = 0.5),
               text = element_text(),
               panel.background = element_rect(color = NA),
               plot.background = element_rect(color = NA),
               panel.border = element_rect(color = "black", size = 1),
               axis.title = element_text(face = "plain", size = rel(1)),
               axis.title.x = element_text(size = 8, margin = margin(t=10)),
               axis.title.y = element_text(size = 8, angle = 90, margin = margin(r=10)),
               axis.text.x = element_text(size = 7.5, angle = 0, margin = margin(t=2)),
               axis.text.y = element_text(size = 7.5, hjust = 1, margin = margin(r=2)),
               axis.ticks = element_line(),
               strip.text = element_text(size = 7.5),
               strip.background = element_blank(),
               plot.margin = unit(c(10,5,5,5),"mm"),
               panel.grid.minor = element_blank(),
               panel.grid.major.y = element_line(size=.5, color="#f0f0f0"),
               panel.grid.major.x = element_blank(),
               legend.key = element_rect(color = NA),
               legend.spacing = unit(0.1, "cm"),
               legend.margin = margin(t=3,r=3,b=3,l=3, unit="pt"),
               legend.title = element_text(size = 8, face = "plain"),
               legend.key.size = unit(.6, units = "line"),
               legend.text = element_text(size = 7),
               legend.background = element_rect(fill = "gray95", color = "gray20",
                                                size = 0.5, linetype = "dotted")
          ))
}

# ----------------------------------------------------------------------------------------------------------          
# load data

rawSimInt <- read.csv(file.path(data_path, "simulated_interval_individual_lme_data.csv"))                           
catSimInt <- read.csv(file.path(data_path, "simulated_summary_interval_individual_lme_data.csv"))

# ---------------------------------------------------------------------------------------------------------- 
# omit Cuscuta (only used for supplementary materials)

# rawSimInt <- rawSimInt[!(rawSimInt$group %in% "Cuscuta"), ]
# rawSimIntStack <- rawSimIntStack[!(rawSimIntStack$group %in% "Cuscuta"), ]
# catSimInt <- catSimInt[!(catSimInt$group %in% "Cuscuta"), ]

# ----------------------------------------------------------------------------------------------------------
# clean summary data

# exclude RAxML dating method
catSimInt <- catSimInt[!(catSimInt$dating %in% "RAxML"), ]

catSimInt$constraint <- gsub("(\\w+)\\d$", "\\1", catSimInt$constraint)

# convert to factors
facvars <- c("tree", "type", "dating", "group", "metric", "ab", "constraint")
catSimInt[, facvars] <- lapply(catSimInt[, facvars], factor)

catSimInt$tree <- relevel(catSimInt$tree, ref = "master")

# merge species pool PD data
species_pool_PD <- read.csv(file.path(data_path, "species_pool_PD2.csv"))
catSimInt_SP_PD <- merge(catSimInt, species_pool_PD, by = c("group", "constraint", "type", "dating"), all.x = TRUE)

# exclude master trees
catSimInt_SP_PD <- catSimInt_SP_PD[catSimInt_SP_PD$tree != "master", ]

# ---------------------------------------------------------------------------------------------------------- 
# clean and reshape raw data for counts

rawSimInt_long <- melt(rawSimInt, 
                       id.vars = c("main", "tree", "group", "type", "dating", "constraint", "community_ID"), 
                       measure.vars = c("error_type.MPD_beta", "error_type.MNND_beta", "error_type.MPD_alpha",
                                        "error_type.MNND_alpha", "error_type.PD_alpha"), 
                       variable.name = "metric", 
                       value.name = "error_type")
                       
rawSimInt_long$metric_ab <- gsub(".+\\.(.+)", "\\1", rawSimInt_long$metric)    
rawSimInt_long$metric <- gsub("(.+)\\_.+", "\\1", rawSimInt_long$metric_ab)
rawSimInt_long$ab <- gsub(".+\\_(.+)", "\\1", rawSimInt_long$metric_ab)         

# aggregate counts
rawSimInt_count <- ddply(rawSimInt_long, .(main, tree, group, type, dating, constraint, metric, ab, error_type), summarise, count = length(error_type))
rawSimInt_count <- ddply(rawSimInt_count, .(main, tree, group, type, dating, constraint, metric, ab), transform, total = sum(count))
rawSimInt_count <- na.omit(rawSimInt_count)

# exclude RAxML dating method
rawSimInt_count <- rawSimInt_count[!(rawSimInt_count$dating %in% "RAxML"), ]

# convert to factors
facvars <- c("error_type", "tree", "main", "group", "type", "dating", "constraint", "metric", "ab")
rawSimInt_count[, facvars] <- lapply(rawSimInt_count[, facvars], factor)

# exclude master trees
rawSimInt_count <- rawSimInt_count[rawSimInt_count$tree != "master", ]

# ----------------------------------------------------------------------------------------------------------
# clean and reshape raw data for metrics

rawSimInt <- within(rawSimInt, {
	PD_alpha.diff <- (TRUE_PD.c - tree.ML_PD.c) / TRUE_PD.c
	MPD_alpha.diff <- (TRUE_MPD.c - tree.ML_MPD.c) / TRUE_MPD.c
	MNND_alpha.diff <- (TRUE_MNND.c - tree.ML_MNND.c) / TRUE_MNND.c
	MPD_beta.diff <- (ML.beta.MPD - tree.ML.beta.MPD) / ML.beta.MPD
	MNND_beta.diff <- (ML.beta.MNND - tree.ML.beta.MNND) / ML.beta.MNND
	})

rawSimInt_diversity <- ddply(rawSimInt, .(main, tree, group, type, dating, constraint), summarise, 
    PD_alpha.diff_avg = mean(PD_alpha.diff),
    MPD_alpha.diff_avg = mean(MPD_alpha.diff),
    MNND_alpha.diff_avg = mean(MNND_alpha.diff),
    MPD_beta.diff_avg = mean(MPD_beta.diff),
    MNND_beta.diff_avg = mean(MNND_beta.diff)
    )
    
rawSimInt_diversity_long <- melt(rawSimInt_diversity, 
                       id.vars = c("main", "tree", "group", "type", "dating", "constraint"), 
                       measure.vars = c("PD_alpha.diff_avg", "MPD_alpha.diff_avg", "MNND_alpha.diff_avg",
                                        "MPD_beta.diff_avg", "MNND_beta.diff_avg"), 
                       variable.name = "metric", 
                       value.name = "diversity_diff")
                       
rawSimInt_diversity_long$metric_ab <- gsub("(.+)\\..+", "\\1", rawSimInt_diversity_long$metric)    
rawSimInt_diversity_long$metric <- gsub("(.+)\\_.+", "\\1", rawSimInt_diversity_long$metric_ab)
rawSimInt_diversity_long$ab <- gsub(".+\\_(.+)", "\\1", rawSimInt_diversity_long$metric_ab) 

# exclude RAxML dating method
rawSimInt_diversity_long <- rawSimInt_diversity_long[!(rawSimInt_diversity_long$dating %in% "RAxML"), ]

rawSimInt_diversity_long$diversity_diff_abs <- abs(rawSimInt_diversity_long$diversity_diff)

# convert to factors
facvars <- c("main", "tree", "group", "type", "dating", "constraint", "metric", "ab")
rawSimInt_diversity_long[, facvars] <- lapply(rawSimInt_diversity_long[, facvars], factor)


########################################################################
######### Inferred community phylogeny versus true community phylogeny #########
########################################################################

# ----------------------------------------------------------------------------------------------------------
model0a <- lm(diversity_diff ~ ab * metric + main + type + dating + constraint + group,
              data = rawSimInt_diversity_long[rawSimInt_diversity_long$tree != "master", ])
summary(model0a)
Anova(model0a)

# marginal effects
eff0a <- allEffects(model0a)
eff0a_df <- as.data.frame(eff0a[["ab:metric"]])
eff0a_df <- na.omit(eff0a_df)

fig3 <- ggplot(eff0a_df, aes(x = metric, y = fit)) +
    geom_bar(stat = "identity", position = position_dodge(width = 0.9), color = "black", fill = "grey70", size = 0.3) +
    geom_errorbar(aes(ymin = lower, ymax = upper), stat = "identity", position = position_dodge(width = 0.9), width = 0.3) +
    facet_grid(. ~ ab, scales = "free_x", space = "free_x") +
    scale_y_continuous(breaks = seq(-1.25, 0.25, by = 0.25), labels = percent, limits = c(-1.1, 0)) + 
    labs(x = "Metric", y = expression("Inferred community phylogeny versus\ntrue community phylogeny"("%"~~Delta))) +
    theme_publication() +
    theme(legend.position = c(0.83, 0.13))
ggsave(fig3, file = "Figure_3.pdf", height = 3.5, width = 3.5)

# ----------------------------------------------------------------------------------------------------------
model0b <- lm(diversity_diff ~ tree * ab * metric + main + type + dating + constraint + group,
              data = rawSimInt_diversity_long)
summary(model0b)
Anova(model0b)

# marginal effects
eff0b <- allEffects(model0b)
eff0b_df <- as.data.frame(eff0b[["tree:ab:metric"]])
eff0b_df <- na.omit(eff0b_df)

figSA2 <- ggplot(eff0b_df, aes(x = metric, y = fit, fill = tree)) +
    geom_bar(stat = "identity", position = position_dodge(width = 0.9), color = "black", size = 0.3) +
    geom_errorbar(aes(ymin = lower, ymax = upper), stat = "identity", position = position_dodge(width = 0.9), width = 0.3) +
    facet_grid(. ~ ab, scales = "free_x", space = "free_x") +
    scale_y_continuous(breaks = seq(-1.25, 0.25, by = 0.25), labels = percent, limits = c(-1.05, 0.25)) + # c(-1.38, 0.25)
    scale_fill_tableau(palette = "tableau10") +
    labs(x = "Metric", y = expression("Inferred community phylogeny versus\ntrue community phylogeny"("%"~~Delta))) +
    theme_publication() +
    theme(legend.position = c(0.83, 0.13))
ggsave(figSA2, file = "Figure_SA2.pdf", height = 3.5, width = 3.5)


########################################################################
########## Comparisons of error by community type and dating method ############
########################################################################

model1_a <- glm(cbind(error, 1800 - error) ~ dating * metric * type + constraint + group,      
          family = binomial(link = "logit"),
          data = catSimInt[with(catSimInt, ab %in% "alpha" & tree %in% "community"), ])  
summary(model1_a)
Anova(model1_a, type = "III")

model1_b <- glm(cbind(error, 1800 - error) ~ dating * metric * type + constraint + group,      
          family = binomial(link = "logit"),
          data = catSimInt[with(catSimInt, ab %in% "beta" & tree %in% "community"), ])          
summary(model1_b)
Anova(model1_b, type = "III")

# goodness of fit
hoslem.test(model1_a$y, fitted(model1_a), g = length(coef(model1_a)) + 1)
hoslem.test(model1_b$y, fitted(model1_b), g = length(coef(model1_b)) + 1)

# contrasts
Means_a_type <- pmmeans(model1_a, specs = ~ type) 
contrast(Means_a_type, method = "pairwise", adjust = "holm")

Means_b_type <- pmmeans(model1_b, specs = ~ type) 
contrast(Means_b_type, method = "pairwise", adjust = "holm")

Means_a_dating <- pmmeans(model1_a, specs = ~ dating) 
contrast(Means_a_dating, method = "pairwise", adjust = "holm")

Means_b_dating <- pmmeans(model1_b, specs = ~ dating) 
contrast(Means_b_dating, method = "pairwise", adjust = "holm")

Means_a_metric <- pmmeans(model1_a, specs = ~ metric) 
contrast(Means_a_metric, method = "pairwise", adjust = "holm")

Means_b_metric <- pmmeans(model1_b, specs = ~ metric) 
contrast(Means_b_metric, method = "pairwise", adjust = "holm")

# marginal effects
eff1_a <- allEffects(model1_a)
eff1_a_df <- as.data.frame(eff1_a[["dating:metric:type"]])
eff1_a_df$ab <- "Alpha" 

eff1_b <- allEffects(model1_b)
eff1_b_df <- as.data.frame(eff1_b[["dating:metric:type"]])
eff1_b_df$ab <- "Beta"

eff1_df_sim <- rbind.data.frame(eff1_a_df, eff1_b_df)
eff1_df_sim$analysis <- "Sim" 

load("eff1_df_emp.Rdata") 
eff1_df <- rbind.data.frame(eff1_df_sim, eff1_df_emp)
eff1_df <- na.omit(eff1_df)
eff1_df$upper[eff1_df$upper > 0.9] <- NA

eff1_df$analysis <- factor(eff1_df$analysis, levels = c("Sim", "Emp"))

fig4 <- ggplot(eff1_df, aes(x = reorder(type, fit), y = fit, fill = metric)) +
    geom_bar(stat = "identity", position = position_dodge(width = 0.9), color = "black", size = 0.3) +
    geom_errorbar(aes(ymin = lower, ymax = upper), stat = "identity", position = position_dodge(width = 0.9), width = 0.3, size = 0.4) +
    facet_grid(ab ~ analysis + dating, scales = "free_y", space = "free_y") +
    scale_fill_tableau(palette = "tableau10", name = "Metric") +
    scale_y_continuous(breaks = seq(0, 1, by = 0.1), labels = percent) + # limits = c(0, 0.2)
    labs(x = "Community Type", y = "Probability of error") +
    theme_publication() +
    theme(legend.position = c(0.75, 0.78))
ggsave(fig4, file = "Figure_4.pdf", height = 3.5, width = 7.5)


########################################################################
############### Comparisons of error type (Sign, type 1, type 2) #################
########################################################################

model2 <- glm(cbind(count, 1800 - count) ~ error_type * metric * ab + main + group + type + dating + constraint,     
          family = binomial(link = "logit"),
          data = rawSimInt_count)  
summary(model2)

# goodness of fit
hoslem.test(model2$y, fitted(model2), g = length(coef(model2)) + 1)

# contrasts
Means_2 <- pmmeans(model2, specs = ~ error_type | metric + ab) 
contrast(Means_2, method = "pairwise", adjust = "holm")

# marginal effects
eff2 <- allEffects(model2)
eff2_df <- as.data.frame(eff2[["error_type:metric:ab"]])
eff2_df <- na.omit(eff2_df)

eff2_df_stacked <- ddply(eff2_df, .(metric, ab), transform, total = sum(fit))
eff2_df_stacked <- within(eff2_df_stacked, prop <- fit / total)
eff2_df_stacked <- droplevels(eff2_df_stacked)
eff2_df_stacked$error_type <- factor(eff2_df_stacked$error_type, levels = rev(levels(eff2_df_stacked$error_type)))

figS1a <- ggplot(eff2_df_stacked, aes(x = metric, y = prop, fill = error_type)) +
    geom_bar(stat = "identity", color = "black", size = 0.3) +
    facet_grid(. ~ ab, scales = "free_x", space = "free_x") +
    scale_fill_tableau(palette = "tableau10", name = "Error Type", guide = guide_legend(reverse = TRUE)) +
    scale_y_continuous(breaks = seq(0, 1, by = 0.2), labels = percent) +
    labs(x = "Metric", y = "Probability of error") +
    theme_publication() +
    theme(legend.position = "none")
ggsave(figS1a, file = "Figure_S1a.pdf", height = 2.93, width = 3.25)


########################################################################
##### Error by the total diversity (PD, MPD, MNND) and species pool (c1+c2+c3) #####
########################################################################

# ----------------------------------------------------------------------------------------------------------
# PD

model3_PD <- glm(cbind(error, 1800 - error) ~ PD.all.AVG * ab * metric * type + dating + constraint + group ,     
          family = binomial(link = "logit"),
          data = catSimInt_SP_PD) 
summary(model3_PD)

# goodness of fit
hoslem.test(model3_PD$y, fitted(model3_PD), g = length(coef(model3_PD)) + 1)

# marginal effects
eff3_PD <- allEffects(model3_PD, xlevels = list(PD.all.AVG = seq(100, 10000, by = 100)))
eff3_PD_df <- as.data.frame(eff3_PD[["PD.all.AVG:ab:metric:type"]])
eff3_PD_df <- na.omit(eff3_PD_df)
eff3_PD_df <- eff3_PD_df[with(eff3_PD_df, metric != "PD" | ab != "beta"), ]
eff3_PD_df[with(eff3_PD_df, type == "D" & metric == "MNND"), c("lower", "upper")] <- NA 

thousands <- function(x) x/1000

fig5 <- ggplot(eff3_PD_df, aes(x = PD.all.AVG, y = fit, fill = metric, color = metric)) +
    geom_ribbon(aes(ymin = lower, ymax = upper), stat = "identity", alpha = 0.3, color = NA) +
    geom_line(stat = "identity",  size = 0.6) +
    facet_grid(ab ~ type, scales = "free_y", space = "free_y") +
    scale_color_tableau(palette = "tableau10", name = "Metric") +
    scale_fill_tableau(palette = "tableau10", name = "Metric") +
    scale_y_continuous(breaks = seq(0, 1, by = 0.05), labels = percent) +
    scale_x_continuous(labels = thousands, breaks = seq(0, 10000, by = 2500)) +
    labs(x = "PD (species pool)", y = "Probability of error") +
    theme_publication() +
    theme(legend.position = c(0.88, 0.78),
               panel.grid.major.x = element_line(size=.5, color="#f0f0f0"))
ggsave(fig5, file = "Figure_5.pdf", height = 4, width = 7)


# ----------------------------------------------------------------------------------------------------------
# MPD

model3_MPD <- glm(cbind(error, 1800 - error) ~ MPD.all.AVG * ab * metric * type + dating + constraint + group ,     
          family = binomial(link = "logit"),
          data = catSimInt_SP_PD) 
summary(model3_MPD)

# goodness of fit
hoslem.test(model3_MPD$y, fitted(model3_MPD), g = length(coef(model3_MPD)) + 1)

# marginal effects
eff3_MPD <- allEffects(model3_MPD, xlevels = list(MPD.all.AVG = seq(10, 450, by = 10)))
eff3_MPD_df <- as.data.frame(eff3_MPD[["MPD.all.AVG:ab:metric:type"]])
eff3_MPD_df <- na.omit(eff3_MPD_df)
eff3_MPD_df <- eff3_MPD_df[with(eff3_MPD_df, metric != "PD" | ab != "beta"), ]
eff3_MPD_df[with(eff3_MPD_df, type == "D" & metric == "MNND"), c("lower", "upper")] <- NA 

thousands <- function(x) x/1000

figS2 <- ggplot(eff3_MPD_df, aes(x = MPD.all.AVG, y = fit, fill = metric, color = metric)) +
    geom_ribbon(aes(ymin = lower, ymax = upper), stat = "identity", alpha = 0.3, color = NA) +
    geom_line(stat = "identity",  size = 0.6) +
    facet_grid(ab ~ type, scales = "free_y", space = "free_y") +
    scale_color_tableau(palette = "tableau10", name = "Metric") +
    scale_fill_tableau(palette = "tableau10", name = "Metric") +
    scale_y_continuous(breaks = seq(0, 1, by = 0.05), labels = percent) +
    scale_x_continuous(breaks = seq(0, 500, by = 100)) +
    labs(x = "MPD (species pool)", y = "Probability of error") +
    theme_publication() +
    theme(legend.position = c(0.88, 0.78),
               panel.grid.major.x = element_line(size=.5, color="#f0f0f0"))
ggsave(figS2, file = "Figure_S2.pdf", height = 4, width = 7)


########################################################################
############# Error compared among community trees and master trees ############
########################################################################

# ----------------------------------------------------------------------------------------------------------
# additive model

model4 <- glm(cbind(error, 1800 - error) ~ tree + type + dating + constraint + ab + metric + group,
              family = binomial(link = "logit"),
              data = catSimInt)
summary(model4)
Anova(model4)

Means4 <- lsmeans(model4, specs = ~ tree)
contrast(Means4, method = "pairwise")

# goodness of fit
hoslem.test(model4$y, fitted(model4), g = length(coef(model4)) + 1)

# marginal effects
eff4 <- allEffects(model4)
eff4_df <- as.data.frame(eff4[["tree"]])

figSA3a <- ggplot(eff4_df, aes(x = tree, y = fit)) +
    geom_bar(stat = "identity", color = "black", fill = "grey70", size = 0.3) +
    geom_errorbar(aes(ymin = lower, ymax = upper), stat = "identity", width = 0.2, size = 0.4) +
    scale_y_continuous(breaks = seq(0, 1, by = 0.01), labels = percent, limits = c(0, 0.049)) +
    labs(x = "Tree", y = "Probability of error") +
    theme_publication() +
    theme(axis.text.x = element_text(angle = 30, hjust = 1, vjust = 1))
ggsave(figSA3a, file = "Figure_SA3a.pdf", height = 2.77, width = 1.5)


# ----------------------------------------------------------------------------------------------------------
# interaction model

model4_int <- glm(cbind(error, 1800 - error) ~ tree * dating * metric * ab + group + type,     
          family = binomial(link = "logit"),
          data = catSimInt[with(catSimInt, constraint %in% "constrained"), ]) 
summary(model4_int)

# goodness of fit
hoslem.test(model4_int$y, fitted(model4_int), g = length(coef(model4_int)) + 1)

# marginal effects
eff4_int <- allEffects(model4_int)
eff4_int_df <- as.data.frame(eff4_int[["tree:dating:metric:ab"]])
eff4_int_df <- na.omit(eff4_int_df)
eff4_int_df$upper[eff4_int_df$upper > 0.9] <- NA

figSA3b <- ggplot(eff4_int_df, aes(x = tree, y = fit, fill = metric)) +
    geom_bar(stat = "identity", position = position_dodge(width = 0.9), color = "black", size = 0.3) +
    geom_errorbar(aes(ymin = lower, ymax = upper), stat = "identity", position = position_dodge(width = 0.9), width = 0.3, size = 0.4) +
    facet_grid(ab ~ dating, scales = "free_y", space = "free_y") +
    scale_fill_tableau(palette = "tableau10", name = "Metric") +
    scale_y_continuous(breaks = seq(0, 1, by = 0.025), labels = percent) + # limits = c(0, 0.271)
    labs(x = "Tree", y = "Probability of error") +
    theme_publication() +
    theme(legend.position = c(0.5, 0.76),
               axis.text.x = element_text(angle = 30, hjust = 1, vjust = 1))
ggsave(figSA3b, file = "Figure_SA3b.pdf", height = 3, width = 4)