################################################################################
#                                                                              #
#             Exemplary application of the additive hazard model               #
#                                                                              #
################################################################################

##------------------------------------------------------------------------------
##
## General Setup 
##
##------------------------------------------------------------------------------

# set working directory where data set is located accordingly 
setwd("Path")

# clear environment 
rm(list=ls(all=TRUE))

# read data from HALLUCA study
halluca <- read.csv("halluca_short.csv", header=TRUE, sep="")

# data preparation 
# (i.e. define column providing survival time, event and censoring indicator)
halluca$t <- ifelse(halluca$t==0, .01, halluca$t)
halluca$surv <- halluca$t/12
halluca$event <- ifelse(halluca$d==1, 1, 0)
halluca$cens <- ifelse(halluca$d==0, 1, 0)
drops <- which(halluca$TNMmis==1)
halluca <- halluca[-drops,]

# initial values for parameters to be estimated with respect to the chosen distribution
# Log-logistic distribution
initialLl <- c(b_TNMIV = 0.7, log_h0_a= 1.3, log_h0_b= .3)
# Weibull distribution
initialWb <- c(b_TNMIV = 0.7, log_h0_a=-0.2, log_h0_b=3)
# Gompertz distribution
initialGo <- c(b_TNMIV = 0.7, log_h0_a=-4,   log_h0_b=4)  
# Log-normal distribution
initialLn <- c(b_TNMIV = 0.7, log_h0_a=1.3,  log_h0_b=1.3) 
# Exponential distribution
initialEx <- c(b_TNMIV = 0.7,                log_h0_b=0.5)
# Gamma distribution
initialGa <- c(b_TNMIV = 0.7, log_h0_a=0.5,  log_h0_b=2)   

##------------------------------------------------------------------------------
##
## Defining overall function that applies our additive hazard model 
##
##------------------------------------------------------------------------------

ParamAddHaz <- function(distribution, initial){
  # as input factors, the function needs a string defining the distribution 
  # (e.g. "Weibull") and a vector containing the initial values for the 
  # parameters to be estimated, namely b_TNMIV, log_h0_a, and log_h0_b 
  
  ##----------------------------------------------------------------------------
  ##
  ## Defining log-likelihood functions to be optimized
  ##
  ##----------------------------------------------------------------------------
  
  # Function for Weibull distribution ------------------------------------------
  logLL.Weibull <- function(parms, data)
  {
    b_TNMIV <- as.numeric(parms["b_TNMIV"])
    log_h0_a <- as.numeric(parms["log_h0_a"])
    log_h0_b <- as.numeric(parms["log_h0_b"])
    TNMIV <- data["TNMIV"]
    cens <- as.numeric(data$cens)
    surv <- as.numeric(data$surv)
    event <- data["event"]
    
    h0_a <- exp(log_h0_a)
    h0_b <- exp(log_h0_b)
    
    linp <- b_TNMIV*TNMIV
    
    h0_density <- (h0_a/h0_b)*(surv/h0_b)^(h0_a-1)*exp(-(surv/h0_b)^h0_a)
    h0_survival <- exp(-(surv/h0_b)^h0_a)
    
    ll <- (1-cens)*(log(h0_density + linp*h0_survival)-surv*linp) +
      (cens)*(log(h0_survival)-surv*linp)
    
    return(-sum(ll))
  }
  
  # Function for Log-Logistic distribution -------------------------------------
  logLL.Loglogistic <- function(parms, data)
  {
    b_TNMIV <- as.numeric(parms["b_TNMIV"])
    log_h0_a <- as.numeric(parms["log_h0_a"])
    log_h0_b <- as.numeric(parms["log_h0_b"])
    TNMIV <- data["TNMIV"]
    cens <- as.numeric(data$cens)
    surv <- as.numeric(data$surv)
    event <- data["event"]
    
    h0_a <- exp(log_h0_a)
    h0_b <- exp(log_h0_b)
    
    linp <- b_TNMIV*TNMIV
    
    h0_density <- ((h0_b/h0_a)*(surv/h0_a)^(h0_b - 1))/((1 +(surv/h0_a)^h0_b)^2)
    h0_survival <- 1 - (1/(1 + (surv/h0_a)^(-h0_b)))
    
    ll <- (1-cens)*(log(h0_density + linp*h0_survival)-surv*linp) +
      (cens)*(log(h0_survival)-surv*linp)
    
    return(-sum(ll))
  }
  
  # Function for Log-Normal distribution ---------------------------------------
  logLL.Lognormal <- function(parms, data)
  {
    b_TNMIV <- as.numeric(parms["b_TNMIV"])
    log_h0_a <- as.numeric(parms["log_h0_a"])
    log_h0_b <- as.numeric(parms["log_h0_b"])
    TNMIV <- data["TNMIV"]
    cens <- as.numeric(data$cens)
    surv <- as.numeric(data$surv)
    event <- data["event"]
    
    h0_a <- exp(log_h0_a)
    h0_b <- exp(log_h0_b)
    
    linp <- b_TNMIV*TNMIV
    
    h0_density <- (1/(surv*h0_a*sqrt(2*pi)))*exp(-((log(surv)-h0_b)^2)/(2*h0_a^2))
    arg_NV <- (log(surv)-h0_b)/h0_a
    h0_survival <- 1 - pnorm(arg_NV,0,1)
    
    ll <- (1-cens)*(log(h0_density + linp*h0_survival)-surv*linp) +
      (cens)*(log(h0_survival)-surv*linp)
    
    return(-sum(ll))
  }
  # Function for Gompertz distribution -----------------------------------------
  logLL.Gompertz <- function(parms, data)
  {
    b_TNMIV <- as.numeric(parms["b_TNMIV"])
    log_h0_a <- as.numeric(parms["log_h0_a"])
    log_h0_b <- as.numeric(parms["log_h0_b"])
    TNMIV <- data["TNMIV"]
    cens <- as.numeric(data$cens)
    surv <- as.numeric(data$surv)
    event <- data["event"]
    
    h0_a <- exp(log_h0_a)
    h0_b <- exp(log_h0_b)
    
    linp <- b_TNMIV*TNMIV
    
    h0_density <- h0_a*h0_b*exp(h0_a*surv)*exp(h0_b)*exp(-h0_b*exp(h0_a*surv))
    h0_survival <- exp(-h0_b*(exp(h0_a*surv)-1))
    
    ll <- (1-cens)*(log(h0_density + linp*h0_survival)-surv*linp) +
      (cens)*(log(h0_survival)-surv*linp)
    
    return(-sum(ll))
  }
  # Function for Gamma distribution --------------------------------------------
  logLL.Gamma <- function(parms, data)
  {
    b_TNMIV <- as.numeric(parms["b_TNMIV"])
    log_h0_a <- as.numeric(parms["log_h0_a"])
    log_h0_b <- as.numeric(parms["log_h0_b"])
    TNMIV <- data["TNMIV"]
    cens <- as.numeric(data$cens)
    surv <- as.numeric(data$surv)
    event <- data["event"]
    
    h0_a <- exp(log_h0_a)
    h0_b <- exp(-log_h0_b)
    
    linp <- b_TNMIV*TNMIV
    
    h0_density <- dgamma(surv,h0_a,h0_b)
    h0_survival <- 1 - (pgamma(surv,h0_a,h0_b))
    
    ll <- (1-cens)*(log(h0_density + linp*h0_survival)-surv*linp) +
      (cens)*(log(h0_survival)-surv*linp)
    
    return(-sum(ll))
  }
  # Function for Exponential distribution --------------------------------------
  logLL.Exponential <- function(parms, data)
  {
    b_TNMIV <- as.numeric(parms["b_TNMIV"])
    log_h0_b <- as.numeric(parms["log_h0_b"])
    TNMIV <- data["TNMIV"]
    cens <- as.numeric(data$cens)
    surv <- as.numeric(data$surv)
    event <- data["event"]
    
    h0_b <- exp(log_h0_b)
    
    linp <- b_TNMIV*TNMIV
    
    h0_density <- (1/h0_b)*exp(-surv/h0_b)
    h0_survival <- exp(-surv/h0_b)
    
    ll <- (1-cens)*(log(h0_density + linp*h0_survival)-surv*linp) +
      (cens)*(log(h0_survival)-surv*linp)
    
    return(-sum(ll))
  }
  
  
  ##----------------------------------------------------------------------------
  ##
  ## Executing maximum likelihood estimation 
  ##
  ##----------------------------------------------------------------------------
  
  # optimizing the maximum likelihood function defined for each respective 
  # function, using the initial parameter values provided and data from the 
  # HALLUCA study
  
  if (distribution == "Weibull") {
    logLL.Weibull(initialWb, halluca)
    # fit the data for each distribution separately 
    fit <- optim(par = initialWb, fn = logLL.Weibull, data = halluca, hessian = TRUE)
    
  } else if (distribution == "Loglogistic") {
    logLL.Loglogistic(initialLl, halluca)
    fit <- optim(par = initialLl, fn = logLL.Loglogistic, data = halluca, hessian = TRUE)
    
  } else if (distribution == "Lognormal") {
    logLL.Lognormal(initialLn, halluca)
    fit <- optim(par = initialLn, fn = logLL.Lognormal, data = halluca, hessian = TRUE)
    
  } else if (distribution == "Gamma") {
    logLL.Gamma(initialGa, halluca)
    fit <- optim(par = initialGa, fn = logLL.Gamma, data = halluca, hessian = TRUE)
    
  } else if (distribution == "Gompertz") {
    logLL.Gompertz(initialGo, halluca)
    fit <- optim(par = initialGo, fn = logLL.Gompertz, data = halluca, hessian = TRUE, 
                 control = list(maxit = 100) )
    
  } else if (distribution == "Exponential") {
    logLL.Exponential(initialEx, halluca)
    fit <- optim(par = initialEx, fn = logLL.Exponential, data = halluca, hessian = TRUE)
  }
  
  ##----------------------------------------------------------------------------
  ##
  ## Transform and return estimation results
  ##
  ##----------------------------------------------------------------------------
  
  results <- function(est, conf.level = 0.95, df)
    # 1.) est = output of optim fitting procedure (with hessian = TRUE)
    # 2.) conf.level, e.g. = 0.95 for 95\% Confidence Intervals
    # 3.) df = degree of freedom for Students t-distribution
  {
    # original parameter
    b_TNMIV <- as.numeric(est$par[1])
    if (distribution == "Exponential") {
      log_h0_b <- as.numeric(est$par[2])
    }
    else{
      log_h0_a <- as.numeric(est$par[2])
      log_h0_b <- as.numeric(est$par[3])
    }
    
    # define baseline mean and median according to the respective distribution
    if (distribution == "Weibull") {
      BaselineMedian <- exp(log_h0_b) * log(2)**(1/exp(log_h0_a)) 
      BaselineMean <- exp(log_h0_b)*gamma( 1 + (1/exp(log_h0_a))) 
    } else if (distribution == "Loglogistic") {
      BaselineMedian <- exp(log_h0_a)
      BaselineMean <- (exp(log_h0_a)*pi/exp(log_h0_b))/(sin(pi/exp(log_h0_b)))
    } else if (distribution == "Lognormal") {
      BaselineMedian <- exp(exp(log_h0_b))
      BaselineMean <- exp(exp(log_h0_b) + (exp(log_h0_a))**2/2)
    } else if (distribution == "Gamma") {
      BaselineMean <- exp(log_h0_b)*exp(log_h0_a)
    } else if (distribution == "Gompertz") {
      BaselineMedian <- (1/exp(log_h0_a))*log((-1/exp(log_h0_b))*log(0.5)+1)
    } else if (distribution == "Exponential") {
      BaselineMedian <- exp(log_h0_b)*log(2)
      BaselineMean <- exp(log_h0_b) 
    }
    
    # Inverse Fisher information matrix
    invFisher <- solve(est$hessian)
    
    # standard errors of estimated parameters
    SEs <- sqrt(diag(invFisher))
    names(SEs) <- rownames(invFisher)
    crit <- qt(1-(1-conf.level)/2, df = df-1)

    # confidence intervals for b_TNMIV, log_h0_a and log_h0_b
    CI_b_TNMIV <- b_TNMIV + crit*c(-1,1)*SEs["b_TNMIV"]
    
    if (distribution == "Exponential") {
      CI_log_h0_b <- log_h0_b + crit*c(-1,1)*SEs["log_h0_b"]
    }
    else {
      CI_log_h0_a <- log_h0_a + crit*c(-1,1)*SEs["log_h0_a"]
      CI_log_h0_b <- log_h0_b + crit*c(-1,1)*SEs["log_h0_b"]
    }
    
    # confidence intervals for baseline mean and median
    if (distribution == "Weibull") {
      grad2 <- deriv(~exp(log_h0_b) * log(2)**(1/exp(log_h0_a)), c("log_h0_a", "log_h0_b"), function.arg = TRUE)
      fact2 <- as.numeric(attr(grad2(log_h0_a, log_h0_b), "gradient"))
      invFisher_select <- invFisher[2:3, 2:3]
      grad3 <- deriv(~exp(log_h0_b)*gamma( 1 + (1/exp(log_h0_a))), c("log_h0_a", "log_h0_b"), function.arg = TRUE)
      fact3 <- as.numeric(attr(grad3(log_h0_a, log_h0_b), "gradient"))
      
      # confidence interval for baseline median
      SEs["BaselineMedian"] <- sqrt(fact2%*%invFisher_select%*%t(t(fact2)))
      CI_BaselineMedian <- BaselineMedian+crit*c(-1,1)*as.vector(sqrt(fact2%*%invFisher_select%*%t(t(fact2))))
      
      # confidence interval for baseline mean
      SEs["BaselineMean"] <- sqrt(fact3%*%invFisher_select%*%t(t(fact3)))
      CI_BaselineMean <- BaselineMean+crit*c(-1,1)*as.vector(sqrt(fact3%*%invFisher_select%*%t(t(fact3))))
    }
    
    if (distribution == "Lognormal") {
      grad2 <- deriv(~exp(exp(log_h0_b)), c("log_h0_b"), function.arg = TRUE)
      fact2 <- as.numeric(attr(grad2(log_h0_b), "gradient"))
      invFisher_select_median <- invFisher[3:3, 3:3]
      grad3 <- deriv(~exp(exp(log_h0_b) + (exp(log_h0_a))**2/2), c("log_h0_a", "log_h0_b"), function.arg = TRUE)
      invFisher_select <- invFisher[2:3, 2:3]
      fact3 <- as.numeric(attr(grad3(log_h0_a, log_h0_b), "gradient"))
      
      # confidence interval for baseline median
      SEs["BaselineMedian"] <- sqrt(fact2*invFisher_select_median*t(t(fact2)))
      CI_BaselineMedian <- BaselineMedian+crit*c(-1,1)*as.vector(sqrt(fact2*invFisher_select_median*t(t(fact2))))
      
      # confidence interval for baseline mean
      SEs["BaselineMean"] <- sqrt(fact3%*%invFisher_select%*%t(t(fact3)))
      CI_BaselineMean <- BaselineMean+crit*c(-1,1)*as.vector(sqrt(fact3%*%invFisher_select%*%t(t(fact3))))
    }
    
    if (distribution == "Loglogistic") {
      grad2 <- deriv(~exp(log_h0_a), c("log_h0_a"), function.arg = TRUE)
      fact2 <- as.numeric(attr(grad2(log_h0_a), "gradient"))
      invFisher_select_median <- invFisher[2:2, 2:2]
      grad3 <- deriv(~(exp(log_h0_a)*pi/exp(log_h0_b))/(sin(pi/exp(log_h0_b))), c("log_h0_a", "log_h0_b"), function.arg = TRUE)
      invFisher_select <- invFisher[2:3, 2:3]
      fact3 <- as.numeric(attr(grad3(log_h0_a, log_h0_b), "gradient"))
      
      # confidence interval for baseline median
      SEs["BaselineMedian"] <- sqrt(fact2*invFisher_select_median*t(t(fact2)))
      CI_BaselineMedian <- BaselineMedian+crit*c(-1,1)*as.vector(sqrt(fact2*invFisher_select_median*t(t(fact2))))
      
      # confidence interval for baseline mean
      SEs["BaselineMean"] <- sqrt(fact3%*%invFisher_select%*%t(t(fact3)))
      CI_BaselineMean <- BaselineMean+crit*c(-1,1)*as.vector(sqrt(fact3%*%invFisher_select%*%t(t(fact3))))
    }
    
    if (distribution == "Exponential") {
      grad2 <- deriv(~exp(log_h0_b)*log(2), c("log_h0_b"), function.arg = TRUE)
      fact2 <- as.numeric(attr(grad2(log_h0_b), "gradient"))
      invFisher_select <- invFisher[2:2, 2:2]
      grad3 <- deriv(~exp(log_h0_b), c("log_h0_b"), function.arg = TRUE)
      fact3 <- as.numeric(attr(grad3(log_h0_b), "gradient"))
      
      # confidence interval for baseline median
      SEs["BaselineMedian"] <- sqrt(fact2*invFisher_select*t(t(fact2)))
      CI_BaselineMedian <- BaselineMedian+crit*c(-1,1)*as.vector(sqrt(fact2*invFisher_select*t(t(fact2))))
      
      # confidence interval for baseline mean
      SEs["BaselineMean"] <- sqrt(fact3*invFisher_select*t(t(fact3)))
      CI_BaselineMean <- BaselineMean+crit*c(-1,1)*as.vector(sqrt(fact3*invFisher_select*t(t(fact3))))
    }
    if (distribution == "Gamma") {
      invFisher_select <- invFisher[2:3, 2:3]
      grad3 <- deriv(~exp(log_h0_b)*exp(log_h0_a), c("log_h0_a", "log_h0_b"), function.arg = TRUE)
      fact3 <- as.numeric(attr(grad3(log_h0_a, log_h0_b), "gradient"))
      
      # confidence interval for baseline median
      SEs["BaselineMedian"] <- NA
      BaselineMedian <- NA
      CI_BaselineMedian <- c(NA,NA)
      
      # confidence interval for baseline mean
      SEs["BaselineMean"] <- sqrt(fact3%*%invFisher_select%*%t(t(fact3)))
      CI_BaselineMean <- BaselineMean+crit*c(-1,1)*as.vector(sqrt(fact3%*%invFisher_select%*%t(t(fact3))))
    }
    if (distribution == "Gompertz") {
      grad2 <- deriv(~(1/exp(log_h0_a))*log((-1/exp(log_h0_b))*log(0.5)+1), c("log_h0_a", "log_h0_b"), function.arg = TRUE)
      fact2 <- as.numeric(attr(grad2(log_h0_a, log_h0_b), "gradient"))
      invFisher_select <- invFisher[2:3, 2:3]
      
      # confidence interval for baseline median
      SEs["BaselineMedian"] <- sqrt(fact2%*%invFisher_select%*%t(t(fact2)))
      CI_BaselineMedian <- BaselineMedian+crit*c(-1,1)*as.vector(sqrt(fact2%*%invFisher_select%*%t(t(fact2))))
      
      # confidence interval for baseline mean
      SEs["BaselineMean"] <- NA
      BaselineMean <- NA
      CI_BaselineMean <- c(NA,NA)
    }
    
    # Output data set
    if (distribution == "Exponential") {
      out <- data.frame(rbind(
        par = c(b_TNMIV, log_h0_b, BaselineMedian, BaselineMean),
        SE = c(SEs["b_TNMIV"], SEs["log_h0_b"], SEs["BaselineMedian"], SEs["BaselineMean"]),
        CI_lb = c(CI_b_TNMIV[1], CI_log_h0_b[1], CI_BaselineMedian[1], CI_BaselineMean[1]),
        CI_ub = c(CI_b_TNMIV[2], CI_log_h0_b[2], CI_BaselineMedian[2], CI_BaselineMean[2])))
      names(out) <- c("b_TNMIV", "log_h0_b", "BaselineMedian", "BaselineMean")
      out
    }
    else {
      out <- data.frame(rbind(
        par = c(b_TNMIV, log_h0_a, log_h0_b, BaselineMedian, BaselineMean),
        SE = c(SEs["b_TNMIV"], SEs["log_h0_a"], SEs["log_h0_b"], SEs["BaselineMedian"], SEs["BaselineMean"]),
        CI_lb = c(CI_b_TNMIV[1], CI_log_h0_a[1], CI_log_h0_b[1], CI_BaselineMedian[1], CI_BaselineMean[1]),
        CI_ub = c(CI_b_TNMIV[2], CI_log_h0_a[2], CI_log_h0_b[2], CI_BaselineMedian[2], CI_BaselineMean[2])))
      names(out) <- c("b_TNMIV","log_h0_a", "log_h0_b", "BaselineMedian", "BaselineMean")
      out
    }
  }
  
  # return estimated values (b_TNMIV, log_h0_a, log_h0_b, baseline median and 
  # mean along with confidence intervals and standard error)
  res <- results(est = fit, conf.level = 0.95, df=dim(halluca)[1])
  return(res)
}

##------------------------------------------------------------------------------
##
## Executing the estimation 
##
##------------------------------------------------------------------------------

# execute for Weibull distribution 
res_WB <- ParamAddHaz(distribution = "Weibull", initial = initialWb)
print(round(res_WB,4))

# execute for Log-Normal distribution 
res_Ln <- ParamAddHaz(distribution = "Lognormal", initial = initialLn)
print(round(res_Ln,4))

# execute for Gamma distribution 
res_Ga <- ParamAddHaz(distribution = "Gamma", initial = initialGa)
print(round(res_Ga,4))

# execute for Log-Logistic distribution 
res_Ll <- ParamAddHaz(distribution = "Loglogistic", initial = initialLl) 
print(round(res_Ll,4))

# execute for Gompertz distribution 
res_Go <- ParamAddHaz(distribution = "Gompertz", initial = initialGo)
print(round(res_Go,4))

# execute for Exponential distribution 
res_Ex <- ParamAddHaz(distribution = "Exponential", initial = initialEx)
print(round(res_Ex,4))