NSCLC TruSight Oncology 500 (TSO 500) Landscape Study 2024

Author

Zachary D Wallen

Published

October 24, 2024

1 Setting up the R environment

1.1 Create needed functions

# Function to convert numbers from strings back to numbers in excel output
# parameters:
#       df - the data.frame being outputted into excel
#       wb - the excel workbook object
#    sheet - the excel sheet name
# colnames - does df have column names, TRUE/FALSE
convertNum <- function(df, wb, sheet, colnames) {
    library(foreach)
    cn <- expand.grid(seq_len(ncol(df)), seq_len(nrow(df)))[, 1]
    rn <- expand.grid(seq_len(ncol(df)), seq_len(nrow(df)))[, 2]
    trash <- foreach(cn = cn, rn = rn) %dopar% {
        if (!is.numeric(df[rn, cn]) && !is.na(as.numeric(as.character(df[rn, cn])))) {
            row.offset <- ifelse(colnames, 1, 0)
            if (df[rn, cn] == Inf) {
                openxlsx::writeData(wb, sheet, "Inf", startCol = cn, startRow = row.offset + rn)
            } else {
                openxlsx::writeData(wb, sheet, as.numeric(as.character(df[rn, cn])),
                    startCol = cn, startRow = row.offset + rn
                )
            }
        }
    }
}

# Function to sort a matrix for better visualization of mutual exclusivity
# of alterations across samples
# Retrieved from https://gist.github.com/armish/564a65ab874a770e2c26, and modified
# to work with character, binary, or logical matrices where NA, '', 0, or FALSE
# represents the absence of alterations in a gene
# parameters:
#      M - an alteration by sample matrix
memoSort <- function(M) {
    geneOrder <- sort(
        rowSums(!is.na(M) & M != "" & M > 0 & M != FALSE),
        decreasing = TRUE, index.return = TRUE
    )$ix
    scoreCol <- function(x) {
        score <- 0
        for (i in 1:length(x)) {
            if (!is.na(x[i]) & x[i] != "" & x[i] > 0 & x[i] != FALSE) {
                score <- score + 2^(length(x) - i)
            }
        }
        return(score)
    }
    scores <- apply(M[geneOrder, ], 2, scoreCol)
    sampleOrder <- sort(scores, decreasing = TRUE, index.return = TRUE)$ix
    return(M[geneOrder, sampleOrder])
}

# Function to perform association testing between each level of a categorical variable and
# one or more binary variables describing different group strata. Performs Fisher's
# exact test when no covariates are specified, and Firth's logistic regression (via the
# logistf functionin the logistf package) when covariates are specified.
# parameters:
#       var - column name of the target categorical variable to test
#    strata - a vector of categorical variable column names denoting different strata
#       ref - a vector specifying the reference level for each group specified with strata
#             (only used when performing Firth's logisitic regression)
#    covars - a vector of variable names to include as covariates in testing
# trend.var - column name of a variable to use for performing a test of trend across strata
#   data.df - a data.frame containing the variables to be included in testing
catStratTest <- function(var, strata, ref = NULL, covars = NULL, trend.var = NULL, data.df) {
    # prepare lists for output
    n <- list()
    stats <- list()
    or <- list()
    p <- list()

    # perform testing and summary statistic calculations for each test variables
    for (i in strata) {
        i <- as.character(i)

        # calculate frequency table and total N
        df <- table(data.df[, var], data.df[, i])
        n[[i]] <- colSums(df)

        # for each level of the target variable...
        for (j in rownames(df)) {
            j <- as.character(j)

            # calculate frequency table for presence and absence of current level
            df2 <- table(ifelse(data.df[, var] == j, 0, 1), data.df[, i])

            # calculate summary statistics
            stats[[i]][j] <- list(apply(df2, 2, function(x) {
                paste(x, " ", "(", round(x / sum(x) * 100, 1), "%", ")", sep = "")
            }))

            if (!is.null(covars)) {
                # perform Firth's logistic regression
                res <- logistf::logistf(
                    ifelse(data.df[, var] == j, 1, 0) ~ .,
                    data = data.frame(
                        x = ifelse(data.df[, i] == ref[which(strata == i)], " ", data.df[, i]),
                        data.df[, covars, drop = FALSE]
                    )
                )
                or[[i]][j] <- list(paste(
                    round(exp(res$coefficients[2]), 2),
                    " [", round(exp(res$ci.lower[2]), 2), "; ",
                    round(exp(res$ci.upper[2]), 2), "]",
                    sep = ""
                ))
                p[[i]][j] <- list(formatC(res$prob[2], digits = 1, format = "e"))
            } else {
                # perform Fisher's exact test
                res <- fisher.test(df2)
                or[[i]][j] <- list(paste(
                    round(res$estimate, 2),
                    " [", round(res$conf.int[1], 2), "; ",
                    round(res$conf.int[2], 2), "]",
                    sep = ""
                ))
                p[[i]][j] <- list(formatC(res$p.value, digits = 1, format = "e"))
            }
        }
    }

    # if provided, perform test for trend for each variable level using the trend variable
    if (!is.null(trend.var)) {
        ptrend <- c()
        for (j in seq_along(names(table(data.df[, var])))) {
            if (!is.null(covars)) {
                ptrend <- c(ptrend, as.vector(
                    anova(
                        logistf::logistf(
                            ifelse(
                                data.df[, var] == names(table(data.df[, var]))[j], 1, 0
                            ) ~ .,
                            data = data.df[, c(trend.var, covars)]
                        ),
                        logistf::logistf(
                            ifelse(
                                data.df[, var] == names(table(data.df[, var]))[j], 1, 0
                            ) ~ .,
                            data = data.df[, covars, drop = FALSE]
                        ),
                        method = "PLR"
                    )$pval
                ))
                names(ptrend)[j] <- names(table(data.df[, var]))[j]
            } else {
                ptrend <- c(ptrend, as.vector(
                    anova(
                        logistf::logistf(
                            ifelse(
                                data.df[, var] == names(table(data.df[, var]))[j], 1, 0
                            ) ~ data.df[, trend.var]
                        ),
                        logistf::logistf(
                            ifelse(
                                data.df[, var] == names(table(data.df[, var]))[j], 1, 0
                            ) ~ 1
                        ),
                        method = "PLR"
                    )$pval
                ))
                names(ptrend)[j] <- names(table(data.df[, var]))[j]
            }
        }
    } else {
        ptrend <- NULL
    }
    return(list(n = n, stats = stats, or = or, p = p, ptrend = ptrend))
}

# Function to perform association testing between a quantitative variable and
# one or more binary variables describing different group strata. Performs linear
# regression with or without adjustment for covariates.
# parameters:
#       var - column name of the target quantitative variable to test
#    strata - a vector of categorical variable column names denoting different strata
#       ref - a vector specifying the reference level for each group specified with strata
#    covars - a vector of variable names to include as covariates in testing
# trend.var - column name of a variable to use for performing a test of trend across strata
#   data.df - a data.frame containing the variables to be included in testing
quantStratTest <- function(var, strata, ref, covars = NULL, trend.var = NULL, data.df) {
    # prepare lists for output
    n <- list()
    stats <- list()
    beta <- list()
    p <- list()

    # perform testing and summary statistic calculations for each test variables
    for (i in strata) {
        i <- as.character(i)

        # calculate summary statistics
        df <- ddply(
            data.df, i,
            .fun = function(x) {
                c(
                    n = length(na.omit(x[, var])),
                    mean = mean(na.omit(x[, var])),
                    sd = sd(na.omit(x[, var]))
                )
            }
        )
        n[[i]] <- df$n
        names(n[[i]]) <- df[, i]
        stats[[i]] <- paste(round(df$mean, 1), round(df$sd, 1), sep = "±")
        names(stats[[i]]) <- df[, i]

        # perform linear regression with or without covariates
        if (!is.null(covars)) {
            res <- lm(
                data.df[, var] ~ .,
                data = data.frame(
                    x = ifelse(data.df[, i] == ref[which(strata == i)], " ", data.df[, i]),
                    data.df[, covars, drop = FALSE]
                )
            )
            beta[[i]] <- paste(
                round(summary(res)$coefficients[2, 1], 2),
                " [", round(confint(res)[2, 1], 2), "; ", round(confint(res)[2, 2], 2), "]",
                sep = ""
            )
            p[[i]] <- formatC(summary(res)$coefficients[2, 4], digits = 1, format = "e")
        } else {
            res <- lm(
                data.df[, var] ~ .,
                data = data.frame(
                    x = ifelse(data.df[, i] == ref[which(strata == i)], " ", data.df[, i])
                )
            )
            beta[[i]] <- paste(
                round(summary(res)$coefficients[2, 1], 2),
                " [", round(confint(res)[2, 1], 2), "; ", round(confint(res)[2, 2], 2), "]",
                sep = ""
            )
            p[[i]] <- formatC(summary(res)$coefficients[2, 4], digits = 1, format = "e")
        }
    }

    # if provided, perform test for trend using the trend variable
    if (!is.null(trend.var)) {
        if (!is.null(covars)) {
            ptrend <- anova(
                lm(data.df[, var] ~ ., data = data.df[, c(trend.var, covars)]),
                lm(data.df[, var] ~ ., data = data.df[, covars, drop = FALSE])
            )$`Pr(>F)`[2]
        } else {
            ptrend <- anova(
                lm(data.df[, var] ~ ., data = data.df[, trend.var, drop = FALSE]),
                lm(data.df[, var] ~ 1)
            )$`Pr(>F)`[2]
        }
    } else {
        ptrend <- NULL
    }
    return(list(n = n, stats = stats, beta = beta, p = p, ptrend = ptrend))
}

# Function to perform pairwise Fisher exact tests or Firth's penalized logistic regression
# on an alteration by sample binary matrix for detection of co-occurrence between alterations.
# Will also calculate the proportion of overlapping samples each pair of alteration were detected in together.
# Will perform Fisher exact test if no covariates provided and logistic regression if they
# are provided.
# parameters:
#      M - a sample by alteration binary matrix where '1' denotes a detected alteration
#          and '0' denotes absence of detected alteration for each sample
# covars - NULL (default) or data.frame of covariates to include in the analysis. Should
#          have same rownames (sample IDs) as 'M'.
alterationCoOcurr <- function(M, covars = NULL) {
    # create matrices for outputs
    or.matrix <- matrix(nrow = ncol(M), ncol = ncol(M), dimnames = list(colnames(M), colnames(M)))
    or.lower.matrix <- matrix(nrow = ncol(M), ncol = ncol(M), dimnames = list(colnames(M), colnames(M)))
    or.upper.matrix <- matrix(nrow = ncol(M), ncol = ncol(M), dimnames = list(colnames(M), colnames(M)))
    p.matrix <- matrix(nrow = ncol(M), ncol = ncol(M), dimnames = list(colnames(M), colnames(M)))
    prop.matrix <- matrix(nrow = ncol(M), ncol = ncol(M), dimnames = list(colnames(M), colnames(M)))

    # if covariates given, make sure covariate data lines up with alteration matrix
    if (!is.null(covars)) {
        if (
            !(sum(rownames(M) %in% rownames(covars)) != nrow(M)) |
                !(sum(rownames(covars) %in% rownames(M)) != nrow(covars))
        ) {
            stop("samples included in feature and sample data are different")
        }
        covars <- covars[rownames(M), ]
    }

    # begin calculations and testing
    for (x in seq_len(ncol(M))) {
        for (y in seq_len(ncol(M))) {
            # create testing data for pair of alterations
            if (!is.null(covars)) {
                test.data <- data.frame(y = M[, y], x = M[, x], covars)
            } else {
                test.data <- data.frame(y = M[, y], x = M[, x])
            }

            # perform testing
            if (
                colnames(M)[x] == colnames(M)[y] |
                    sum(grepl("TMB", colnames(M)[x]) + grepl("TMB", colnames(M)[y])) == 2 |
                    sum(grepl("PD.*?L1", colnames(M)[x]) + grepl("PD.*?L1", colnames(M)[y])) == 2
            ) {
                # give null results if testing the same variable or a level of immune marker
                # with another level of the same immune marker
                or.matrix[x, y] <- 1
                or.lower.matrix[x, y] <- NA
                or.upper.matrix[x, y] <- NA
                p.matrix[x, y] <- 1
                prop.matrix[x, y] <- 1
            } else if (
                sum(test.data$x[!is.na(test.data$x) & !is.na(test.data$y)]) == 0 |
                    sum(test.data$y[!is.na(test.data$x) & !is.na(test.data$y)]) == 0
            ) {
                # give null results for alterations with no overlapping results due to
                # missing data
                or.matrix[x, y] <- 1
                or.lower.matrix[x, y] <- NA
                or.upper.matrix[x, y] <- NA
                p.matrix[x, y] <- 1
                prop.matrix[x, y] <- 0
            } else {
                # perform test
                if (!is.null(covars)) {
                    res <- logistf::logistf(y ~ ., data = test.data)
                } else {
                    res <- with(test.data, fisher.test(y, x))
                }

                # add results to matrices
                if (!is.null(covars)) {
                    or.matrix[x, y] <- exp(res$coefficients["x"])
                    or.lower.matrix[x, y] <- exp(res$ci.lower["x"])
                    or.upper.matrix[x, y] <- exp(res$ci.upper["x"])
                    p.matrix[x, y] <- res$prob["x"]
                } else {
                    or.matrix[x, y] <- res$estimate
                    or.lower.matrix[x, y] <- res$conf.int[1]
                    or.upper.matrix[x, y] <- res$conf.int[2]
                    p.matrix[x, y] <- res$p.value
                }
                prop.matrix[x, y] <- sum(na.omit(M[, x] + M[, y] == 2)) /
                    sum(na.omit(M[, x] + M[, y] > 0))
            }
        }
    }

    # multiple test correct p-values for unique tests
    padj.matrix <- p.matrix
    padj.matrix[lower.tri(padj.matrix)] <- p.adjust(
        padj.matrix[lower.tri(padj.matrix)],
        method = "BH"
    )
    padj.matrix[upper.tri(padj.matrix)] <- p.adjust(
        padj.matrix[upper.tri(padj.matrix)],
        method = "BH"
    )

    # combine results into one data.frame
    res <- data.frame(
        `Feature 1` = t(combn(rownames(or.matrix), 2))[, 1],
        `Feature 2` = t(combn(rownames(or.matrix), 2))[, 2],
        `Proportion of co-occurrence` = prop.matrix[lower.tri(prop.matrix)],
        OR = or.matrix[lower.tri(or.matrix)],
        `OR 95%CI lower` = or.lower.matrix[lower.tri(or.lower.matrix)],
        `OR 95%CI upper` = or.upper.matrix[lower.tri(or.upper.matrix)],
        `P-value` = p.matrix[lower.tri(p.matrix)],
        `Adjusted P-value` = padj.matrix[lower.tri(padj.matrix)],
        check.names = FALSE
    )
    res <- res[order(res$`Adjusted P-value`), ]

    # return data
    return(list(
        result.summary = res,
        or.matrix = or.matrix,
        or.lower.matrix = or.lower.matrix,
        or.upper.matrix = or.upper.matrix,
        p.matrix = p.matrix,
        padj.matrix = padj.matrix,
        prop.matrix = prop.matrix
    ))
}

1.2 Load required R packages

library(plyr)
library(readxl)
library(tibble)
library(openxlsx)
library(foreach)
library(kableExtra)
library(ggplot2)
library(ggpubr)
library(ggtext)
library(plotly)
library(GGally)
library(igraph)
library(ggraph)

1.3 Create excel styles used for formatting output

bold <- createStyle(textDecoration = "bold")
left_just <- createStyle(halign = "left", valign = "center", wrapText = TRUE)
right_just <- createStyle(halign = "right", valign = "center", wrapText = TRUE)
center <- createStyle(halign = "center", valign = "center", wrapText = TRUE)
horizontal_border_med <- createStyle(border = "top", borderStyle = "medium")
horizontal_border_thin <- createStyle(border = "top", borderStyle = "thin")
percentage <- createStyle(numFmt = "PERCENTAGE")

2 Statistical analysis

2.1 Import and prepare data

# read data
patient.df <- data.frame(read_xlsx(
    "Data/NSCLC_TSO500_Landscape_Study_de_id_data.xlsx",
    sheet = 1,
    na = c("NULL", "NA", "N/A")
))
variant.df <- data.frame(read_xlsx(
    "Data/NSCLC_TSO500_Landscape_Study_de_id_data.xlsx",
    sheet = 2,
    na = c("NULL", "NA", "N/A")
))
therapy.df <- data.frame(read_xlsx(
    "Data/NSCLC_TSO500_Landscape_Study_de_id_data.xlsx",
    sheet = 3,
    na = c("NULL", "NA", "N/A")
))

# remove data for patients without adenocarcinoma or squamous cell carcinoma
# and for hypermutated patients
patient.df <- patient.df[
    patient.df$histology_group %in% c("Adenocarcinoma", "Squamous cell carcinoma") &
        !(patient.df$de_id %in% patient.df$de_id[patient.df$TMB > 200]),
]
therapy.df <- therapy.df[therapy.df$de_id %in% patient.df$de_id, ]
variant.df <- variant.df[variant.df$de_id %in% patient.df$de_id, ]

# make sure factor levels of age bins are in the correct order
patient.df$patient_age_cat <- factor(
    patient.df$patient_age_cat,
    levels = c(
        sort(unique(patient.df$patient_age_cat))[-1],
        sort(unique(patient.df$patient_age_cat))[1]
    )
)

# order tissue specimen status
patient.df$tissue_status <- factor(
    patient.df$tissue_status,
    levels = c("Primary", "Advanced", "Metastatic")
)

# collapse sub-groups of clinical stages
patient.df$stage[grep("Stage IV|Metastatic", patient.df$stage)] <- "Stage IV"
patient.df$stage[grep("Stage III", patient.df$stage)] <- "Stage III"
patient.df$stage[
    grepl("Stage II", patient.df$stage) & patient.df$stage != "Stage III"
] <- "Stage II"
patient.df$stage[
    grepl("Stage I", patient.df$stage) &
        patient.df$stage != "Stage IV" &
        patient.df$stage != "Stage III" &
        patient.df$stage != "Stage II"
] <- "Stage I"
patient.df$stage[
    is.na(patient.df$stage) | !grepl("Stage I", patient.df$stage)
] <- "Unknown"

# re-create TMB interpretation with very high/high/low, log transform TMB,
# and make sure to mask TMB score for those that failed testing
patient.df$TMB_interp <- NA
patient.df$TMB_interp[patient.df$TMB >= 20] <- "Very high (≥20)"
patient.df$TMB_interp[patient.df$TMB < 20 & patient.df$TMB >= 10] <- "High (10-19)"
patient.df$TMB_interp[patient.df$TMB < 10] <- "Not high (<10)"
patient.df$TMB_interp[
    patient.df$de_id %in% patient.df$de_id[patient.df$DNA_TMB %in% c("Fail", "Not Performed")]
] <- NA
patient.df$TMB[
    patient.df$de_id %in% patient.df$de_id[patient.df$DNA_TMB %in% c("Fail", "Not Performed")]
] <- NA
patient.df$TMB_interp <- factor(
    patient.df$TMB_interp,
    levels = c("Not high (<10)", "High (10-19)", "Very high (≥20)")
)

# re-create PD-L1 with high/low/negative and make sure to mask PD-L1 score for those
# that failed testing
patient.df$PD_L1_IHC_22C3_interp <- NA
patient.df$PD_L1_IHC_22C3_interp[
    patient.df$PD_L1_IHC_22C3_result >= 50
] <- "High (≥50%)"
patient.df$PD_L1_IHC_22C3_interp[
    patient.df$PD_L1_IHC_22C3_result < 50 &
        patient.df$PD_L1_IHC_22C3_result >= 1
] <- "Low (1-49%)"
patient.df$PD_L1_IHC_22C3_interp[
    patient.df$PD_L1_IHC_22C3_result == 0
] <- "Negative (<1%)"
patient.df$PD_L1_IHC_22C3_interp[
    patient.df$de_id %in%
        patient.df$de_id[patient.df$PD_L1_IHC_22C3 %in% c("Fail", "Not Performed")]
] <- NA
patient.df$PD_L1_IHC_22C3_result[
    patient.df$de_id %in%
        patient.df$de_id[patient.df$PD_L1_IHC_22C3 %in% c("Fail", "Not Performed")]
] <- NA
patient.df$PD_L1_IHC_22C3_interp <- factor(
    patient.df$PD_L1_IHC_22C3_interp,
    levels = c("Negative (<1%)", "Low (1-49%)", "High (≥50%)")
)

# recode MSI variable and make those without MSI or failed testing NA for MSI
patient.df$MSI[patient.df$MSI == "MSI_H"] <- "MSI High"
patient.df$MSI[patient.df$MSI == "MSS"] <- "Stable"
patient.df$MSI[patient.df$MSI == "FT"] <- NA
patient.df$MSI[patient.df$MSI == ""] <- NA

# recode SNV type variable to make more simple categories and add CNV and Fusion entries
variant.df$snv_type[grep("^Complex|^Nonstop|^Promoter|Unknown", variant.df$snv_type)] <- "Other SNV"
variant.df$snv_type[grep("^Deletion|^Insertion", variant.df$snv_type)] <- "Indel"
variant.df$snv_type[grep("^Splice", variant.df$snv_type)] <- "Splice site"
variant.df$snv_type[grep("^Substitution", variant.df$snv_type)] <- "Substitution"
variant.df$snv_type[variant.df$variant_type == "CNV"] <- "CNV"
variant.df$snv_type[variant.df$variant_type == "Fusion"] <- "Fusion"

# recode AMP/CAP/ASCO tier variable to only have two categories and be more clear
therapy.df$amp_cap_asco_tier[
    grep("1", therapy.df$amp_cap_asco_tier)
] <- "Guideline indicated"
therapy.df$amp_cap_asco_tier[
    grep("2", therapy.df$amp_cap_asco_tier)
] <- "Clinical trial or therapy in other tumor type"

# create variables to stratify patients by histology
patient.df$adeno <- ifelse(patient.df$histology_group == "Adenocarcinoma", 1, 2)
patient.df$scc <- ifelse(patient.df$histology_group == "Squamous cell carcinoma", 1, 2)
hist_strata <- c("adeno", "scc")

# add histology groups to all data
variant.df <- merge(
    patient.df[, c("de_id", "histology_group")],
    variant.df,
    sort = FALSE
)
therapy.df <- merge(
    patient.df[, c("de_id", "histology_group")],
    therapy.df,
    sort = FALSE
)

2.2 Patient characteristics

# create result data.frame
results <- data.frame()
results <- rbind(
    results,
    data.frame(
        Variable = "",
        N = "All patients", `Summary stats` = "",
        N1 = "Adenocarcinoma", `N1 summary stats` = "",
        N2 = "Squamous cell carcinoma", `N2 summary stats` = "", OR = "", P = "",
        check.names = FALSE
    )
)
results <- rbind(
    results,
    data.frame(
        Variable = "Variable",
        N = "N", `Summary stats` = "Summary stats",
        N1 = "N", `N1 summary stats` = "Summary stats",
        N2 = "N", `N2 summary stats` = "Summary stats", OR = "OR/Beta", P = "P",
        check.names = FALSE
    )
)

# get total number of patients
results <- rbind(
    results,
    data.frame(
        Variable = "Total number of patients",
        N = nrow(patient.df), `Summary stats` = "-",
        N1 = table(patient.df$histology_group)[1], `N1 summary stats` = "-",
        N2 = table(patient.df$histology_group)[2], `N2 summary stats` = "-", OR = "-", P = "-",
        check.names = FALSE
    )
)

### gender

# perform testing
var <- "patient_gender"
res <- catStratTest(
    var = var, strata = "histology_group",
    data.df = patient.df[patient.df$patient_gender != "Unspecified", ]
)

# get totals for all patients
data.df <- table(
    patient.df[patient.df$patient_gender != "Unspecified", var],
    rep("0", nrow(patient.df[patient.df$patient_gender != "Unspecified", ]))
)

# add results to result data.frame
results <- rbind(
    results,
    data.frame(
        Variable = c("Gender (N, %)", rownames(data.df)),
        N = c(sum(data.df), rep("", nrow(data.df))),
        `Summary stats` = c(
            "",
            paste(
                rowSums(data.df), " ", "(", round(rowSums(data.df) / sum(data.df) * 100, 1), "%", ")",
                sep = ""
            )
        ),
        N1 = c(res$n[[1]][1], rep("", nrow(data.df))),
        `N1 summary stats` = c("", res$stats[[1]][[1]][, 1]),
        N2 = c(res$n[[1]][2], rep("", nrow(data.df))),
        `N2 summary stats` = c("", res$stats[[1]][[1]][, 2]),
        OR = c("", "", res$or[[1]][[2]][1]),
        P = c("", "", res$p[[1]][[2]][1]),
        check.names = FALSE
    )
)

### age

# perform testing
var <- "patient_age"
res <- quantStratTest(
    var = var, strata = "histology_group", ref = "Adenocarcinoma", data.df = patient.df
)

# add results to result data.frame
results <- rbind(
    results,
    data.frame(
        Variable = "Age (Mean±SD)",
        N = nrow(patient.df[!is.na(patient.df[, var]), ]),
        `Summary stats` = paste(
            round(mean(na.omit(patient.df[, var])), 1),
            round(sd(na.omit(patient.df[, var])), 1),
            sep = "±"
        ),
        N1 = res$n[[1]][1],
        `N1 summary stats` = res$stats[[1]][1],
        N2 = res$n[[1]][2],
        `N2 summary stats` = res$stats[[1]][2],
        OR = res$beta[[1]],
        P = res$p[[1]],
        check.names = FALSE
    )
)

### age group

# perform testing
var <- "patient_age_cat"
res <- catStratTest(var = var, strata = "histology_group", data.df = patient.df)

# get totals for all patients
data.df <- table(patient.df[, var], rep("0", nrow(patient.df)))

# add results to result data.frame
results <- rbind(
    results,
    data.frame(
        Variable = c("Age group (N, %)", rownames(data.df)),
        N = c(sum(data.df), rep("", nrow(data.df))),
        `Summary stats` = c(
            "",
            paste(
                rowSums(data.df), " ", "(", round(rowSums(data.df) / sum(data.df) * 100, 1), "%", ")",
                sep = ""
            )
        ),
        N1 = c(res$n[[1]][1], rep("", nrow(data.df))),
        `N1 summary stats` = c("", sapply(res$stats[[1]], function(x) {
            x[1, 1]
        })),
        N2 = c(res$n[[1]][2], rep("", nrow(data.df))),
        `N2 summary stats` = c("", sapply(res$stats[[1]], function(x) {
            x[1, 2]
        })),
        OR = c("", sapply(res$or[[1]], function(x) {
            x[1]
        })),
        P = c("", sapply(res$p[[1]], function(x) {
            x[1]
        })),
        check.names = FALSE
    )
)

### histology

# get frequencies
data.df <- table(patient.df$histology_group, patient.df$histology_group)

# add results to result data.frame
results <- rbind(
    results,
    data.frame(
        Variable = c("Histology group (N, %)", rownames(data.df)),
        N = c(sum(data.df), rep("", nrow(data.df))),
        `Summary stats` = c(
            "",
            paste(
                rowSums(data.df), " ", "(", round(rowSums(data.df) / sum(data.df) * 100, 1), "%", ")",
                sep = ""
            )
        ),
        N1 = c(sum(data.df[, 1]), rep("", nrow(data.df))),
        `N1 summary stats` = c(
            "",
            gsub(
                "0 \\(0\\%\\)", "-",
                paste(
                    data.df[, 1], " ", "(", round(data.df[, 1] / sum(data.df[, 1]) * 100, 1), "%", ")",
                    sep = ""
                )
            )
        ),
        N2 = c(sum(data.df[, 2]), rep("", nrow(data.df))),
        `N2 summary stats` = c(
            "",
            gsub(
                "0 \\(0\\%\\)", "-",
                paste(
                    data.df[, 2], " ", "(", round(data.df[, 2] / sum(data.df[, 2]) * 100, 1), "%", ")",
                    sep = ""
                )
            )
        ),
        OR = c("", rep("-", nrow(data.df))),
        P = c("", rep("-", nrow(data.df))),
        check.names = FALSE
    )
)

### tissue specimen location

# perform testing
var <- "tissue_status"
res <- catStratTest(var = var, strata = "histology_group", data.df = patient.df)

# get totals for all patients
data.df <- table(patient.df[, var], rep("0", nrow(patient.df)))

# add results to result data.frame
results <- rbind(
    results,
    data.frame(
        Variable = c("Tissue specimen location (N, %)", rownames(data.df)),
        N = c(sum(data.df), rep("", nrow(data.df))),
        `Summary stats` = c(
            "",
            paste(
                rowSums(data.df), " ", "(", round(rowSums(data.df) / sum(data.df) * 100, 1), "%", ")",
                sep = ""
            )
        ),
        N1 = c(res$n[[1]][1], rep("", nrow(data.df))),
        `N1 summary stats` = c("", sapply(res$stats[[1]], function(x) {
            x[1, 1]
        })),
        N2 = c(res$n[[1]][2], rep("", nrow(data.df))),
        `N2 summary stats` = c("", sapply(res$stats[[1]], function(x) {
            x[1, 2]
        })),
        OR = c("", sapply(res$or[[1]], function(x) {
            x[1]
        })),
        P = c("", sapply(res$p[[1]], function(x) {
            x[1]
        })),
        check.names = FALSE
    )
)

### unknown clinical stage

# get frequencies
data.df <- table(
    factor(
        ifelse(patient.df$stage == "Unknown",
            "Unknown clinical stage (N, %)",
            "Known clinical stage (N, %)"
        ),
        levels = c("Unknown clinical stage (N, %)", "Known clinical stage (N, %)")
    ),
    patient.df$histology_group
)

# add results to result data.frame
results <- rbind(
    results,
    data.frame(
        Variable = rownames(data.df),
        N = rep("", 2),
        `Summary stats` = paste(
            rowSums(data.df), " ", "(", round(rowSums(data.df) / sum(data.df) * 100, 1), "%", ")",
            sep = ""
        ),
        N1 = rep("", 2),
        `N1 summary stats` = paste(
            data.df[, 1], " ", "(", round(data.df[, 1] / sum(data.df[, 1]) * 100, 1), "%", ")",
            sep = ""
        ),
        N2 = rep("", 2),
        `N2 summary stats` = paste(
            data.df[, 2], " ", "(", round(data.df[, 2] / sum(data.df[, 2]) * 100, 1), "%", ")",
            sep = ""
        ),
        OR = rep("-", nrow(data.df)),
        P = rep("-", nrow(data.df)),
        check.names = FALSE
    )
)

### clinical stage

# perform testing
var <- "stage"
res <- catStratTest(
    var = var, strata = "histology_group", data.df = patient.df[patient.df[, var] != "Unknown", ]
)

# get totals for all patients
data.df <- table(
    patient.df[patient.df[, var] != "Unknown", var],
    rep("0", nrow(patient.df[patient.df[, var] != "Unknown", ]))
)

# add results to result data.frame
results <- rbind(
    results,
    data.frame(
        Variable = c("Known clinical stage (N, %)", rownames(data.df)),
        N = c(sum(data.df), rep("", nrow(data.df))),
        `Summary stats` = c(
            "",
            paste(
                rowSums(data.df), " ", "(", round(rowSums(data.df) / sum(data.df) * 100, 1), "%", ")",
                sep = ""
            )
        ),
        N1 = c(res$n[[1]][1], rep("", nrow(data.df))),
        `N1 summary stats` = c("", sapply(res$stats[[1]], function(x) {
            x[1, 1]
        })),
        N2 = c(res$n[[1]][2], rep("", nrow(data.df))),
        `N2 summary stats` = c("", sapply(res$stats[[1]], function(x) {
            x[1, 2]
        })),
        OR = c("", sapply(res$or[[1]], function(x) {
            x[1]
        })),
        P = c("", sapply(res$p[[1]], function(x) {
            x[1]
        })),
        check.names = FALSE
    )
)

### Number of detected alterations

# create data for testing
test.data <- rbind(
    merge(
        patient.df[
            !(patient.df$DNA_SNV %in% c("Fail", "Not Performed") |
                patient.df$DNA_CNV %in% c("Fail", "Not Performed") |
                patient.df$RNA_Rearrangements %in% c("Fail", "Not Performed")),
            c("de_id", "histology_group")
        ],
        ddply(variant.df, .(de_id), summarize, var_count = length(de_id)),
        by = "de_id"
    ),
    data.frame(
        patient.df[
            !(patient.df$de_id %in% variant.df$de_id) &
                !(patient.df$DNA_SNV %in% c("Fail", "Not Performed") |
                    patient.df$DNA_CNV %in% c("Fail", "Not Performed") |
                    patient.df$RNA_Rearrangements %in% c("Fail", "Not Performed")),
            c("de_id", "histology_group")
        ],
        var_count = 0
    )
)

# perform testing
var <- "var_count"
res <- quantStratTest(
    var = var, strata = "histology_group", ref = "Adenocarcinoma", data.df = test.data
)

# add results to result data.frame
results <- rbind(
    results,
    data.frame(
        Variable = "Number of detected alterations (Mean±SD)",
        N = nrow(test.data[!is.na(test.data[, var]), ]),
        `Summary stats` = paste(
            round(mean(na.omit(test.data[, var])), 1),
            round(sd(na.omit(test.data[, var])), 1),
            sep = "±"
        ),
        N1 = res$n[[1]][1],
        `N1 summary stats` = res$stats[[1]][1],
        N2 = res$n[[1]][2],
        `N2 summary stats` = res$stats[[1]][2],
        OR = res$beta[[1]],
        P = res$p[[1]],
        check.names = FALSE
    )
)

### Tier 1 or 2 genomic alterations

# create data for testing
test.data <- data.frame(
    patient.df,
    `Guideline indicated` = ifelse(
        patient.df$de_id %in% therapy.df$de_id[
            grepl("Guideline", therapy.df$amp_cap_asco_tier) &
                therapy.df$marker_component_marker_type %in% c("SNV", "CNV", "Fusion")
        ], 0, 1
    ),
    `Clinical trial or therapy in other tumor type` = ifelse(
        patient.df$de_id %in% therapy.df$de_id[
            grepl("Clinical", therapy.df$amp_cap_asco_tier) &
                therapy.df$marker_component_marker_type %in% c("SNV", "CNV", "Fusion")
        ], 0, 1
    ),
    `Neither above, but known pathogenic` = ifelse(
        !(patient.df$de_id %in% therapy.df$de_id[
            therapy.df$marker_component_marker_type %in% c("SNV", "CNV", "Fusion")
        ]), 0, 1
    ),
    check.names = FALSE
)

# perform testing
var <- c(
    "Guideline indicated",
    "Clinical trial or therapy in other tumor type",
    "Neither above, but known pathogenic"
)
res <- list()
for (i in var) {
    res[[i]] <- catStratTest(var = i, strata = "histology_group", data.df = test.data)
}

# add results to result data.frame
results <- rbind(
    results,
    data.frame(
        Variable = c("Genomic alterations with known or potential clinical significance (N, %)", var),
        N = c(nrow(test.data), rep("", length(var))),
        `Summary stats` = c(
            "",
            paste(
                colSums(test.data[, var] == 0 & !is.na(test.data[, var])), " ", "(",
                round(
                    colSums(test.data[, var] == 0 & !is.na(test.data[, var])) /
                        nrow(test.data) * 100, 1
                ), "%", ")",
                sep = ""
            )
        ),
        N1 = c(sapply(res, function(x) {
            x$n[[1]][1]
        })[1], rep("", length(var))),
        `N1 summary stats` = c("", sapply(res, function(x) {
            x$stats[[1]][[1]][1, 1]
        })),
        N2 = c(sapply(res, function(x) {
            x$n[[1]][2]
        })[1], rep("", length(var))),
        `N2 summary stats` = c("", sapply(res, function(x) {
            x$stats[[1]][[1]][1, 2]
        })),
        OR = c("", sapply(res, function(x) {
            x$or[[1]][[2]][1]
        })),
        P = c("", sapply(res, function(x) {
            x$p[[1]][[2]][1]
        })),
        check.names = FALSE
    )
)

### TMB score (mut/Mb)

# perform testing
var <- "TMB"
res <- quantStratTest(
    var = var, strata = "histology_group", ref = "Adenocarcinoma", data.df = patient.df
)

# add results to result data.frame
results <- rbind(
    results,
    data.frame(
        Variable = "TMB (Mut/Mb) (Mean±SD)",
        N = nrow(patient.df[!is.na(patient.df[, var]), ]),
        `Summary stats` = paste(
            round(mean(na.omit(patient.df[, var])), 1),
            round(sd(na.omit(patient.df[, var])), 1),
            sep = "±"
        ),
        N1 = res$n[[1]][1],
        `N1 summary stats` = res$stats[[1]][1],
        N2 = res$n[[1]][2],
        `N2 summary stats` = res$stats[[1]][2],
        OR = res$beta[[1]],
        P = res$p[[1]],
        check.names = FALSE
    )
)

### TMB very high, high, not high

# create data for testing
total.patients <- length(patient.df[!is.na(patient.df$TMB_interp), "de_id"])
test.data <- data.frame(
    patient.df,
    `Very high (≥20)` = ifelse(
        patient.df$de_id %in% patient.df$de_id[patient.df$TMB_interp == "Very high (≥20)"], 0,
        ifelse(
            !(patient.df$de_id %in% patient.df$de_id) |
                patient.df$de_id %in% patient.df$de_id[is.na(patient.df$TMB_interp)], NA, 1
        )
    ),
    `High (10-19)` = ifelse(
        patient.df$de_id %in% patient.df$de_id[patient.df$TMB_interp == "High (10-19)"], 0,
        ifelse(
            !(patient.df$de_id %in% patient.df$de_id) |
                patient.df$de_id %in% patient.df$de_id[is.na(patient.df$TMB_interp)], NA, 1
        )
    ),
    `Not high (<10)` = ifelse(
        patient.df$de_id %in% patient.df$de_id[patient.df$TMB_interp == "Not high (<10)"], 0,
        ifelse(
            !(patient.df$de_id %in% patient.df$de_id) |
                patient.df$de_id %in% patient.df$de_id[is.na(patient.df$TMB_interp)], NA, 1
        )
    ),
    check.names = FALSE
)

# perform testing
var <- c("Very high (≥20)", "High (10-19)", "Not high (<10)")
res <- list()
for (i in var) {
    res[[i]] <- catStratTest(var = i, strata = "histology_group", data.df = test.data)
}

# add results to result data.frame
results <- rbind(
    results,
    data.frame(
        Variable = c("TMB level (N, %)", var),
        N = c(total.patients, rep("", length(var))),
        `Summary stats` = c(
            "",
            paste(
                colSums(test.data[, var] == 0 & !is.na(test.data[, var])), " ", "(",
                round(
                    colSums(test.data[, var] == 0 & !is.na(test.data[, var])) / total.patients * 100, 1
                ), "%", ")",
                sep = ""
            )
        ),
        N1 = c(sapply(res, function(x) {
            x$n[[1]][1]
        })[1], rep("", length(var))),
        `N1 summary stats` = c("", sapply(res, function(x) {
            x$stats[[1]][[1]][1, 1]
        })),
        N2 = c(sapply(res, function(x) {
            x$n[[1]][2]
        })[1], rep("", length(var))),
        `N2 summary stats` = c("", sapply(res, function(x) {
            x$stats[[1]][[1]][1, 2]
        })),
        OR = c("", sapply(res, function(x) {
            x$or[[1]][[2]][1]
        })),
        P = c("", sapply(res, function(x) {
            x$p[[1]][[2]][1]
        })),
        check.names = FALSE
    )
)

### PD-L1 score (% expressing cells)

# perform testing
var <- "PD_L1_IHC_22C3_result"
res <- quantStratTest(
    var = var, strata = "histology_group", ref = "Adenocarcinoma", data.df = patient.df
)

# add results to result data.frame
results <- rbind(
    results,
    data.frame(
        Variable = "PD-L1 22C3 TPS (Mean±SD)",
        N = nrow(patient.df[!is.na(patient.df[, var]), ]),
        `Summary stats` = paste(
            round(mean(na.omit(patient.df[, var])), 1),
            round(sd(na.omit(patient.df[, var])), 1),
            sep = "±"
        ),
        N1 = res$n[[1]][1],
        `N1 summary stats` = res$stats[[1]][1],
        N2 = res$n[[1]][2],
        `N2 summary stats` = res$stats[[1]][2],
        OR = res$beta[[1]],
        P = res$p[[1]],
        check.names = FALSE
    )
)

### PD-L1 high, low, negative

# create data for testing
total.patients <- length(patient.df[!is.na(patient.df$PD_L1_IHC_22C3_interp), "de_id"])
test.data <- data.frame(
    patient.df,
    `High (≥50%)` = ifelse(
        patient.df$de_id %in%
            patient.df$de_id[patient.df$PD_L1_IHC_22C3_interp == "High (≥50%)"], 0,
        ifelse(
            !(patient.df$de_id %in% patient.df$de_id) |
                patient.df$de_id %in%
                    patient.df$de_id[is.na(patient.df$PD_L1_IHC_22C3_interp)], NA, 1
        )
    ),
    `Low (1-49%)` = ifelse(
        patient.df$de_id %in%
            patient.df$de_id[patient.df$PD_L1_IHC_22C3_interp == "Low (1-49%)"], 0,
        ifelse(
            !(patient.df$de_id %in% patient.df$de_id) |
                patient.df$de_id %in%
                    patient.df$de_id[is.na(patient.df$PD_L1_IHC_22C3_interp)], NA, 1
        )
    ),
    `Negative (<1%)` = ifelse(
        patient.df$de_id %in%
            patient.df$de_id[patient.df$PD_L1_IHC_22C3_interp == "Negative (<1%)"], 0,
        ifelse(
            !(patient.df$de_id %in% patient.df$de_id) |
                patient.df$de_id %in%
                    patient.df$de_id[is.na(patient.df$PD_L1_IHC_22C3_interp)], NA, 1
        )
    ),
    check.names = FALSE
)

# perform testing
var <- c("High (≥50%)", "Low (1-49%)", "Negative (<1%)")
res <- list()
for (i in var) {
    res[[i]] <- catStratTest(var = i, strata = "histology_group", data.df = test.data)
}

# add results to result data.frame
results <- rbind(
    results,
    data.frame(
        Variable = c("PD-L1 level (N, %)", var),
        N = c(total.patients, rep("", length(var))),
        `Summary stats` = c(
            "",
            paste(
                colSums(test.data[, var] == 0 & !is.na(test.data[, var])), " ", "(",
                round(
                    colSums(test.data[, var] == 0 & !is.na(test.data[, var])) / total.patients * 100, 1
                ), "%", ")",
                sep = ""
            )
        ),
        N1 = c(sapply(res, function(x) {
            x$n[[1]][1]
        })[1], rep("", length(var))),
        `N1 summary stats` = c("", sapply(res, function(x) {
            x$stats[[1]][[1]][1, 1]
        })),
        N2 = c(sapply(res, function(x) {
            x$n[[1]][2]
        })[1], rep("", length(var))),
        `N2 summary stats` = c("", sapply(res, function(x) {
            x$stats[[1]][[1]][1, 2]
        })),
        OR = c("", sapply(res, function(x) {
            x$or[[1]][[2]][1]
        })),
        P = c("", sapply(res, function(x) {
            x$p[[1]][[2]][1]
        })),
        check.names = FALSE
    )
)

### MSI level high or stable

# perform testing
var <- "MSI"
res <- catStratTest(var = var, strata = "histology_group", data.df = patient.df)

# get totals for all patients
data.df <- table(patient.df[, var], rep("0", nrow(patient.df)))

# add results to result data.frame
results <- rbind(
    results,
    data.frame(
        Variable = c("MSI level (N, %)", rownames(data.df)),
        N = c(sum(data.df), rep("", nrow(data.df))),
        `Summary stats` = c(
            "",
            paste(
                rowSums(data.df), " ", "(", round(rowSums(data.df) / sum(data.df) * 100, 1), "%", ")",
                sep = ""
            )
        ),
        N1 = c(res$n[[1]][1], rep("", nrow(data.df))),
        `N1 summary stats` = c("", res$stats[[1]][[1]][, 1]),
        N2 = c(res$n[[1]][2], rep("", nrow(data.df))),
        `N2 summary stats` = c("", res$stats[[1]][[1]][, 2]),
        OR = c("", "", res$or[[1]][[2]][1]),
        P = c("", "", res$p[[1]][[2]][1]),
        check.names = FALSE
    )
)

### write out results

# create workbook
wb <- createWorkbook()

# add worksheet, write data, and format output
addWorksheet(wb, "Patient characteristics")

writeData(wb, "Patient characteristics", results, keepNA = TRUE, colNames = FALSE)

setColWidths(
    wb, "Patient characteristics",
    cols = seq_len(ncol(results)),
    widths = c(41, 5, 15, rep(c(5, 14), 2), 19.5, 11)
)

addStyle(
    wb, "Patient characteristics",
    cols = seq_len(ncol(results)), rows = 1:(nrow(results) + 1),
    gridExpand = TRUE, style = left_just, stack = TRUE
)

addStyle(
    wb, "Patient characteristics",
    cols = 1,
    rows = c(
        which(results$Variable == "Female"):which(results$Variable == "Male"),
        which(results$Variable == "≤40"):which(results$Variable == ">90"),
        which(results$Variable == "Adenocarcinoma"):which(results$Variable == "Squamous cell carcinoma"),
        which(results$Variable == "Primary"):which(results$Variable == "Metastatic"),
        which(results$Variable == "Stage I"):which(results$Variable == "Stage IV"),
        which(results$Variable == "Guideline indicated"):which(results$Variable == "Neither above, but known pathogenic"),
        which(results$Variable == "Very high (≥20)"):which(results$Variable == "Not high (<10)"),
        which(results$Variable == "High (≥50%)"):which(results$Variable == "Negative (<1%)"),
        which(results$Variable == "MSI High"):which(results$Variable == "Stable")
    ),
    gridExpand = TRUE, style = center, stack = TRUE
)

mergeCells(wb, "Patient characteristics", cols = c(2, 3), rows = 1)
mergeCells(wb, "Patient characteristics", cols = c(4, 5), rows = 1)
mergeCells(wb, "Patient characteristics", cols = c(6, 7), rows = 1)

addStyle(
    wb, "Patient characteristics",
    cols = seq_len(ncol(results)), rows = 1:2,
    gridExpand = TRUE, style = bold, stack = TRUE
)

addStyle(
    wb, "Patient characteristics",
    cols = seq_len(ncol(results)),
    rows = c(1, 3, (nrow(results) + 1)), gridExpand = TRUE,
    style = horizontal_border_med, stack = TRUE
)

addStyle(
    wb, "Patient characteristics",
    cols = seq_len(ncol(results)),
    rows = c(
        grep("Gender", results$Variable),
        grep("Age \\(", results$Variable),
        grep("Age group", results$Variable),
        grep("Histology", results$Variable),
        grep("Unknown", results$Variable),
        grep("Known", results$Variable)[2],
        grep("Tissue", results$Variable),
        grep("Number", results$Variable),
        grep("Genomic", results$Variable),
        grep("TMB", results$Variable),
        grep("PD-L1", results$Variable),
        grep("MSI level", results$Variable)
    ),
    gridExpand = TRUE, style = horizontal_border_thin, stack = TRUE
)

# convert numbers from strings back to numbers
convertNum(results, wb, "Patient characteristics", FALSE)

# save workbook
saveWorkbook(wb, "Output/Tables/Patient_characteristics.xlsx", overwrite = TRUE)

2.3 Genomic alteration characterization

2.3.1 Genomic alteration prevalence: Gene-level

# create result data.frame
results <- data.frame()
results <- rbind(
    results,
    data.frame(
        Variable = "",
        N = "All patients", Frequency = "",
        N1 = "Adenocarcinoma", `N1 summary stats` = "",
        N2 = "Squamous cell carcinoma", `N2 summary stats` = "", OR = "", P = "",
        check.names = FALSE
    )
)
results <- rbind(
    results,
    data.frame(
        Variable = "Gene",
        N = "N", Frequency = "Summary stats",
        N1 = "N", `N1 summary stats` = "Summary stats",
        N2 = "N", `N2 summary stats` = "Summary stats", OR = "OR", P = "FDR q-value",
        check.names = FALSE
    )
)

# get total number of patients
results <- rbind(
    results,
    data.frame(
        Variable = "Total number of patients",
        N = nrow(patient.df), Frequency = "-",
        N1 = table(patient.df$histology_group)[1], `N1 summary stats` = "-",
        N2 = table(patient.df$histology_group)[2], `N2 summary stats` = "-", OR = "-", P = "-",
        check.names = FALSE
    )
)

### SNVs

# create data for testing
total.patients <- sum(!(patient.df$DNA_SNV %in% c("Fail", "Not Performed")))
var <- c()
test.data <- patient.df
ref.data <- variant.df[variant.df$variant_type == "SNV", ]
for (i in unique(ref.data$gene)) {
    test.data$gene <- ifelse(
        patient.df$de_id %in% ref.data$de_id[ref.data$gene == i], 0,
        ifelse(
            !(patient.df$DNA_SNV %in% c("Fail", "Not Performed")), 1, NA
        )
    )
    if (sum(na.omit(test.data$gene == 0)) / total.patients >= 0.001) {
        var <- append(var, i)
        colnames(test.data)[ncol(test.data)] <- i
    } else {
        test.data <- test.data[, -ncol(test.data)]
    }
}
var <- colnames(t(memoSort(t(abs(test.data[, var] - 1)))))
test.data <- test.data[, c(colnames(patient.df), var)]

# perform testing
res <- list()
for (i in var) {
    res[[i]] <- catStratTest(var = i, strata = "histology_group", data.df = test.data)
}

# add results to result data.frame
results <- rbind(
    results,
    data.frame(
        Variable = c("Single nucleotide variants (N, %)", var),
        N = c(total.patients, rep("", length(var))),
        Frequency = c(
            "",
            paste(
                colSums(test.data[, var] == 0 & !is.na(test.data[, var])), " ", "(",
                round(
                    colSums(test.data[, var] == 0 & !is.na(test.data[, var])) / total.patients * 100, 1
                ), "%", ")",
                sep = ""
            )
        ),
        N1 = c(sapply(res, function(x) {
            x$n[[1]][1]
        })[1], rep("", length(var))),
        `N1 summary stats` = c("", sapply(res, function(x) {
            x$stats[[1]][[1]][1, 1]
        })),
        N2 = c(sapply(res, function(x) {
            x$n[[1]][2]
        })[1], rep("", length(var))),
        `N2 summary stats` = c("", sapply(res, function(x) {
            x$stats[[1]][[1]][1, 2]
        })),
        OR = c("", sapply(res, function(x) {
            x$or[[1]][[2]][1]
        })),
        P = c("", sapply(res, function(x) {
            x$p[[1]][[2]][1]
        })),
        check.names = FALSE
    )
)

### CNVs

# create data for testing
total.patients <- sum(!(patient.df$DNA_CNV %in% c("Fail", "Not Performed")))
var <- c()
test.data <- patient.df
ref.data <- variant.df[variant.df$variant_type == "CNV", ]
for (i in unique(ref.data$gene)) {
    test.data$gene <- ifelse(
        patient.df$de_id %in% ref.data$de_id[ref.data$gene == i], 0,
        ifelse(
            !(patient.df$DNA_CNV %in% c("Fail", "Not Performed")), 1, NA
        )
    )
    if (sum(na.omit(test.data$gene == 0)) / total.patients >= 0.001) {
        var <- append(var, i)
        colnames(test.data)[ncol(test.data)] <- i
    } else {
        test.data <- test.data[, -ncol(test.data)]
    }
}
var <- colnames(t(memoSort(t(abs(test.data[, var] - 1)))))
test.data <- test.data[, c(colnames(patient.df), var)]

# perform testing
res <- list()
for (i in var) {
    res[[i]] <- catStratTest(var = i, strata = "histology_group", data.df = test.data)
}

# add results to result data.frame
results <- rbind(
    results,
    data.frame(
        Variable = c("Copy number variants (N, %)", var),
        N = c(total.patients, rep("", length(var))),
        Frequency = c(
            "",
            paste(
                colSums(test.data[, var] == 0 & !is.na(test.data[, var])), " ", "(",
                round(
                    colSums(test.data[, var] == 0 & !is.na(test.data[, var])) / total.patients * 100, 1
                ), "%", ")",
                sep = ""
            )
        ),
        N1 = c(sapply(res, function(x) {
            x$n[[1]][1]
        })[1], rep("", length(var))),
        `N1 summary stats` = c("", sapply(res, function(x) {
            x$stats[[1]][[1]][1, 1]
        })),
        N2 = c(sapply(res, function(x) {
            x$n[[1]][2]
        })[1], rep("", length(var))),
        `N2 summary stats` = c("", sapply(res, function(x) {
            x$stats[[1]][[1]][1, 2]
        })),
        OR = c("", sapply(res, function(x) {
            x$or[[1]][[2]][1]
        })),
        P = c("", sapply(res, function(x) {
            x$p[[1]][[2]][1]
        })),
        check.names = FALSE
    )
)

### Fusions

# create data for testing
total.patients <- sum(!(patient.df$RNA_Rearrangements %in% c("Fail", "Not Performed")))
var <- c()
test.data <- patient.df
ref.data <- variant.df[variant.df$variant_type == "Fusion", ]
ref.data$gene[
    grep("Skip", ref.data$variant)
] <- grep("Skip", ref.data$variant, value = TRUE)
for (i in unique(ref.data$gene)) {
    test.data$gene <- ifelse(
        patient.df$de_id %in% ref.data$de_id[ref.data$gene == i], 0,
        ifelse(
            !(patient.df$RNA_Rearrangements %in% c("Fail", "Not Performed")), 1, NA
        )
    )
    if (sum(na.omit(test.data$gene == 0)) / total.patients >= 0.001) {
        var <- append(var, i)
        colnames(test.data)[ncol(test.data)] <- i
    } else {
        test.data <- test.data[, -ncol(test.data)]
    }
}
var <- colnames(t(memoSort(t(abs(test.data[, var] - 1)))))
test.data <- test.data[, c(colnames(patient.df), var)]

# perform testing
res <- list()
for (i in var) {
    res[[i]] <- catStratTest(var = i, strata = "histology_group", data.df = test.data)
}

# add results to result data.frame
results <- rbind(
    results,
    data.frame(
        Variable = c("Fusions/Skipping Variants (N, %)", var),
        N = c(total.patients, rep("", length(var))),
        Frequency = c(
            "",
            paste(
                colSums(test.data[, var] == 0 & !is.na(test.data[, var])), " ", "(",
                round(
                    colSums(test.data[, var] == 0 & !is.na(test.data[, var])) / total.patients * 100, 1
                ), "%", ")",
                sep = ""
            )
        ),
        N1 = c(sapply(res, function(x) {
            x$n[[1]][1]
        })[1], rep("", length(var))),
        `N1 summary stats` = c("", sapply(res, function(x) {
            x$stats[[1]][[1]][1, 1]
        })),
        N2 = c(sapply(res, function(x) {
            x$n[[1]][2]
        })[1], rep("", length(var))),
        `N2 summary stats` = c("", sapply(res, function(x) {
            x$stats[[1]][[1]][1, 2]
        })),
        OR = c("", sapply(res, function(x) {
            x$or[[1]][[2]][1]
        })),
        P = c("", sapply(res, function(x) {
            x$p[[1]][[2]][1]
        })),
        check.names = FALSE
    )
)

### multiple testing correction

# adjust p-values for multiple testing
for (i in grep("P", colnames(results))) {
    results[!(results[, i] %in% c("FDR q-value", "-", "")), i] <- formatC(
        p.adjust(
            as.numeric(results[!(results[, i] %in% c("FDR q-value", "-", "")), i]),
            method = "BH"
        ),
        digits = 1, format = "e"
    )
}

### write out results

# create workbook
wb <- createWorkbook()

# add worksheet, write data, and format output
addWorksheet(wb, "Alteration prevalence")

writeData(wb, "Alteration prevalence", results, keepNA = TRUE, colNames = FALSE)

setColWidths(
    wb, "Alteration prevalence",
    cols = seq_len(ncol(results)),
    widths = c(33, 5, 15, rep(c(5, 14), 2), 19.5, 11)
)

addStyle(
    wb, "Alteration prevalence",
    cols = seq_len(ncol(results)), rows = 1:(nrow(results) + 1),
    gridExpand = TRUE, style = left_just, stack = TRUE
)

addStyle(
    wb, "Alteration prevalence",
    cols = 1,
    rows = c(
        (grep("^Single", results$Variable) + 1):(grep("^Copy", results$Variable) - 1),
        (grep("^Copy", results$Variable) + 1):(grep("^Fusions", results$Variable) - 1),
        (grep("^Fusions", results$Variable) + 1):nrow(results)
    ),
    gridExpand = TRUE, style = center, stack = TRUE
)

mergeCells(wb, "Alteration prevalence", cols = c(2, 3), rows = 1)
mergeCells(wb, "Alteration prevalence", cols = c(4, 5), rows = 1)
mergeCells(wb, "Alteration prevalence", cols = c(6, 7), rows = 1)

addStyle(
    wb, "Alteration prevalence",
    cols = seq_len(ncol(results)), rows = 1:2,
    gridExpand = TRUE, style = bold, stack = TRUE
)

addStyle(
    wb, "Alteration prevalence",
    cols = seq_len(ncol(results)),
    rows = c(1, 3, (nrow(results) + 1)), gridExpand = TRUE,
    style = horizontal_border_med, stack = TRUE
)

addStyle(
    wb, "Alteration prevalence",
    cols = seq_len(ncol(results)),
    rows = c(
        grep("^Single", results$Variable),
        grep("^Copy", results$Variable),
        grep("^Fusions", results$Variable)
    ),
    gridExpand = TRUE, style = horizontal_border_thin, stack = TRUE
)

# convert numbers from strings back to numbers
convertNum(results, wb, "Alteration prevalence", FALSE)

# save workbook and results
saveWorkbook(
    wb, "Output/Tables/Patient_alteration_prevalence_by_gene.xlsx",
    overwrite = TRUE
)
gene.results <- results

2.3.2 Genomic alteration prevalence: Variant-level

# create result data.frame
results <- data.frame()
results <- rbind(
    results,
    data.frame(
        Variable = "",
        Type = "",
        N = "All patients", Frequency = "",
        N1 = "Adenocarcinoma", `N1 summary stats` = "",
        N2 = "Squamous cell carcinoma", `N2 summary stats` = "", OR = "", P = "",
        check.names = FALSE
    )
)
results <- rbind(
    results,
    data.frame(
        Variable = "Variant",
        Type = "SNV type",
        N = "N", Frequency = "Summary stats",
        N1 = "N", `N1 summary stats` = "Summary stats",
        N2 = "N", `N2 summary stats` = "Summary stats", OR = "OR", P = "FDR q-value",
        check.names = FALSE
    )
)

# get total number of patients
results <- rbind(
    results,
    data.frame(
        Variable = "Total number of patients",
        Type = "",
        N = nrow(patient.df), Frequency = "-",
        N1 = table(patient.df$histology_group)[1], `N1 summary stats` = "-",
        N2 = table(patient.df$histology_group)[2], `N2 summary stats` = "-", OR = "-", P = "-",
        check.names = FALSE
    )
)

### SNVs

# create data for testing
total.patients <- sum(!(patient.df$DNA_SNV %in% c("Fail", "Not Performed")))
var <- c()
test.data <- patient.df
ref.data <- variant.df[variant.df$variant_type == "SNV", ]
for (i in unique(ref.data$variant_go_name)) {
    test.data$gene <- ifelse(
        patient.df$de_id %in% ref.data$de_id[ref.data$variant_go_name == i], 0,
        ifelse(
            !(patient.df$DNA_SNV %in% c("Fail", "Not Performed")), 1, NA
        )
    )
    if (sum(na.omit(test.data$gene == 0)) / total.patients >= 0.001) {
        var <- append(var, i)
        colnames(test.data)[ncol(test.data)] <- i
    } else {
        test.data <- test.data[, -ncol(test.data)]
    }
}
var <- colnames(t(memoSort(t(abs(test.data[, var] - 1)))))
snv.type <- variant.df$snv_type
names(snv.type) <- variant.df$variant_go_name
test.data <- test.data[, c(colnames(patient.df), var)]

# perform testing
res <- list()
for (i in var) {
    res[[i]] <- catStratTest(var = i, strata = "histology_group", data.df = test.data)
}

# add results to result data.frame
results <- rbind(
    results,
    data.frame(
        Variable = c("Single nucleotide variants (N, %)", var),
        Type = c("", snv.type[var]),
        N = c(total.patients, rep("", length(var))),
        Frequency = c(
            "",
            paste(
                colSums(test.data[, var] == 0 & !is.na(test.data[, var])), " ", "(",
                round(
                    colSums(test.data[, var] == 0 & !is.na(test.data[, var])) / total.patients * 100, 1
                ), "%", ")",
                sep = ""
            )
        ),
        N1 = c(sapply(res, function(x) {
            x$n[[1]][1]
        })[1], rep("", length(var))),
        `N1 summary stats` = c("", sapply(res, function(x) {
            x$stats[[1]][[1]][1, 1]
        })),
        N2 = c(sapply(res, function(x) {
            x$n[[1]][2]
        })[1], rep("", length(var))),
        `N2 summary stats` = c("", sapply(res, function(x) {
            x$stats[[1]][[1]][1, 2]
        })),
        OR = c("", sapply(res, function(x) {
            x$or[[1]][[2]][1]
        })),
        P = c("", sapply(res, function(x) {
            x$p[[1]][[2]][1]
        })),
        check.names = FALSE
    )
)

### CNVs

# create data for testing
total.patients <- sum(!(patient.df$DNA_CNV %in% c("Fail", "Not Performed")))
var <- c()
test.data <- patient.df
ref.data <- variant.df[variant.df$variant_type == "CNV", ]
for (i in unique(ref.data$variant_go_name)) {
    test.data$gene <- ifelse(
        patient.df$de_id %in% ref.data$de_id[ref.data$variant_go_name == i], 0,
        ifelse(
            !(patient.df$DNA_CNV %in% c("Fail", "Not Performed")), 1, NA
        )
    )
    if (sum(na.omit(test.data$gene == 0)) / total.patients >= 0.001) {
        var <- append(var, i)
        colnames(test.data)[ncol(test.data)] <- i
    } else {
        test.data <- test.data[, -ncol(test.data)]
    }
}
var <- colnames(t(memoSort(t(abs(test.data[, var] - 1)))))
test.data <- test.data[, c(colnames(patient.df), var)]

# perform testing
res <- list()
for (i in var) {
    res[[i]] <- catStratTest(var = i, strata = "histology_group", data.df = test.data)
}

# add results to result data.frame
results <- rbind(
    results,
    data.frame(
        Variable = c("Copy number variants (N, %)", var),
        Type = "",
        N = c(total.patients, rep("", length(var))),
        Frequency = c(
            "",
            paste(
                colSums(test.data[, var] == 0 & !is.na(test.data[, var])), " ", "(",
                round(
                    colSums(test.data[, var] == 0 & !is.na(test.data[, var])) / total.patients * 100, 1
                ), "%", ")",
                sep = ""
            )
        ),
        N1 = c(sapply(res, function(x) {
            x$n[[1]][1]
        })[1], rep("", length(var))),
        `N1 summary stats` = c("", sapply(res, function(x) {
            x$stats[[1]][[1]][1, 1]
        })),
        N2 = c(sapply(res, function(x) {
            x$n[[1]][2]
        })[1], rep("", length(var))),
        `N2 summary stats` = c("", sapply(res, function(x) {
            x$stats[[1]][[1]][1, 2]
        })),
        OR = c("", sapply(res, function(x) {
            x$or[[1]][[2]][1]
        })),
        P = c("", sapply(res, function(x) {
            x$p[[1]][[2]][1]
        })),
        check.names = FALSE
    )
)

### Fusions

# create data for testing
total.patients <- sum(!(patient.df$RNA_Rearrangements %in% c("Fail", "Not Performed")))
var <- c()
test.data <- patient.df
ref.data <- variant.df[variant.df$variant_type == "Fusion", ]
ref.data$variant_go_name[
    grep("Skip", ref.data$variant)
] <- grep("Skip", ref.data$variant, value = TRUE)
for (i in unique(ref.data$variant_go_name)) {
    test.data$gene <- ifelse(
        patient.df$de_id %in% ref.data$de_id[ref.data$variant_go_name == i], 0,
        ifelse(
            !(patient.df$RNA_Rearrangements %in% c("Fail", "Not Performed")), 1, NA
        )
    )
    if (sum(na.omit(test.data$gene == 0)) / total.patients >= 0.001) {
        var <- append(var, i)
        colnames(test.data)[ncol(test.data)] <- i
    } else {
        test.data <- test.data[, -ncol(test.data)]
    }
}
var <- colnames(t(memoSort(t(abs(test.data[, var] - 1)))))
test.data <- test.data[, c(colnames(patient.df), var)]

# perform testing
res <- list()
for (i in var) {
    res[[i]] <- catStratTest(var = i, strata = "histology_group", data.df = test.data)
}

# add results to result data.frame
results <- rbind(
    results,
    data.frame(
        Variable = c("Fusions/Skipping Variants (N, %)", var),
        Type = "",
        N = c(total.patients, rep("", length(var))),
        Frequency = c(
            "",
            paste(
                colSums(test.data[, var] == 0 & !is.na(test.data[, var])), " ", "(",
                round(
                    colSums(test.data[, var] == 0 & !is.na(test.data[, var])) / total.patients * 100, 1
                ), "%", ")",
                sep = ""
            )
        ),
        N1 = c(sapply(res, function(x) {
            x$n[[1]][1]
        })[1], rep("", length(var))),
        `N1 summary stats` = c("", sapply(res, function(x) {
            x$stats[[1]][[1]][1, 1]
        })),
        N2 = c(sapply(res, function(x) {
            x$n[[1]][2]
        })[1], rep("", length(var))),
        `N2 summary stats` = c("", sapply(res, function(x) {
            x$stats[[1]][[1]][1, 2]
        })),
        OR = c("", sapply(res, function(x) {
            x$or[[1]][[2]][1]
        })),
        P = c("", sapply(res, function(x) {
            x$p[[1]][[2]][1]
        })),
        check.names = FALSE
    )
)

### multiple testing correction

# adjust p-values for multiple testing
for (i in grep("P", colnames(results))) {
    results[!(results[, i] %in% c("FDR q-value", "-", "")), i] <- formatC(
        p.adjust(
            as.numeric(results[!(results[, i] %in% c("FDR q-value", "-", "")), i]),
            method = "BH"
        ),
        digits = 1, format = "e"
    )
}

### write out results

# create workbook
wb <- createWorkbook()

# add worksheet, write data, and format output
addWorksheet(wb, "Alteration prevalence")

writeData(wb, "Alteration prevalence", results, keepNA = TRUE, colNames = FALSE)

setColWidths(
    wb, "Alteration prevalence",
    cols = seq_len(ncol(results)),
    widths = c(33, 12, 5, 15, rep(c(5, 14), 2), 19.5, 11)
)

addStyle(
    wb, "Alteration prevalence",
    cols = seq_len(ncol(results)), rows = 1:(nrow(results) + 1),
    gridExpand = TRUE, style = left_just, stack = TRUE
)

addStyle(
    wb, "Alteration prevalence",
    cols = 1,
    rows = c(
        (grep("^Single", results$Variable) + 1):(grep("^Copy", results$Variable) - 1),
        (grep("^Copy", results$Variable) + 1):(grep("^Fusions", results$Variable) - 1),
        (grep("^Fusions", results$Variable) + 1):nrow(results)
    ),
    gridExpand = TRUE, style = center, stack = TRUE
)

mergeCells(wb, "Alteration prevalence", cols = c(3, 4), rows = 1)
mergeCells(wb, "Alteration prevalence", cols = c(5, 6), rows = 1)
mergeCells(wb, "Alteration prevalence", cols = c(7, 8), rows = 1)

addStyle(
    wb, "Alteration prevalence",
    cols = seq_len(ncol(results)), rows = 1:2,
    gridExpand = TRUE, style = bold, stack = TRUE
)

addStyle(
    wb, "Alteration prevalence",
    cols = seq_len(ncol(results)),
    rows = c(1, 3, (nrow(results) + 1)), gridExpand = TRUE,
    style = horizontal_border_med, stack = TRUE
)

addStyle(
    wb, "Alteration prevalence",
    cols = seq_len(ncol(results)),
    rows = c(
        grep("^Single", results$Variable),
        grep("^Copy", results$Variable),
        grep("^Fusions", results$Variable)
    ),
    gridExpand = TRUE, style = horizontal_border_thin, stack = TRUE
)

# convert numbers from strings back to numbers
convertNum(results, wb, "Alteration prevalence", FALSE)

# save workbook and results
saveWorkbook(
    wb, "Output/Tables/Patient_alteration_prevalence_by_variant.xlsx",
    overwrite = TRUE
)
var.results <- results

2.3.3 Visualization of genomic alterations and alteration prevalence

# extract data needed for alteration matrix, isolating unique instances
totals.list <- c()
totals.list <- append(totals.list, nrow(patient.df))
plot.data <- unique(variant.df[, c("de_id", "snv_type", "gene")])

# extract data needed for stacked bar plotting, isolating unique instances and
# calculate number of alterations per gene
plot.data <- ddply(
    unique(variant.df[, c("de_id", "variant_type", "snv_type", "gene")]),
    .(de_id, gene, variant_type), summarize,
    snv_type = snv_type[1]
)
plot.data <- ddply(plot.data, .(gene, snv_type), summarize, var_count = length(de_id))
plot.data$snv_type <- factor(
    plot.data$snv_type,
    levels = c("Substitution", "Indel", "Splice site", "Other SNV", "Fusion", "CNV")
)

# convert alteration counts to frequencies using appropriate denominators
for (j in names(table(plot.data$snv_type))) {
    total.patients <- ifelse(
        j == "CNV", sum(!(patient.df$DNA_CNV %in% c("Fail", "Not Performed"))),
        ifelse(
            j == "Fusion", sum(!(patient.df$RNA_Rearrangement %in% c("Fail", "Not Performed"))),
            sum(!(patient.df$DNA_SNV %in% c("Fail", "Not Performed")))
        )
    )
    plot.data$var_freq[plot.data$snv_type == j] <-
        plot.data$var_count[plot.data$snv_type == j] / total.patients
    totals.list <- append(totals.list, total.patients)
}

# calculate combined frequencies of alterations per gene and sort
# from highest to lowest
gene.freq <- ddply(
    plot.data, .(gene), summarize,
    freq = sum(var_freq),
    perc = paste(round(sum(var_freq), 4) * 100, "%", sep = "")
)
gene.freq <- gene.freq[order(gene.freq$freq, decreasing = TRUE), ]
plot.data$gene <- factor(plot.data$gene, levels = gene.freq$gene)

# create color vector to color genes that have tier 1 or 2 alterations
gene.col <- ifelse(
    levels(plot.data$gene) %in%
        therapy.df$marker_component_marker[grep("Guideline", therapy.df$amp_cap_asco_tier)],
    "red",
    ifelse(
        levels(plot.data$gene) %in%
            therapy.df$marker_component_marker[grep("Clinical", therapy.df$amp_cap_asco_tier)],
        "orange", "black"
    )
)
names(gene.col) <- levels(plot.data$gene)

# create color vector for filling in plots based on alteration type
col <- RColorBrewer::brewer.pal(12, name = "Paired")[c(1:4, 9:10)]
names(col) <- levels(plot.data$snv_type)

# tag genes with alterations that have different frequencies between histology
gene.freq$sub_diff[
    gene.freq$gene %in%
        sapply(
            strsplit(var.results$Variable[
                var.results$Type == "Substitution" &
                    as.numeric(var.results$P) < 0.05
            ], " "),
            function(x) {
                x[1]
            }
        )
] <- col["Substitution"]
gene.freq$indel_diff[
    gene.freq$gene %in%
        sapply(
            strsplit(var.results$Variable[
                var.results$Type == "Indel" &
                    as.numeric(var.results$P) < 0.05
            ], " "),
            function(x) {
                x[1]
            }
        )
] <- col["Indel"]
gene.freq$splice_diff[
    gene.freq$gene %in%
        sapply(
            strsplit(var.results$Variable[
                var.results$Type == "Splice site" &
                    as.numeric(var.results$P) < 0.05
            ], " "),
            function(x) {
                x[1]
            }
        )
] <- col["Splice site"]
gene.freq$other_diff[
    gene.freq$gene %in%
        sapply(
            strsplit(var.results$Variable[
                var.results$Type == "Other SNV" &
                    as.numeric(var.results$P) < 0.05
            ], " "),
            function(x) {
                x[1]
            }
        )
] <- col["Other SNV"]
gene.freq$cnv_diff[
    gene.freq$gene %in%
        gene.results$Variable[
            (grep("^Copy", gene.results$Variable) + 1):(grep("^Fusions", gene.results$Variable) - 1)
        ][
            as.numeric(gene.results$P[
                (grep("^Copy", gene.results$Variable) + 1):(grep("^Fusions", gene.results$Variable) - 1)
            ]) < 0.05
        ]
] <- col["CNV"]
gene.freq$fusion_diff[
    gene.freq$gene %in%
        gsub(
            " Exon 14 Skipping", "",
            gene.results$Variable[
                (grep("^Fusions", gene.results$Variable) + 1):nrow(gene.results)
            ][
                as.numeric(gene.results$P[
                    (grep("^Fusions", gene.results$Variable) + 1):nrow(gene.results)
                ]) < 0.05
            ]
        )
] <- col["Fusion"]

# create bar plot of patients with SNVs, CNV, and fusions/rearrangements, and
# guide-line indicated alterations and alterations with clinical trial/treatment
# in other tumor type
plot.data.sub <- ddply(
    unique(variant.df[, c("de_id", "snv_type")]), .(snv_type), summarize,
    prop = round(length(de_id) / totals.list[1], 2) * 100
)
plot.data.sub$snv_type <- factor(
    plot.data.sub$snv_type,
    levels = plot.data.sub[order(plot.data.sub$prop, decreasing = TRUE), "snv_type"]
)
g1 <- ggplot(plot.data.sub, aes(x = snv_type, y = prop, fill = snv_type)) +
    geom_bar(stat = "identity", color = "black", alpha = 0.75) +
    geom_text(aes(label = paste(prop, "%", sep = "")), size = 8, vjust = -1) +
    scale_y_continuous(limits = c(0, max(plot.data.sub$prop) + 5)) +
    scale_fill_manual(
        breaks = levels(plot.data$snv_type), values = col, na.value = NA,
        labels = levels(plot.data$snv_type)
    ) +
    coord_cartesian(clip = "off") +
    theme_classic() +
    theme(
        text = element_text(size = 24),
        axis.title.x = element_blank(),
        legend.position = "none"
    ) +
    labs(y = "Percent of tumors\nwith alteration type")

plot.data.sub <- ddply(
    unique(
        therapy.df[
            therapy.df$marker_component_marker_type %in% c("SNV", "CNV", "Fusion"),
            c("de_id", "amp_cap_asco_tier")
        ]
    ),
    .(amp_cap_asco_tier), summarize,
    prop = round(length(de_id) / totals.list[1], 2) * 100
)
plot.data.sub$amp_cap_asco_tier <- factor(
    plot.data.sub$amp_cap_asco_tier,
    levels = c("Guideline indicated", "Clinical trial or therapy in other tumor type")
)
g2 <- ggplot(plot.data.sub, aes(x = amp_cap_asco_tier, y = prop, fill = amp_cap_asco_tier)) +
    geom_bar(stat = "identity", color = "black", alpha = 0.75) +
    geom_text(aes(label = paste(prop, "%", sep = "")), size = 8, vjust = -1) +
    scale_x_discrete(labels = stringr::str_wrap(levels(plot.data.sub$amp_cap_asco_tier), 10)) +
    scale_y_continuous(limits = c(0, max(plot.data.sub$prop) + 5)) +
    scale_fill_manual(values = c("red", "orange"), na.value = NA) +
    coord_cartesian(clip = "off") +
    theme_classic() +
    theme(
        text = element_text(size = 24),
        axis.title.x = element_blank(),
        axis.title.y = element_blank(),
        axis.ticks.y = element_blank(),
        axis.text.y = element_blank(),
        axis.line.y = element_blank(),
        axis.line.x = element_line(),
        legend.position = "none"
    )
g1 <- ggarrange(g1, NULL, g2, nrow = 1, ncol = 3, widths = c(0.75, 0, 0.25), align = "h")

# create stacked bar plot of all genes with alterations detected in >=2% of patients
target.genes <- gene.freq$gene[round(gene.freq$freq, 2) >= 0.02]
plot.data.sub <- plot.data[plot.data$gene %in% target.genes, ]
g2 <- ggplot(plot.data.sub, aes(x = gene, y = var_freq, fill = snv_type)) +
    geom_bar(position = "stack", stat = "identity") +
    geom_text(
        inherit.aes = FALSE,
        data = gene.freq[gene.freq$gene %in% target.genes, ],
        aes(x = gene, y = freq + 0.01, label = perc),
        size = 6, angle = 90, hjust = 0
    ) +
    geom_point(
        inherit.aes = FALSE,
        data = gene.freq[gene.freq$gene %in% target.genes, ],
        aes(x = gene, y = freq + 0.01 + 0.1),
        color = gene.freq$sub_diff[gene.freq$gene %in% target.genes],
        size = 5
    ) +
    geom_point(
        inherit.aes = FALSE,
        data = gene.freq[gene.freq$gene %in% target.genes, ],
        aes(x = gene, y = freq + 0.01 + 0.11),
        color = gene.freq$indel_diff[gene.freq$gene %in% target.genes],
        size = 5
    ) +
    geom_point(
        inherit.aes = FALSE,
        data = gene.freq[gene.freq$gene %in% target.genes, ],
        aes(x = gene, y = freq + 0.01 + 0.12),
        color = gene.freq$splice_diff[gene.freq$gene %in% target.genes],
        size = 5
    ) +
    geom_point(
        inherit.aes = FALSE,
        data = gene.freq[gene.freq$gene %in% target.genes, ],
        aes(x = gene, y = freq + 0.01 + 0.12),
        color = gene.freq$fusion_diff[gene.freq$gene %in% target.genes],
        size = 5
    ) +
    geom_point(
        inherit.aes = FALSE,
        data = gene.freq[gene.freq$gene %in% target.genes, ],
        aes(x = gene, y = freq + 0.01 + 0.12),
        color = gene.freq$cnv_diff[gene.freq$gene %in% target.genes],
        size = 5
    ) +
    geom_point(
        inherit.aes = FALSE,
        data = gene.freq[gene.freq$gene %in% target.genes, ],
        aes(x = gene, y = freq + 0.01 + 0.12),
        color = gene.freq$other_diff[gene.freq$gene %in% target.genes],
        size = 5
    ) +
    geom_bar(
        inherit.aes = FALSE,
        data = gene.freq[gene.freq$gene %in% target.genes, ],
        aes(x = gene, y = freq),
        position = "stack", stat = "identity", color = "grey50", fill = NA
    ) +
    scale_discrete_manual("fill",
        values = col[names(table(plot.data.sub$snv_type)
        [table(plot.data.sub$snv_type) > 0])],
        name = ""
    ) +
    scale_y_continuous(limits = c(0, max(gene.freq$freq) + 0.15), expand = c(0, 0)) +
    coord_cartesian(clip = "off") +
    guides(fill = "none") +
    theme_classic() +
    theme(
        text = element_text(size = 24),
        axis.text.x = element_text(angle = 45, hjust = 1, color = gene.col[target.genes]),
        axis.title.x = element_markdown(size = 20),
        plot.margin = margin(20, 5, 5, 5)
    ) +
    labs(
        y = paste(
            "Alteration prevalence\n",
            "(number of SNVs, CNVs or fusions ÷\ntotal patients passing assay)"
        ),
        x = paste("Genes with cumulative SNV, CNV, and/or fusion prevalence of 2%<br>",
            "<b style='color:#E74C3C'>red</b> = Guideline-indicated alteration(s) detected<br>",
            "<b style='color:#F39C12'>orange</b> = Alteration(s) detected with clinical trial or therapy in other tumor type",
            sep = ""
        )
    )
plot.data.sub <- plot.data[
    plot.data$gene %in% target.genes &
        plot.data$gene %in% gene.freq$gene[round(gene.freq$freq, 2) < 0.1],
]
g2 <- g2 + annotation_custom(
    grob = ggplotGrob(
        ggplot(plot.data.sub, aes(x = gene, y = var_freq, fill = snv_type)) +
            geom_bar(position = "stack", stat = "identity") +
            geom_text(
                inherit.aes = FALSE,
                data = gene.freq[round(gene.freq$freq, 2) >= 0.02 &
                    round(gene.freq$freq, 2) < 0.1, ],
                aes(x = gene, y = freq + 0.001, label = perc),
                size = 6, angle = 90, hjust = 0
            ) +
            geom_bar(
                inherit.aes = FALSE,
                data = gene.freq[round(gene.freq$freq, 2) >= 0.02 &
                    round(gene.freq$freq, 2) < 0.1, ],
                aes(x = gene, y = freq),
                position = "stack", stat = "identity", color = "grey50", fill = NA
            ) +
            scale_discrete_manual("fill",
                values = col[names(table(plot.data.sub$snv_type)
                [table(plot.data.sub$snv_type) > 0])],
                name = ""
            ) +
            scale_y_continuous(
                breaks = seq(0, 0.1, 0.02), labels = round(seq(0, 0.1, 0.02), 2),
                limits = c(0, 0.11), expand = c(0, 0)
            ) +
            coord_cartesian(clip = "off") +
            theme_classic() +
            guides(fill = "none") +
            theme(
                text = element_text(size = 24),
                axis.title.y = element_blank(),
                axis.text.x = element_text(
                    angle = 45, hjust = 1,
                    color = gene.col[-seq_len(nrow(gene.freq[round(gene.freq$freq, 2) >= 0.1, ]))]
                ),
                axis.title.x = element_blank(),
                plot.margin = margin(0, 0, 0, 0)
            )
    ),
    xmin = nrow(gene.freq[round(gene.freq$freq, 2) >= 0.1, ]) - 0.5,
    xmax = length(unique(plot.data[plot.data$gene %in% target.genes, ]$gene)) + 0.5,
    ymin = round(max(gene.freq$freq[round(gene.freq$freq, 2) < 0.1]) * 3.4, 2),
    ymax = max(gene.freq$freq) + 0.15
)

# extract data needed for stacked bar plotting of RNA-based results
# isolating unique instances and calculate number of alterations per gene
plot.data <- variant.df[
    variant.df$snv_type == "Fusion",
    c("de_id", "snv_type", "gene", "variant")
]
plot.data$gene[
    grep("Skip", plot.data$variant)
] <- grep("Skip", plot.data$variant, value = TRUE)
plot.data <- ddply(
    unique(plot.data[, c("de_id", "snv_type", "gene")]),
    .(gene, snv_type), summarize,
    var_count = length(de_id)
)
plot.data$snv_type[grepl("Skip", plot.data$gene)] <- "Exon skipping variant"
plot.data$gene <- sapply(strsplit(plot.data$gene, " "), function(x) {
    x[1]
})

# convert alteration counts to frequencies using appropriate denominators
total.patients <- sum(
    !(patient.df$RNA_Rearrangements %in% c("Fail", "Not Performed"))
)
plot.data$var_freq <- plot.data$var_count / total.patients

# calculate combined frequencies of alterations per gene and sort
# from highest to lowest
gene.freq <- ddply(
    plot.data, .(gene), summarize,
    freq = sum(var_freq),
    perc = paste(round(sum(var_freq), 4) * 100, "%", sep = "")
)
gene.freq <- gene.freq[order(gene.freq$freq, decreasing = TRUE), ]

# make sure levels of variables are in correct order
plot.data$gene <- factor(plot.data$gene, levels = gene.freq$gene)

# create color vector to color genes that have tier 1 alterations or
# alterations matched to a clinical trial
gene.col <- ifelse(
    levels(plot.data$gene) %in%
        therapy.df$marker_component_marker[grep("Guideline", therapy.df$amp_cap_asco_tier)],
    "red",
    ifelse(
        levels(plot.data$gene) %in%
            therapy.df$marker_component_marker[grep("Clinical", therapy.df$amp_cap_asco_tier)],
        "orange", "black"
    )
)
names(gene.col) <- levels(plot.data$gene)

# tag genes with alterations that have different frequencies between histology
gene.freq$fusion_diff[
    gene.freq$gene %in%
        gsub(
            " Exon 14 Skipping", "",
            gene.results$Variable[
                (grep("^Fusions", gene.results$Variable) + 1):nrow(gene.results)
            ][
                as.numeric(gene.results$P[
                    (grep("^Fusions", gene.results$Variable) + 1):nrow(gene.results)
                ]) < 0.05
            ]
        )
] <- "#CAB2D6"
gene.freq$fusion_diff[gene.freq$gene == "MET"] <- "#ABEBC6"

# create stacked bar plot of genes with fusions/rearrangements detected in >= 0.1%
# of patients
col <- c("#ABEBC6", "#CAB2D6")
names(col) <- c("Exon skipping variant", "Fusion")
target.genes <- gene.freq$gene[gene.freq$freq >= 0.001]
plot.data.sub <- plot.data[plot.data$gene %in% target.genes, ]
g3 <- ggplot(plot.data.sub, aes(x = gene, y = var_freq, fill = snv_type)) +
    geom_bar(position = "stack", stat = "identity") +
    geom_text(
        inherit.aes = FALSE,
        data = gene.freq[gene.freq$gene %in% target.genes, ],
        aes(x = gene, y = freq + 0.001, label = perc),
        size = 7, angle = 90, hjust = 0
    ) +
    geom_point(
        inherit.aes = FALSE,
        data = gene.freq[gene.freq$gene %in% target.genes, ],
        aes(x = gene, y = freq + 0.001 + 0.02),
        color = gene.freq$fusion_diff[gene.freq$gene %in% target.genes],
        size = 5
    ) +
    geom_bar(
        inherit.aes = FALSE,
        data = gene.freq[gene.freq$gene %in% target.genes, ],
        aes(x = gene, y = freq),
        position = "stack", stat = "identity", color = "grey50", fill = NA
    ) +
    scale_discrete_manual("fill",
        values = col[names(table(plot.data.sub$snv_type)
        [table(plot.data.sub$snv_type) > 0])],
        name = ""
    ) +
    scale_y_continuous(limits = c(0, max(plot.data$var_freq) + 0.03), expand = c(0, 0)) +
    coord_cartesian(clip = "off") +
    theme_classic() +
    theme(
        text = element_text(size = 24),
        axis.text.x = element_text(angle = 45, hjust = 1, color = gene.col[target.genes]),
        axis.title.x = element_markdown(size = 20),
        legend.title = element_blank(),
        legend.text = element_text(size = 24),
        legend.position = "top",
        plot.margin = margin(20, 5, 5, 5)
    ) +
    labs(
        y = paste(
            "Alteration prevalence\n",
            "(number of fusions/variants ÷\ntotal patients passing assay)"
        ),
        x = paste("Fusions or skipping variants detected in 0.1% of patients by RNA-seq<br>",
            "<b style='color:#E74C3C'>red</b> = Guideline-indicated alteration(s) detected<br>",
            "<b style='color:#F39C12'>orange</b> = Alteration(s) detected with clinical trial or therapy in other tumor type",
            sep = ""
        )
    )

# put all plots together
g <- ggarrange(
    NULL, g1, NULL, g2, NULL, g3,
    nrow = 6, ncol = 1, heights = c(0.03, 0.5, 0, 1, 0, 0.6),
    labels = c("", "A", "", "B", "", "C"), font.label = list(size = 30), label.y = 1.08
)
ggsave(
    "Output/Plots/Genomic_alteration_overview.tiff", g,
    device = "tiff", bg = "white", width = 17, height = 20
)
g
Figure 1: Overview of genomic alterations detected by the TSO500 in NSCLC cases

2.4 Association of TMB and PD-L1 IHC with known driver mutations

2.4.1 Distribution of TMB and PD-L1 in tumors with non-KRAS driver mutations

# construct data needed for plotting and testing, isolating unique instances
target.genes <- c("ALK", "EGFR", "MET", "BRAF", "ROS1", "ERBB2")
plot.data <- patient.df[, c("de_id", "TMB", "PD_L1_IHC_22C3_interp", "histology_group")]

for (gene in target.genes) {
    plot.data <- cbind(
        plot.data,
        ifelse(
            patient.df$de_id %in% therapy.df$de_id[
                therapy.df$marker_component_marker == gene &
                    grepl("Guideline", therapy.df$amp_cap_asco_tier) &
                    therapy.df$marker_component_marker_type %in% c("SNV", "CNV", "Fusion")
            ],
            "Driver mutation",
            ifelse(
                patient.df$de_id %in% variant.df$de_id[
                    variant.df$gene == gene &
                        !(variant.df$de_id %in% therapy.df$de_id[
                            therapy.df$marker_component_marker == gene &
                                grepl("Guideline", therapy.df$amp_cap_asco_tier) &
                                therapy.df$marker_component_marker_type %in% c("SNV", "CNV", "Fusion")
                        ])
                ],
                paste("Non-guideline indicated ", gene, " alterations", sep = ""),
                ifelse(
                    !(patient.df$de_id %in% variant.df$de_id[
                        variant.df$gene %in% therapy.df$marker_component_marker[
                            grepl("Guideline", therapy.df$amp_cap_asco_tier) &
                                therapy.df$marker_component_marker_type %in% c("SNV", "CNV", "Fusion")
                        ]
                    ]),
                    "No driver gene alterations detected", NA
                )
            )
        )
    )
    colnames(plot.data)[ncol(plot.data)] <- gene
}

# format data
plot.data <- reshape2::melt(plot.data, id = c("de_id", "PD_L1_IHC_22C3_interp", "TMB", "histology_group"))
plot.data <- plot.data[
    !is.na(plot.data$TMB) &
        !is.na(plot.data$PD_L1_IHC_22C3_interp) &
        !is.na(plot.data$value),
]
colnames(plot.data)[ncol(plot.data)] <- "status"
colnames(plot.data)[colnames(plot.data) == "TMB"] <- "value"

# add exact driver mutation details
extra.plot.data <- data.frame()
for (j in seq_len(nrow(plot.data))) {
    if (plot.data$status[j] == "Driver mutation") {
        variant <- unique(therapy.df$marker_component[
            therapy.df$de_id == plot.data$de_id[j] &
                therapy.df$marker_component_marker == plot.data$variable[j] &
                grepl("Guideline", therapy.df$amp_cap_asco_tier) &
                therapy.df$marker_component_marker_type %in% c("SNV", "CNV", "Fusion")
        ])
        if (length(variant) > 1) {
            plot.data$status[j] <- variant[1]
            extra.plot.data <- rbind(
                extra.plot.data, data.frame(plot.data[j, -ncol(plot.data)], status = variant[-1])
            )
        } else {
            plot.data$status[j] <- variant
        }
    }
}
plot.data <- rbind(plot.data, extra.plot.data)

# group less prevalent mutations together if they are found in <0.5% of patients
plot.data$status_grouped <- plot.data$status
threshold <- 0.005 * nrow(patient.df)
for (gene in target.genes) {
    if (
        (length(plot.data$status_grouped[plot.data$variable == gene &
            grepl("Fusion", plot.data$status_grouped)]) ==
            length(plot.data$status_grouped[plot.data$variable == gene &
                !grepl("^No", plot.data$status_grouped)])) &
            length(table(plot.data$status_grouped[plot.data$variable == gene &
                !grepl("^No", plot.data$status_grouped)])[
                table(plot.data$status_grouped[plot.data$variable == gene &
                    !grepl("^No", plot.data$status_grouped)]) < threshold
            ]) ==
                length(table(plot.data$status_grouped[plot.data$variable == gene &
                    !grepl("^No", plot.data$status_grouped)]))
    ) {
        plot.data$status_grouped[
            plot.data$variable == gene & !grepl("^No", plot.data$status_grouped) &
                plot.data$status_grouped %in%
                    names(table(plot.data$status_grouped)[table(plot.data$status_grouped) < threshold])
        ] <- paste(gene, " Fusions", sep = "")
    } else if (
        length(table(plot.data$status_grouped[plot.data$variable == gene &
            !grepl("^No", plot.data$status_grouped)])[
            table(plot.data$status_grouped[plot.data$variable == gene &
                !grepl("^No", plot.data$status_grouped)]) < threshold
        ]) ==
            length(table(plot.data$status_grouped[plot.data$variable == gene &
                !grepl("^No", plot.data$status_grouped)]))
    ) {
        plot.data$status_grouped[
            plot.data$variable == gene & !grepl("^No", plot.data$status_grouped) &
                plot.data$status_grouped %in%
                    names(table(plot.data$status_grouped)[table(plot.data$status_grouped) < threshold])
        ] <- paste(gene, " guideline indicated alterations", sep = "")
    } else {
        plot.data$status_grouped[
            plot.data$variable == gene & !grepl("^No", plot.data$status_grouped) &
                plot.data$status_grouped %in%
                    names(table(plot.data$status_grouped)[table(plot.data$status_grouped) < threshold])
        ] <- paste("Other ", gene, " guideline indicated alterations", sep = "")
    }
}

# group all EGFR driver mutations together
plot.data$status_grouped[
    plot.data$variable == "EGFR" &
        !(plot.data$status_grouped %in%
            c("Non-guideline indicated EGFR alterations", "No driver gene alterations detected"))
] <- "EGFR guideline indicated alterations"

# re-label other ALK guideline indicated as other ALK fusions
plot.data$status_grouped[grep("Other ALK", plot.data$status_grouped)] <- "Other ALK fusions"

# re-label other MET guideline indicated as MET amplification
plot.data$status_grouped[grep("Other MET", plot.data$status_grouped)] <- "MET amplification"

# calculate PD-L1 group totals and percents
totals <- data.frame()
for (gene in target.genes) {
    for (
        status_grouped in levels(factor(plot.data[plot.data$variable == gene, ]$status_grouped))
    ) {
        totals <- rbind(totals, ddply(
            plot.data[plot.data$variable == gene & plot.data$status_grouped == status_grouped, ],
            .(variable, status_grouped, PD_L1_IHC_22C3_interp), summarize,
            N = length(de_id),
            value = length(de_id) /
                nrow(plot.data[plot.data$variable == gene & plot.data$status_grouped == status_grouped, ])
        ))
    }
}

# calculate/collate summary stats and write out results
summ.stats <- ddply(
    plot.data, .(variable, status_grouped), summarize,
    `Alterations in group` = paste(unique(status), collapse = ", "),
    `Median TMB` = median(value)
)
for (gene in target.genes) {
    summ.stats$`Alterations in group`[
        summ.stats$variable == gene & grepl("Non-guideline", summ.stats$status_grouped)
    ] <- paste(
        unique(variant.df$variant_go_name[
            variant.df$gene == gene &
                !(variant.df$variant_go_name %in%
                    therapy.df$marker_component[grep("Guideline", therapy.df$amp_cap_asco_tier)])
        ]),
        collapse = ", "
    )
}
summ.stats$`Alterations in group`[grep("No ", summ.stats$`Alterations in group`)] <- "-"
colnames(summ.stats)[1:2] <- c("Gene", "Alteration group")
summ.stats$`Alteration group` <- gsub(
    "Non-guideline indicated", "Other", summ.stats$`Alteration group`
)

wb1 <- createWorkbook()
addWorksheet(wb1, "Alteration groups")
writeData(
    wb1, "Alteration groups", summ.stats[, -ncol(summ.stats)],
    keepNA = TRUE, colNames = TRUE
)
setColWidths(
    wb1, "Alteration groups",
    cols = seq_len(ncol(summ.stats) - 1),
    widths = c(6, 30, 30)
)
addStyle(
    wb1, "Alteration groups",
    cols = seq_len(ncol(summ.stats) - 1), rows = 1,
    gridExpand = TRUE, style = bold, stack = TRUE
)
addStyle(
    wb1, "Alteration groups",
    cols = seq_len(ncol(summ.stats) - 1),
    rows = c(1, 2, (nrow(summ.stats) + 2)), gridExpand = TRUE,
    style = horizontal_border_med, stack = TRUE
)
convertNum(summ.stats, wb1, "Alteration groups", TRUE)
saveWorkbook(
    wb1, "Output/Tables/Alteration_groups_non-KRAS_drivers.xlsx",
    overwrite = TRUE
)

# test for differences in frequencies of PD-L1 levels and TMB
# and write out the results
pvals <- data.frame()
for (gene in target.genes) {
    for (j in unique(plot.data$status_grouped[plot.data$variable == gene])) {
        for (k in unique(plot.data$status_grouped[plot.data$variable == gene])) {
            if (j != k) {
                pdl1.p <- logistf::logistftest(logistf::logistf(
                  ifelse(
                    status_grouped == names(table(status_grouped))[1], 1, 0
                  ) ~ PD_L1_IHC_22C3_interp + histology_group,
                  data = plot.data[plot.data$variable == gene & plot.data$status_grouped %in% c(j, k), ]
                ))$prob
                tmb.p <- summary(lm(
                    log(value + 1) ~ status_grouped  + histology_group, 
                    data = plot.data[plot.data$variable == gene & plot.data$status_grouped %in% c(j, k), ]
                ))$coefficients[2, 4]
                pvals <- rbind(
                    pvals,
                    data.frame(
                        Gene = gene, `Alteration group 1` = j, `Alteration group 2` = k,
                        `PD-L1 Adjusted P-value` = pdl1.p, `TMB Adjusted P-value` = tmb.p,
                        check.names = FALSE
                    )
                )
            }
        }
    }
    pvals <- pvals[!duplicated(pvals$`TMB Adjusted P-value`), ]
    pvals$`PD-L1 Adjusted P-value`[pvals$variable == gene] <- p.adjust(
        pvals$`PD-L1 Adjusted P-value`[pvals$variable == gene],
        method = "bonferroni"
    )
    pvals$`TMB Adjusted P-value`[pvals$variable == gene] <- p.adjust(
        pvals$`TMB Adjusted P-value`[pvals$variable == gene],
        method = "bonferroni"
    )
}
pvals$`Alteration group 1` <- gsub(
    "Non-guideline indicated", "Other", pvals$`Alteration group 1`
)
pvals$`Alteration group 2` <- gsub(
    "Non-guideline indicated", "Other", pvals$`Alteration group 2`
)

wb2 <- createWorkbook()
addWorksheet(wb2, "TMB and PD-L1")
writeData(wb2, "TMB and PD-L1", pvals, keepNA = TRUE, colNames = TRUE)
setColWidths(
    wb2, "TMB and PD-L1",
    cols = seq_len(ncol(pvals)),
    widths = c(6, 30, 30, 21, 21)
)
addStyle(
    wb2, "TMB and PD-L1",
    cols = seq_len(ncol(pvals)), rows = 1,
    gridExpand = TRUE, style = bold, stack = TRUE
)
addStyle(
    wb2, "TMB and PD-L1",
    cols = seq_len(ncol(pvals)),
    rows = c(1, 2, (nrow(pvals) + 2)), gridExpand = TRUE,
    style = horizontal_border_med, stack = TRUE
)
convertNum(pvals, wb2, "TMB and PD-L1", TRUE)
saveWorkbook(
    wb2, "Output/Tables/Testing_diff_in_TMB_PD_L1_non-KRAS_drivers.xlsx",
    overwrite = TRUE
)

pvals$PD_L1_labels <- ifelse(
    pvals$`PD-L1 Adjusted P-value` < 0.0001, "****",
    ifelse(
        pvals$`PD-L1 Adjusted P-value` < 0.001, "***",
        ifelse(
            pvals$`PD-L1 Adjusted P-value` < 0.01, "**",
            ifelse(pvals$`PD-L1 Adjusted P-value` < 0.05, "*", "ns")
        )
    )
)
pvals$TMB_labels <- ifelse(
    pvals$`TMB Adjusted P-value` < 0.0001, "****",
    ifelse(
        pvals$`TMB Adjusted P-value` < 0.001, "***",
        ifelse(
            pvals$`TMB Adjusted P-value` < 0.01, "**",
            ifelse(pvals$`TMB Adjusted P-value` < 0.05, "*", "ns")
        )
    )
)

# combine data for plotting
plot.data <- data.frame(
    variable = c(plot.data$variable, totals$variable),
    status_grouped = c(plot.data$status_grouped, totals$status_grouped),
    PD_L1_IHC_22C3_interp = c(plot.data$PD_L1_IHC_22C3_interp, totals$PD_L1_IHC_22C3_interp),
    N = c(rep(NA, nrow(plot.data)), totals$N),
    value = c(plot.data$value, totals$value),
    plot = c(
        rep("Log transformed TMB (Mut/Mb)", nrow(plot.data)),
        rep("PD-L1 TPS level", nrow(totals))
    )
)
plot.data$status_grouped <- gsub(
    "Non-guideline indicated", "Other", plot.data$status_grouped
)
plot.data$status_grouped <- factor(
    plot.data$status_grouped,
    levels = c(
        sort(unique(plot.data$status_grouped)[
            !(unique(plot.data$status_grouped) %in%
                c(
                    unique(grep("Other.*guideline", plot.data$status_grouped, value = TRUE)),
                    grep(
                        "Other.*alterations",
                        unique(grep(
                            "Other.*guideline", plot.data$status_grouped,
                            value = TRUE, invert = TRUE
                        )),
                        value = TRUE
                    ),
                    "No driver gene alterations detected"
                ))
        ]),
        unique(grep("Other.*guideline", plot.data$status_grouped, value = TRUE)),
        grep(
            "Other.*alterations",
            unique(grep(
                "Other.*guideline", plot.data$status_grouped,
                value = TRUE, invert = TRUE
            )),
            value = TRUE
        ),
        "No driver gene alterations detected"
    )
)

# plot distributions of TMB and PD-L1 for driver genes
plot.list <- list()
for (gene in target.genes) {
    if (nrow(pvals[pvals$Gene == gene & pvals$TMB_labels != "ns", ]) > 0) {
        g1 <- ggplot(
            plot.data[
                plot.data$variable == gene & plot.data$plot == "Log transformed TMB (Mut/Mb)",
            ],
            aes(y = log(value + 1), x = status_grouped)
        ) +
            geom_violin(alpha = 0.5, scale = "width", fill = "grey50", color = "black") +
            geom_boxplot(width = 0.3, fill = "white", color = "black", outlier.size = 0) +
            geom_point(color = "black", size = 3.5) +
            geom_hline(yintercept = log(10 + 1), color = "black", linetype = "dashed") +
            geom_text(
                data = summ.stats[summ.stats$Gene == gene, ],
                aes(x = `Alteration group`, label = round(`Median TMB`, 1)), y = -0.5, size = 8
            ) +
            geom_signif(
                data = data.frame(
                    pvals[pvals$Gene == gene & pvals$TMB_labels != "ns", ],
                    y = sapply(
                        seq(1, nrow(pvals[pvals$Gene == gene & pvals$TMB_labels != "ns", ]), 1) / 4,
                        function(x) {
                            x + max(log(
                                plot.data$value[plot.data$variable == gene &
                                    plot.data$plot == "Log transformed TMB (Mut/Mb)"] + 1
                            ))
                        }
                    ),
                    check.names = FALSE
                ),
                aes(
                    xmin = `Alteration group 1`, xmax = `Alteration group 2`, y_position = y,
                    annotations = TMB_labels
                ),
                manual = TRUE, tip_length = 0, textsize = 10, vjust = 0.8
            ) +
            scale_y_continuous(
                breaks = c(0, 1, 2, 3, 4, 5),
                labels = c(
                    0,
                    paste("1 (", round(exp(1) - 1, 1), ")", sep = ""),
                    paste("2 (", round(exp(2) - 1, 1), ")", sep = ""),
                    paste("3 (", round(exp(3) - 1, 1), ")", sep = ""),
                    paste("4 (", round(exp(4) - 1, 1), ")", sep = ""),
                    paste("5 (", round(exp(5) - 1, 1), ")", sep = "")
                )
            ) +
            coord_cartesian(ylim = c(-0.5, 5), clip = "off") +
            theme_classic() +
            labs(y = "Log transformed TMB\n(Mut/Mb)", x = "") +
            theme(
                text = element_text(size = 22),
                axis.text.x = element_blank(),
                axis.ticks.x = element_blank(),
                axis.title.x = element_blank(),
                axis.title.y = element_text(vjust = 1)
            )
    } else {
        g1 <- ggplot(
            plot.data[
                plot.data$variable == gene & plot.data$plot == "Log transformed TMB (Mut/Mb)",
            ],
            aes(y = log(value + 1), x = status_grouped)
        ) +
            geom_violin(alpha = 0.5, scale = "width", fill = "grey50", color = "black") +
            geom_boxplot(width = 0.3, fill = "white", color = "black", outlier.size = 0) +
            geom_point(color = "black", size = 3.5) +
            geom_hline(yintercept = log(10 + 1), color = "black", linetype = "dashed") +
            geom_text(
                data = summ.stats[summ.stats$Gene == gene, ],
                aes(x = `Alteration group`, label = round(`Median TMB`, 1)), y = -0.5, size = 8
            ) +
            scale_y_continuous(
                breaks = c(0, 1, 2, 3, 4, 5),
                labels = c(
                    0,
                    paste("1 (", round(exp(1) - 1, 1), ")", sep = ""),
                    paste("2 (", round(exp(2) - 1, 1), ")", sep = ""),
                    paste("3 (", round(exp(3) - 1, 1), ")", sep = ""),
                    paste("4 (", round(exp(4) - 1, 1), ")", sep = ""),
                    paste("5 (", round(exp(5) - 1, 1), ")", sep = "")
                )
            ) +
            coord_cartesian(ylim = c(-0.5, 5), clip = "off") +
            theme_classic() +
            labs(y = "Log transformed TMB\n(Mut/Mb)", x = "") +
            theme(
                text = element_text(size = 22),
                axis.text.x = element_blank(),
                axis.ticks.x = element_blank(),
                axis.title.x = element_blank(),
                axis.title.y = element_text(vjust = 1)
            )
    }
    if (nrow(pvals[pvals$Gene == gene & pvals$PD_L1_labels != "ns", ]) > 0) {
        g2 <- ggplot(
            plot.data[plot.data$variable == gene & plot.data$plot == "PD-L1 TPS level", ],
            aes(y = value, x = status_grouped, fill = PD_L1_IHC_22C3_interp)
        ) +
            geom_bar(stat = "identity", color = "black", alpha = 0.5) +
            geom_label(
                aes(
                    label = paste(N, " (", round(value, 2) * 100, "%)", sep = ""),
                    color = PD_L1_IHC_22C3_interp
                ),
                position = position_stack(vjust = 0.5), size = 8, fill = "white"
            ) +
            geom_signif(
                inherit.aes = FALSE,
                data = data.frame(
                    pvals[pvals$Gene == gene & pvals$PD_L1_labels != "ns", ],
                    y = sapply(
                        seq(1, nrow(pvals[pvals$Gene == gene & pvals$PD_L1_labels != "ns", ]), 1) / 20,
                        function(x) {
                            x + 1.05
                        }
                    ),
                    check.names = FALSE
                ),
                aes(
                    xmin = `Alteration group 1`, xmax = `Alteration group 2`, y_position = y,
                    annotations = PD_L1_labels
                ),
                manual = TRUE, tip_length = 0, textsize = 10, vjust = 0.8
            ) +
            scale_fill_manual(
                labels = c(
                    levels(plot.data$PD_L1_IHC_22C3_interp)[1:2],
                    expression("High (" >= "50%)")
                ),
                values = c("blue", "purple", "deeppink")
            ) +
            scale_color_manual(values = rep("black", 3)) +
            scale_y_continuous(labels = scales::percent, breaks = seq(0, 1, 0.25)) +
            scale_x_discrete(labels = scales::wrap_format(10)) +
            coord_cartesian(ylim = c(0, 1), clip = "off") +
            theme_classic() +
            labs(
                y = paste(
                    "Tumors with PD-L1 level<br>",
                    "<b style='color:#0000FF'>blue</b> = Negative<br>",
                    "<b style='color:#A020F0'>purple</b> = Low<br>",
                    "<b style='color:#FF1493'>pink</b> = High",
                    sep = ""
                ),
                x = ""
            ) +
            guides(
                fill = "none",
                color = "none"
            ) +
            theme(
                text = element_text(size = 20),
                axis.title.y = element_markdown(size = 22),
                axis.text.x = element_text(size = 22)
            )
    } else {
        g2 <- ggplot(
            plot.data[plot.data$variable == gene & plot.data$plot == "PD-L1 TPS level", ],
            aes(y = value, x = status_grouped, fill = PD_L1_IHC_22C3_interp)
        ) +
            geom_bar(stat = "identity", color = "black", alpha = 0.5) +
            geom_label(
                aes(
                    label = paste(N, " (", round(value, 2) * 100, "%)", sep = ""),
                    color = PD_L1_IHC_22C3_interp
                ),
                position = position_stack(vjust = 0.5), size = 8, fill = "white"
            ) +
            scale_fill_manual(
                labels = c(
                    levels(plot.data$PD_L1_IHC_22C3_interp)[1:2],
                    expression("High (" >= "50%)")
                ),
                values = c("blue", "purple", "deeppink")
            ) +
            scale_color_manual(values = rep("black", 3)) +
            scale_y_continuous(labels = scales::percent, breaks = seq(0, 1, 0.25)) +
            scale_x_discrete(labels = scales::wrap_format(10)) +
            coord_cartesian(ylim = c(0, 1), clip = "off") +
            theme_classic() +
            labs(
                y = paste(
                    "Tumors with PD-L1 level<br>",
                    "<b style='color:#0000FF'>blue</b> = Negative<br>",
                    "<b style='color:#A020F0'>purple</b> = Low<br>",
                    "<b style='color:#FF1493'>pink</b> = High",
                    sep = ""
                ),
                x = ""
            ) +
            guides(
                fill = "none",
                color = "none"
            ) +
            theme(
                text = element_text(size = 22),
                axis.title.y = element_markdown(size = 22),
                axis.text.x = element_text(size = 22)
            )
    }
    g <- ggarrange(
        NULL, g1, NULL, g2,
        nrow = 4, ncol = 1,
        heights = c(
            (nrow(pvals[pvals$Gene == gene & pvals$TMB_labels != "ns", ]) / 2) / 10, 1,
            (nrow(pvals[pvals$Gene == gene & pvals$PD_L1_labels != "ns", ]) / 2) / 10, 1.5
        ),
        align = "v"
    )
    plot.list[[length(plot.list) + 1]] <- g
}

# combine and save plots
g <- ggarrange(
    plotlist = plot.list, nrow = 2, ncol = 3, heights = rep(1, 2), widths = rep(1, 3),
    labels = c("A", "B", "C", "D", "E", "F"), font.label = list(size = 35)
)
ggsave(
    "Output/Plots/TMB_PDL1_dist_non-KRAS_drivers.tiff", g,
    device = "tiff", bg = "white",
    width = 37, height = 22, limitsize = FALSE
)
g
Figure 2: TMB and PD-L1 22C3 IHC TPS in non-KRAS driver mutated NSCLC

2.4.2 Distribution of TMB and PD-L1 in tumors with key KRAS driver mutations

# construct data needed for plotting and testing, isolating unique instances
gene <- "KRAS"
plot.data <- patient.df[, c("de_id", "TMB", "PD_L1_IHC_22C3_interp", "histology_group")]

plot.data <- cbind(
    plot.data,
    ifelse(
        patient.df$de_id %in% therapy.df$de_id[
            therapy.df$marker_component_marker == gene &
                grepl("Guideline", therapy.df$amp_cap_asco_tier) &
                therapy.df$marker_component_marker_type %in% c("SNV", "CNV", "Fusion")
        ],
        "Driver mutation",
        ifelse(
            patient.df$de_id %in% variant.df$de_id[
                variant.df$gene == gene &
                    !(variant.df$de_id %in% therapy.df$de_id[
                        therapy.df$marker_component_marker == gene &
                            grepl("Guideline", therapy.df$amp_cap_asco_tier) &
                            therapy.df$marker_component_marker_type %in% c("SNV", "CNV", "Fusion")
                    ])
            ],
            paste("Non-guideline indicated ", gene, " alterations", sep = ""),
            ifelse(
                !(patient.df$de_id %in% variant.df$de_id[
                    variant.df$gene %in% therapy.df$marker_component_marker[
                        grepl("Guideline", therapy.df$amp_cap_asco_tier) &
                            therapy.df$marker_component_marker_type %in% c("SNV", "CNV", "Fusion")
                    ]
                ]),
                "No driver gene alterations detected", NA
            )
        )
    )
)
colnames(plot.data)[ncol(plot.data)] <- gene

# format data
plot.data <- reshape2::melt(plot.data, id = c("de_id", "PD_L1_IHC_22C3_interp", "TMB", "histology_group"))
plot.data <- plot.data[
    !is.na(plot.data$TMB) &
        !is.na(plot.data$PD_L1_IHC_22C3_interp) &
        !is.na(plot.data$value),
]
colnames(plot.data)[ncol(plot.data)] <- "status"
colnames(plot.data)[colnames(plot.data) == "TMB"] <- "value"

# add exact driver mutation details
extra.plot.data <- data.frame()
for (j in seq_len(nrow(plot.data))) {
    if (plot.data$status[j] == "Driver mutation") {
        variant <- unique(therapy.df$marker_component[
            therapy.df$de_id == plot.data$de_id[j] &
                therapy.df$marker_component_marker == plot.data$variable[j] &
                grepl("Guideline", therapy.df$amp_cap_asco_tier) &
                therapy.df$marker_component_marker_type %in% c("SNV", "CNV", "Fusion")
        ])
        if (length(variant) > 1) {
            plot.data$status[j] <- variant[1]
            extra.plot.data <- rbind(
                extra.plot.data, data.frame(plot.data[j, -ncol(plot.data)], status = variant[-1])
            )
        } else {
            plot.data$status[j] <- variant
        }
    }
}
plot.data <- rbind(plot.data, extra.plot.data)

# group less prevalent mutations together if they are found in <0.5% of patients
plot.data$status_grouped <- plot.data$status
threshold <- 0.005 * nrow(patient.df)
if (
    (length(plot.data$status_grouped[plot.data$variable == gene &
        grepl("Fusion", plot.data$status_grouped)]) ==
        length(plot.data$status_grouped[plot.data$variable == gene &
            !grepl("^No", plot.data$status_grouped)])) &
        length(table(plot.data$status_grouped[plot.data$variable == gene &
            !grepl("^No", plot.data$status_grouped)])[
            table(plot.data$status_grouped[plot.data$variable == gene &
                !grepl("^No", plot.data$status_grouped)]) < threshold
        ]) ==
            length(table(plot.data$status_grouped[plot.data$variable == gene &
                !grepl("^No", plot.data$status_grouped)]))
) {
    plot.data$status_grouped[
        plot.data$variable == gene & !grepl("^No", plot.data$status_grouped) &
            plot.data$status_grouped %in%
                names(table(plot.data$status_grouped)[table(plot.data$status_grouped) < threshold])
    ] <- paste(gene, " Fusions", sep = "")
} else if (
    length(table(plot.data$status_grouped[plot.data$variable == gene &
        !grepl("^No", plot.data$status_grouped)])[
        table(plot.data$status_grouped[plot.data$variable == gene &
            !grepl("^No", plot.data$status_grouped)]) < threshold
    ]) ==
        length(table(plot.data$status_grouped[plot.data$variable == gene &
            !grepl("^No", plot.data$status_grouped)]))
) {
    plot.data$status_grouped[
        plot.data$variable == gene & !grepl("^No", plot.data$status_grouped) &
            plot.data$status_grouped %in%
                names(table(plot.data$status_grouped)[table(plot.data$status_grouped) < threshold])
    ] <- paste(gene, " guideline indicated alterations", sep = "")
} else {
    plot.data$status_grouped[
        plot.data$variable == gene & !grepl("^No", plot.data$status_grouped) &
            plot.data$status_grouped %in%
                names(table(plot.data$status_grouped)[table(plot.data$status_grouped) < threshold])
    ] <- paste("Other ", gene, " guideline indicated alterations", sep = "")
}

# calculate PD-L1 group totals and percents
totals <- data.frame()
for (
    status_grouped in levels(factor(plot.data[plot.data$variable == gene, ]$status_grouped))
) {
    totals <- rbind(totals, ddply(
        plot.data[plot.data$variable == gene & plot.data$status_grouped == status_grouped, ],
        .(variable, status_grouped, PD_L1_IHC_22C3_interp), summarize,
        N = length(de_id),
        value = length(de_id) /
            nrow(plot.data[plot.data$variable == gene & plot.data$status_grouped == status_grouped, ])
    ))
}

# calculate/collate summary stats
summ.stats <- ddply(
    plot.data, .(variable, status_grouped), summarize,
    `Alterations in group` = paste(unique(status), collapse = ", "),
    `Median TMB` = median(value)
)
summ.stats$`Alterations in group`[
    summ.stats$variable == gene & grepl("Non-guideline", summ.stats$status_grouped)
] <- paste(
    unique(variant.df$variant_go_name[
        variant.df$gene == gene &
            !(variant.df$variant_go_name %in%
                therapy.df$marker_component[grep("Guideline", therapy.df$amp_cap_asco_tier)])
    ]),
    collapse = ", "
)
summ.stats$`Alterations in group`[grep("No ", summ.stats$`Alterations in group`)] <- "-"
colnames(summ.stats)[1:2] <- c("Gene", "Alteration group")
summ.stats$`Alteration group` <- gsub(
    "Non-guideline indicated", "Other", summ.stats$`Alteration group`
)

# test for differences in frequencies of PD-L1 levels and TMB
# and write out the results
pvals <- data.frame()
for (j in unique(plot.data$status_grouped[plot.data$variable == gene])) {
    for (k in unique(plot.data$status_grouped[plot.data$variable == gene])) {
        if (j != k) {
            pdl1.p <- logistf::logistftest(logistf::logistf(
                ifelse(
                    status_grouped == names(table(status_grouped))[1], 1, 0
                ) ~ PD_L1_IHC_22C3_interp + histology_group,
                data = plot.data[plot.data$variable == gene & plot.data$status_grouped %in% c(j, k), ]
            ))$prob
            tmb.p <- summary(lm(
                log(value + 1) ~ status_grouped + histology_group,
                data = plot.data[plot.data$variable == gene & plot.data$status_grouped %in% c(j, k), ]
            ))$coefficients[2, 4]
            pvals <- rbind(
                pvals,
                data.frame(
                    Gene = gene, `Alteration group 1` = j, `Alteration group 2` = k,
                    `PD-L1 Adjusted P-value` = pdl1.p, `TMB Adjusted P-value` = tmb.p,
                    check.names = FALSE
                )
            )
        }
    }
}
pvals <- pvals[!duplicated(pvals$`TMB Adjusted P-value`), ]
pvals$`PD-L1 Adjusted P-value`[pvals$variable == gene] <- p.adjust(
    pvals$`PD-L1 Adjusted P-value`[pvals$variable == gene],
    method = "bonferroni"
)
pvals$`TMB Adjusted P-value`[pvals$variable == gene] <- p.adjust(
    pvals$`TMB Adjusted P-value`[pvals$variable == gene],
    method = "bonferroni"
)
pvals$`Alteration group 1` <- gsub(
    "Non-guideline indicated", "Other", pvals$`Alteration group 1`
)
pvals$`Alteration group 2` <- gsub(
    "Non-guideline indicated", "Other", pvals$`Alteration group 2`
)

wb <- createWorkbook()
addWorksheet(wb, "KRAS mutations")
writeData(wb, "KRAS mutations", pvals, keepNA = TRUE, colNames = TRUE)
setColWidths(
    wb, "KRAS mutations",
    cols = seq_len(ncol(pvals)),
    widths = c(6, 30, 30, 21, 21)
)
addStyle(
    wb, "KRAS mutations",
    cols = seq_len(ncol(pvals)), rows = 1,
    gridExpand = TRUE, style = bold, stack = TRUE
)
addStyle(
    wb, "KRAS mutations",
    cols = seq_len(ncol(pvals)),
    rows = c(1, 2, (nrow(pvals) + 2)), gridExpand = TRUE,
    style = horizontal_border_med, stack = TRUE
)
convertNum(pvals, wb, "KRAS mutations", TRUE)
saveWorkbook(
    wb, "Output/Tables/Testing_diff_in_TMB_PD_L1_KRAS_drivers.xlsx",
    overwrite = TRUE
)

pvals$PD_L1_labels <- ifelse(
    pvals$`PD-L1 Adjusted P-value` < 0.0001, "****",
    ifelse(
        pvals$`PD-L1 Adjusted P-value` < 0.001, "***",
        ifelse(
            pvals$`PD-L1 Adjusted P-value` < 0.01, "**",
            ifelse(pvals$`PD-L1 Adjusted P-value` < 0.05, "*", "ns")
        )
    )
)
pvals$TMB_labels <- ifelse(
    pvals$`TMB Adjusted P-value` < 0.0001, "****",
    ifelse(
        pvals$`TMB Adjusted P-value` < 0.001, "***",
        ifelse(
            pvals$`TMB Adjusted P-value` < 0.01, "**",
            ifelse(pvals$`TMB Adjusted P-value` < 0.05, "*", "ns")
        )
    )
)

# combine data for plotting
plot.data <- data.frame(
    variable = c(plot.data$variable, totals$variable),
    status_grouped = c(plot.data$status_grouped, totals$status_grouped),
    PD_L1_IHC_22C3_interp = c(plot.data$PD_L1_IHC_22C3_interp, totals$PD_L1_IHC_22C3_interp),
    N = c(rep(NA, nrow(plot.data)), totals$N),
    value = c(plot.data$value, totals$value),
    plot = c(
        rep("Log transformed TMB (Mut/Mb)", nrow(plot.data)),
        rep("PD-L1 TPS level", nrow(totals))
    )
)
plot.data$status_grouped <- gsub(
    "Non-guideline indicated", "Other", plot.data$status_grouped
)
plot.data$status_grouped <- factor(
    plot.data$status_grouped,
    levels = c(
        sort(unique(plot.data$status_grouped)[
            !(unique(plot.data$status_grouped) %in%
                c(
                    unique(grep("Other.*guideline", plot.data$status_grouped, value = TRUE)),
                    grep(
                        "Other.*alterations",
                        unique(grep(
                            "Other.*guideline", plot.data$status_grouped,
                            value = TRUE, invert = TRUE
                        )),
                        value = TRUE
                    ),
                    "No driver gene alterations detected"
                ))
        ]),
        unique(grep("Other.*guideline", plot.data$status_grouped, value = TRUE)),
        grep(
            "Other.*alterations",
            unique(grep(
                "Other.*guideline", plot.data$status_grouped,
                value = TRUE, invert = TRUE
            )),
            value = TRUE
        ),
        "No driver gene alterations detected"
    )
)

# plot distributions of TMB and PD-L1 for prevalent KRAS mutations
if (nrow(pvals[pvals$Gene == gene & pvals$TMB_labels != "ns", ]) > 0) {
    g1 <- ggplot(
        plot.data[
            plot.data$variable == gene & plot.data$plot == "Log transformed TMB (Mut/Mb)",
        ],
        aes(y = log(value + 1), x = status_grouped)
    ) +
        geom_violin(alpha = 0.5, scale = "width", fill = "grey50", color = "black") +
        geom_boxplot(width = 0.3, fill = "white", color = "black", outlier.size = 0) +
        geom_point(color = "black", size = 3.5) +
        geom_hline(yintercept = log(10 + 1), color = "black", linetype = "dashed") +
        geom_text(
            data = summ.stats[summ.stats$Gene == gene, ],
            aes(x = `Alteration group`, label = round(`Median TMB`, 1)), y = -0.5, size = 8
        ) +
        geom_signif(
            data = data.frame(
                pvals[pvals$Gene == gene & pvals$TMB_labels != "ns", ],
                y = sapply(
                    seq(1, nrow(pvals[pvals$Gene == gene & pvals$TMB_labels != "ns", ]), 1) / 4,
                    function(x) {
                        x + max(log(
                            plot.data$value[plot.data$variable == gene &
                                plot.data$plot == "Log transformed TMB (Mut/Mb)"] + 1
                        ))
                    }
                ),
                check.names = FALSE
            ),
            aes(
                xmin = `Alteration group 1`, xmax = `Alteration group 2`, y_position = y,
                annotations = TMB_labels
            ),
            manual = TRUE, tip_length = 0, textsize = 10, vjust = 0.8
        ) +
        scale_y_continuous(
            breaks = c(0, 1, 2, 3, 4, 5),
            labels = c(
                0,
                paste("1 (", round(exp(1) - 1, 1), ")", sep = ""),
                paste("2 (", round(exp(2) - 1, 1), ")", sep = ""),
                paste("3 (", round(exp(3) - 1, 1), ")", sep = ""),
                paste("4 (", round(exp(4) - 1, 1), ")", sep = ""),
                paste("5 (", round(exp(5) - 1, 1), ")", sep = "")
            )
        ) +
        coord_cartesian(ylim = c(-0.5, 5), clip = "off") +
        theme_classic() +
        labs(y = "Log transformed TMB\n(Mut/Mb)", x = "") +
        theme(
            text = element_text(size = 22),
            axis.text.x = element_blank(),
            axis.ticks.x = element_blank(),
            axis.title.x = element_blank(),
            axis.title.y = element_text(vjust = 1)
        )
} else {
    g1 <- ggplot(
        plot.data[
            plot.data$variable == gene & plot.data$plot == "Log transformed TMB (Mut/Mb)",
        ],
        aes(y = log(value + 1), x = status_grouped)
    ) +
        geom_violin(alpha = 0.5, scale = "width", fill = "grey50", color = "black") +
        geom_boxplot(width = 0.3, fill = "white", color = "black", outlier.size = 0) +
        geom_point(color = "black", size = 3.5) +
        geom_hline(yintercept = log(10 + 1), color = "black", linetype = "dashed") +
        geom_text(
            data = summ.stats[summ.stats$Gene == gene, ],
            aes(x = `Alteration group`, label = round(`Median TMB`, 1)), y = -0.5, size = 8
        ) +
        scale_y_continuous(
            breaks = c(0, 1, 2, 3, 4, 5),
            labels = c(
                0,
                paste("1 (", round(exp(1) - 1, 1), ")", sep = ""),
                paste("2 (", round(exp(2) - 1, 1), ")", sep = ""),
                paste("3 (", round(exp(3) - 1, 1), ")", sep = ""),
                paste("4 (", round(exp(4) - 1, 1), ")", sep = ""),
                paste("5 (", round(exp(5) - 1, 1), ")", sep = "")
            )
        ) +
        coord_cartesian(ylim = c(-0.5, 5), clip = "off") +
        theme_classic() +
        labs(y = "Log transformed TMB\n(Mut/Mb)", x = "") +
        theme(
            text = element_text(size = 22),
            axis.text.x = element_blank(),
            axis.ticks.x = element_blank(),
            axis.title.x = element_blank(),
            axis.title.y = element_text(vjust = 1)
        )
}
if (nrow(pvals[pvals$Gene == gene & pvals$PD_L1_labels != "ns", ]) > 0) {
    g2 <- ggplot(
        plot.data[plot.data$variable == gene & plot.data$plot == "PD-L1 TPS level", ],
        aes(y = value, x = status_grouped, fill = PD_L1_IHC_22C3_interp)
    ) +
        geom_bar(stat = "identity", color = "black", alpha = 0.5) +
        geom_label(
            aes(
                label = paste(N, " (", round(value, 2) * 100, "%)", sep = ""),
                color = PD_L1_IHC_22C3_interp
            ),
            position = position_stack(vjust = 0.5), size = 8, fill = "white"
        ) +
        geom_signif(
            inherit.aes = FALSE,
            data = data.frame(
                pvals[pvals$Gene == gene & pvals$PD_L1_labels != "ns", ],
                y = sapply(
                    seq(1, nrow(pvals[pvals$Gene == gene & pvals$PD_L1_labels != "ns", ]), 1) / 20,
                    function(x) {
                        x + 1.05
                    }
                ),
                check.names = FALSE
            ),
            aes(
                xmin = `Alteration group 1`, xmax = `Alteration group 2`, y_position = y,
                annotations = PD_L1_labels
            ),
            manual = TRUE, tip_length = 0, textsize = 10, vjust = 0.8
        ) +
        scale_fill_manual(
            labels = c(
                levels(plot.data$PD_L1_IHC_22C3_interp)[1:2],
                expression("High (" >= "50%)")
            ),
            values = c("blue", "purple", "deeppink")
        ) +
        scale_color_manual(values = rep("black", 3)) +
        scale_y_continuous(labels = scales::percent, breaks = seq(0, 1, 0.25)) +
        scale_x_discrete(labels = scales::wrap_format(5)) +
        coord_cartesian(ylim = c(0, 1), clip = "off") +
        theme_classic() +
        labs(
            y = paste(
                "Tumors with PD-L1 level<br>",
                "<b style='color:#0000FF'>blue</b> = Negative<br>",
                "<b style='color:#A020F0'>purple</b> = Low<br>",
                "<b style='color:#FF1493'>pink</b> = High",
                sep = ""
            ),
            x = ""
        ) +
        guides(
            fill = "none",
            color = "none"
        ) +
        theme(
            text = element_text(size = 20),
            axis.title.y = element_markdown(size = 22),
            axis.text.x = element_text(size = 22)
        )
} else {
    g2 <- ggplot(
        plot.data[plot.data$variable == gene & plot.data$plot == "PD-L1 TPS level", ],
        aes(y = value, x = status_grouped, fill = PD_L1_IHC_22C3_interp)
    ) +
        geom_bar(stat = "identity", color = "black", alpha = 0.5) +
        geom_label(
            aes(
                label = paste(N, " (", round(value, 2) * 100, "%)", sep = ""),
                color = PD_L1_IHC_22C3_interp
            ),
            position = position_stack(vjust = 0.5), size = 8, fill = "white"
        ) +
        scale_fill_manual(
            labels = c(
                levels(plot.data$PD_L1_IHC_22C3_interp)[1:2],
                expression("High (" >= "50%)")
            ),
            values = c("blue", "purple", "deeppink")
        ) +
        scale_color_manual(values = rep("black", 3)) +
        scale_y_continuous(labels = scales::percent, breaks = seq(0, 1, 0.25)) +
        scale_x_discrete(labels = scales::wrap_format(5)) +
        coord_cartesian(ylim = c(0, 1), clip = "off") +
        theme_classic() +
        labs(
            y = paste(
                "Tumors with PD-L1 level<br>",
                "<b style='color:#0000FF'>blue</b> = Negative<br>",
                "<b style='color:#A020F0'>purple</b> = Low<br>",
                "<b style='color:#FF1493'>pink</b> = High",
                sep = ""
            ),
            x = ""
        ) +
        guides(
            fill = "none",
            color = "none"
        ) +
        theme(
            text = element_text(size = 22),
            axis.title.y = element_markdown(size = 22),
            axis.text.x = element_text(size = 22)
        )
}
g <- ggarrange(
    NULL, g1, NULL, g2,
    nrow = 4, ncol = 1,
    heights = c(
        (nrow(pvals[pvals$Gene == gene & pvals$TMB_labels != "ns", ]) / 2) / 10, 1,
        (nrow(pvals[pvals$Gene == gene & pvals$PD_L1_labels != "ns", ]) / 2) / 10, 1.5
    ),
    align = "v", labels = c("", "A", "", "B"), font.label = list(size = 30), label.y = 1.5
)
ggsave(
    "Output/Plots/TMB_PDL1_dist_KRAS_drivers.tiff", g,
    device = "tiff", bg = "white",
    width = 20, height = 20, limitsize = FALSE
)
g
Figure 3: TMB and PD-L1 22C3 IHC TPS in KRAS mutated NSCLC

2.5 Co-occurrence and network analysis

2.5.1 Co-occurrence and network analysis of gene-level alterations and associations with immunotherapy markers

### Co-occurrence analysis

# extract data needed for alteration matrix, isolating unique instances
plot.data <- unique(variant.df[, c("de_id", "variant_type", "gene")])

# create binary matrix of genes and detected alterations for each patient
alt.matrix <- t(
    reshape2::acast(
        plot.data, de_id ~ gene,
        value.var = "variant_type",
        fun.aggregate = function(x) {
            ifelse(length(x) > 0, 1, 0)
        }
    )
)

# calculate number of alterations per gene
plot.data <- ddply(plot.data, .(gene, variant_type), summarize, var_count = length(de_id))

# convert alteration counts to frequencies using appropriate denominators
for (j in names(table(plot.data$variant_type))) {
    total.patients <- ifelse(
        j == "CNV",
        sum(!(patient.df$DNA_CNV %in% c("Fail", "Not Performed"))),
        ifelse(
            j == "Fusion",
            sum(!(patient.df$RNA_Rearrangement %in% c("Fail", "Not Performed"))),
            sum(!(patient.df$DNA_SNV %in% c("Fail", "Not Performed")))
        )
    )
    plot.data$var_freq[plot.data$variant_type == j] <-
        plot.data$var_count[plot.data$variant_type == j] / total.patients
}

# calculate combined frequencies of alterations per gene and sort
# from highest to lowest
gene.freq <- ddply(
    plot.data, .(gene), summarize,
    freq = sum(var_freq),
    perc = paste(round(sum(var_freq), 4) * 100, "%", sep = "")
)
gene.freq <- gene.freq[order(gene.freq$freq, decreasing = TRUE), ]

# remove genes that have <1% prevalence when rounded using calculated gene frequencies
alt.matrix <- alt.matrix[gene.freq$gene[round(gene.freq$freq, 3) >= 0.01], ]

# sort alteration matrix based on mutual exclusivity of alterations
alt.matrix <- memoSort(alt.matrix)

# create color vector to color genes that have tier 1 or 2 alterations
gene.col <- ifelse(
    gene.freq$gene %in%
        therapy.df$marker_component_marker[grep("Guideline", therapy.df$amp_cap_asco_tier)],
    "red",
    ifelse(
        gene.freq$gene %in%
            therapy.df$marker_component_marker[grep("Clinical", therapy.df$amp_cap_asco_tier)],
        "orange", "black"
    )
)
names(gene.col) <- gene.freq$gene

# calculate pairwise odds ratios, 95% CI, and p-values from Fisher's exact test
# and calculate the proportion of patient overlap between two genes out of all
# the patients with a detected alterations for those genes
co.occur.out <- alterationCoOcurr(t(alt.matrix))
results <- co.occur.out$result.summary

# write out results for significant alteration co-occurrences
sub.results <- rbind(
    data.frame(
        `Feature 1` = "Gene 1",
        `Feature 2` = "Gene 2",
        `Proportion of co-occurrence` = "Proportion of co-occurrence",
        OR = "OR",
        `OR 95%CI lower` = "OR 95%CI lower",
        `OR 95%CI upper` = "OR 95%CI upper",
        `P-value` = "P-value",
        `Adjusted P-value` = "FDR q-value",
        check.names = FALSE
    ),
    data.frame(
        `Feature 1` = "Significantly co-occurring alterations (OR > 1 and FDR q-value < 0.05)",
        `Feature 2` = "",
        `Proportion of co-occurrence` = "",
        OR = "",
        `OR 95%CI lower` = "",
        `OR 95%CI upper` = "",
        `P-value` = "",
        `Adjusted P-value` = "",
        check.names = FALSE
    ),
    results[
        results$OR > 1 &
            results$`Adjusted P-value` < 0.05,
    ],
    data.frame(
        `Feature 1` = "Significantly exclusive alterations (OR < 1 and FDR q-value < 0.05)",
        `Feature 2` = "",
        `Proportion of co-occurrence` = "",
        OR = "",
        `OR 95%CI lower` = "",
        `OR 95%CI upper` = "",
        `P-value` = "",
        `Adjusted P-value` = "",
        check.names = FALSE
    ),
    results[
        results$OR < 1 &
            results$`Adjusted P-value` < 0.05,
    ]
)

wb <- createWorkbook()
addWorksheet(wb, "Co-occurrence")
writeData(
    wb, "Co-occurrence", sub.results,
    keepNA = TRUE, colNames = FALSE
)
setColWidths(
    wb, "Co-occurrence",
    cols = seq_len(ncol(sub.results)),
    widths = rep(15, ncol(sub.results))
)
addStyle(
    wb, "Co-occurrence",
    cols = seq_len(ncol(sub.results)), rows = 1:(nrow(sub.results) + 1),
    gridExpand = TRUE, style = left_just, stack = TRUE
)
mergeCells(
    wb, "Co-occurrence",
    cols = c(1:ncol(sub.results)),
    rows = grep("co-occurring", sub.results[, 1])
)
mergeCells(
    wb, "Co-occurrence",
    cols = seq_len(ncol(sub.results)),
    rows = grep("exclusive", sub.results[, 1])
)
addStyle(
    wb, "Co-occurrence",
    cols = seq_len(ncol(sub.results)),
    rows = c(
        1, grep("co-occurring", sub.results[, 1]),
        grep("exclusive", sub.results[, 1])
    ),
    gridExpand = TRUE, style = bold, stack = TRUE
)
addStyle(
    wb, "Co-occurrence",
    cols = seq_len(ncol(sub.results)),
    rows = c(1, 2, (nrow(sub.results) + 1)), gridExpand = TRUE,
    style = horizontal_border_med, stack = TRUE
)
addStyle(
    wb, "Co-occurrence",
    cols = seq_len(ncol(sub.results)),
    rows = c(
        grep("co-occurring", sub.results[, 1]) + 1,
        grep("exclusive", sub.results[, 1]),
        grep("exclusive", sub.results[, 1]) + 1
    ),
    gridExpand = TRUE, style = horizontal_border_thin, stack = TRUE
)
convertNum(sub.results, wb, "Co-occurrence", FALSE)
saveWorkbook(
    wb, "Output/Tables/Co_occurrence_results_alterations.xlsx",
    overwrite = TRUE
)

# convert odds ratios < 1 to negative inverse to make color scales more equal among
# values and cap odds ratios to reduce effect of extreme outliers on color scale
co.occur.out$or.matrix[
    co.occur.out$or.matrix < 1 & !is.na(co.occur.out$or.matrix)
] <- -(1 / co.occur.out$or.matrix[
    co.occur.out$or.matrix < 1 & !is.na(co.occur.out$or.matrix)
])
co.occur.out$or.matrix[co.occur.out$or.matrix > 10] <- 10
co.occur.out$or.matrix[co.occur.out$or.matrix < -10] <- -10

# mask half of the matrix for plotting
co.occur.out$or.matrix[upper.tri(co.occur.out$or.matrix)] <- NA
co.occur.out$padj.matrix[upper.tri(co.occur.out$padj.matrix)] <- NA
co.occur.out$prop.matrix[upper.tri(co.occur.out$prop.matrix)] <- NA

# combine odds ratios and p-values for plotting
plot.data <- merge(
    merge(
        reshape2::melt(
            co.occur.out$or.matrix[
                rev(rownames(co.occur.out$or.matrix)),
                rev(colnames(co.occur.out$or.matrix))
            ]
        ),
        reshape2::melt(
            co.occur.out$padj.matrix[
                rev(rownames(co.occur.out$padj.matrix)),
                rev(colnames(co.occur.out$padj.matrix))
            ]
        ),
        by = c(1, 2),
        suffix = c(".or", ".p"),
        sort = FALSE
    ),
    reshape2::melt(
        co.occur.out$prop.matrix[
            rev(rownames(co.occur.out$prop.matrix)),
            rev(colnames(co.occur.out$prop.matrix))
        ]
    ),
    by = c(1, 2),
    sort = FALSE
)

# create heatmap of results
g1 <- ggplot(plot.data, aes(y = Var1, x = Var2, fill = value.or)) +
    geom_tile(color = "white") +
    geom_tile(
        color = "white",
        fill = ifelse(plot.data$value == 0 &
            !is.na(plot.data$value) &
            plot.data$value.p >= 0.05, "white", NA)
    ) +
    geom_point(
        aes(shape = ifelse(value == 0 & !is.na(value),
            "No overlapping\nalterations", ""
        )),
        size = 2, color = "black"
    ) +
    geom_point(aes(size = ifelse(value.p < 0.05, "FDR q-value\n< 0.05", ""))) +
    scale_y_discrete(position = "right") +
    scale_fill_gradient2(
        low = "#075AFF", mid = "white", high = "#FF0000", midpoint = 0,
        name = "Fisher's exact test\nodds ratio",
        labels = c(expression("" <= 0.1), "0.2", "1", "5", expression("" >= 10)),
        na.value = "white"
    ) +
    scale_size_manual(
        values = c(2, NA), na.value = NA, breaks = "FDR q-value\n< 0.05",
        name = NULL
    ) +
    scale_shape_manual(
        values = c(4, NA), na.value = NA, breaks = "No overlapping\nalterations",
        name = NULL
    ) +
    coord_fixed() +
    theme(
        text = element_text(size = 12),
        axis.text.x = element_text(
            angle = 90, hjust = 1, vjust = 0.5,
            color = rev(gene.col[rownames(alt.matrix)])
        ),
        axis.title.x = element_blank(),
        axis.text.y = element_text(color = rev(gene.col[rownames(alt.matrix)])),
        axis.title.y = element_blank(),
        legend.position = c(0.1, 0.5),
        legend.text.align = 0,
        legend.key = element_blank(),
        legend.title = element_text(size = 20),
        legend.text = element_text(size = 20),
        panel.grid.major = element_blank(),
        panel.grid.minor = element_blank(),
        panel.border = element_blank(),
        panel.background = element_rect(color = "white"),
        plot.margin = margin(1, 1, 1, 40)
    )

### Network analysis

# get results with FDR < 0.05 to include in network analysis
net.data <- plot.data[plot.data$value.p < 0.05 & !is.na(plot.data$value.p), ]

# create variable with reverted ORs
net.data$revert.or <- net.data$value.or
net.data$revert.or[net.data$revert.or < 0] <-
    1 / abs(net.data$revert.or[net.data$revert.or < 0])

# create column to designate effect direction, then make absolute OR variable
net.data$or.direction <- ifelse(net.data$value.or < 0, "-", "+")
net.data$abs.value.or <- abs(net.data$value.or)

# import data into igraph and make ORs the weight
net.igraph <- graph_from_data_frame(net.data, directed = FALSE)
E(net.igraph)$weight <- net.data$revert.or

# calculate degree and weighted degree of each node
V(net.igraph)$degree <- degree(net.igraph)
V(net.igraph)$weighted.degree <- strength(net.igraph, weights = E(net.igraph)$abs.value.or)

# detect sub-communities within the network
set.seed(1234)
net.clust <- cluster_infomap(
    net.igraph,
    e.weights = E(net.igraph)$revert.or,
    v.weights = V(net.igraph)$weighted.degree
)
V(net.igraph)$cluster <- net.clust$membership

# mark which genes had Tier 1 or 2 alterations detected
V(net.igraph)$amp_cap_asco_tier <- ifelse(
    V(net.igraph)$name %in%
        therapy.df$marker_component_marker[grep("Guideline", therapy.df$amp_cap_asco_tier)],
    "Guideline indicated",
    ifelse(
        V(net.igraph)$name %in%
            therapy.df$marker_component_marker[grep("Clinical", therapy.df$amp_cap_asco_tier)],
        "Clinical trial or therapy in other tumor type", "Neither, but known\npathogenic"
    )
)

# output network data
net.results <- data.frame(
    `Feature` = V(net.igraph)$name,
    Degree = V(net.igraph)$degree,
    `Weighted degree` = V(net.igraph)$weighted.degree,
    `Network cluster` = V(net.igraph)$cluster,
    check.names = FALSE
)

wb <- createWorkbook()
addWorksheet(wb, "Network results")
writeData(wb, "Network results", net.results, keepNA = TRUE, colNames = TRUE)
setColWidths(
    wb, "Network results",
    cols = seq_len(ncol(net.results)),
    widths = c(rep(15, ncol(net.results)))
)
addStyle(
    wb, "Network results",
    cols = seq_len(ncol(net.results)), rows = 1,
    gridExpand = TRUE, style = bold, stack = TRUE
)
addStyle(
    wb, "Network results",
    cols = seq_len(ncol(net.results)),
    rows = c(1, 2, (nrow(net.results) + 2)), gridExpand = TRUE,
    style = horizontal_border_med, stack = TRUE
)
convertNum(net.results, wb, "Network results", TRUE)
saveWorkbook(wb, "Output/Tables/Network_results.xlsx", overwrite = TRUE)

# plot network using force atlas 2 algorithm to place nodes
set.seed(1234)
layout <- ForceAtlas2::layout.forceatlas2(
    data.frame(from = net.data$Var1, to = net.data$Var2, weights = net.data$revert.or),
    directed = FALSE, iterations = 2000, k = 1000, gravity = 400, plotstep = 0
)
V(net.igraph)$amp_cap_asco_tier[
    V(net.igraph)$amp_cap_asco_tier == "Clinical trial or therapy in other tumor type"
] <- "Clinical trial or therapy\nin other tumor type"
g2 <- ggraph(net.igraph, layout = as.matrix(layout[, -1])) +
    geom_edge_fan(aes(color = value.or), alpha = 0.75, width = 1.5) +
    geom_node_point(
        aes(
            color = factor(as.character(cluster), levels = paste(1:max(cluster))),
            shape = factor(
                amp_cap_asco_tier,
                levels = c(
                    "Guideline indicated",
                    "Clinical trial or therapy\nin other tumor type",
                    "Neither, but known\npathogenic"
                )
            )
        ),
        size = log(V(net.igraph)$weighted.degree) * 6, alpha = 0.5
    ) +
    geom_node_point(
        aes(
            color = factor(as.character(cluster), levels = paste(1:max(cluster))),
            shape = factor(
                amp_cap_asco_tier,
                levels = c(
                    "Guideline indicated",
                    "Clinical trial or therapy\nin other tumor type",
                    "Neither, but known\npathogenic"
                )
            )
        ),
        size = log(V(net.igraph)$weighted.degree) * 5
    ) +
    geom_node_text(aes(label = name), size = 4) +
    geom_node_text(aes(label = name), size = 4) +
    scale_edge_colour_gradient2(
        low = "#075AFF", mid = "white", high = "#FF0000", midpoint = 0,
        name = "Fisher's exact test\nodds ratio (edges)",
        breaks = c(-10, -5, 0, 5, 10),
        labels = c(expression("" <= 0.1), "0.2", "1", "5", expression("" >= 10)),
        na.value = "white"
    ) +
    scale_shape_manual(values = c(16, 15, 17)) +
    guides(
        color = guide_legend(
            title = "Algorithm detected\ncluster (nodes)",
            override.aes = list(size = 8)
        ),
        shape = guide_legend(
            title = "Clinical evidence (nodes)",
            override.aes = list(size = 8),
            keyheight = 3
        ),
        size = "none"
    ) +
    theme_void() +
    theme(
        legend.title = element_text(size = 20),
        legend.text = element_text(size = 20),
        legend.spacing.y = unit(10, "mm")
    )

# output both co-occurrence results and network
g <- ggarrange(
    NULL, g1, NULL, g2,
    ncol = 4, widths = c(-0.01, 0.8, -0.01, 1.2), labels = c("", "A", "", "B"),
    font.label = list(size = 35)
)
ggsave(
    "Output/Plots/Co_occurrence_network_plots.tiff", g,
    device = "tiff", bg = "white", width = 30, height = 13
)
g
Figure 4: Gene-level co-occurrence and network analysis of genomic alterations in NSCLC

2.5.2 Association of gene clusters with histology, TMB, and PD-L1 IHC

### Co-occurring cluster associations

# test/plot association of each cluster with histology, TMB, and PD-L1
plot.list <- list()
for (clust in unique(net.results$`Network cluster`)) {
    # calculate number of variants in the cluster and get gene combos
    n.alts <- data.frame(
        table(variant.df$de_id[
            variant.df$gene %in% net.results$Feature[net.results$`Network cluster` == clust] &
                (variant.df$gene %in% results$`Feature 1`[results$OR > 1] |
                    variant.df$gene %in% results$`Feature 2`[results$OR > 1])
        ])
    )
    for (j in n.alts$Var1) {
        n.alts$gene.combos[n.alts$Var1 == j] <- paste(
            unique(variant.df$gene[
                variant.df$de_id == j &
                    variant.df$gene %in% net.results$Feature[net.results$`Network cluster` == clust]
            ]),
            collapse = " + "
        )
    }
    plot.data <- merge(
        patient.df[, c("de_id", "histology_group", "TMB", "PD_L1_IHC_22C3_interp")],
        n.alts,
        by = 1, all.x = TRUE
    )
    plot.data$Freq[plot.data$Freq > 2] <- 2
    plot.data$Freq[
        is.na(plot.data$Freq) &
            plot.data$de_id %in% patient.df$de_id[
                patient.df$DNA_SNV %in% c("Pass", "Limited") |
                    patient.df$DNA_CNV %in% c("Pass", "Limited") |
                    patient.df$RNA_Rearrangements %in% c("Pass", "Limited")
            ]
    ] <- 0
    plot.data <- plot.data[!is.na(plot.data$Freq), ]
    plot.data$histology_group[grep("Squamous", plot.data$histology_group)] <- "Squamous cell"

    # plot breakdown of most common gene combinations
    plot.data.prop <- ddply(
        plot.data[
            !is.na(plot.data$gene.combos) &
                plot.data$gene.combos %in%
                    names(table(plot.data$gene.combos)[table(plot.data$gene.combos) > 3]),
        ],
        .(gene.combos), summarize,
        count = length(de_id),
        prop = length(de_id) / nrow(plot.data[!is.na(plot.data$gene.combos), ])
    )
    plot.data.prop <- plot.data.prop[order(plot.data.prop$prop, decreasing = TRUE), ]
    g1 <- ggplot() +
        annotate(
            "text",
            x = 1, y = 1,
            size = ifelse(nrow(plot.data.prop) <= 15, 7.5, 130 / nrow(plot.data.prop)),
            label = paste(
                plot.data.prop$gene.combo, " (N=", plot.data.prop$count,
                ", ", round(plot.data.prop$prop, 3) * 100, "%)",
                sep = "", collapse = "\n"
            )
        ) +
        theme_void()

    # test/plot association with histology group
    plot.data.prop <- rbind(
        data.frame(
            Freq = "No\nvariants",
            ddply(
                plot.data[plot.data$Freq == "0", ], .(histology_group), summarize,
                count = length(de_id),
                prop = length(de_id) / nrow(plot.data[plot.data$Freq == "0", ])
            )
        ),
        data.frame(
            Freq = "One\nvariant",
            ddply(
                plot.data[plot.data$Freq == "1", ], .(histology_group), summarize,
                count = length(de_id),
                prop = length(de_id) / nrow(plot.data[plot.data$Freq == "1", ])
            )
        ),
        data.frame(
            Freq = "Two or\nmore variants",
            ddply(
                plot.data[plot.data$Freq == "2", ], .(histology_group), summarize,
                count = length(de_id),
                prop = length(de_id) / nrow(plot.data[plot.data$Freq == "2", ])
            )
        )
    )
    pvals <- data.frame()
    for (j in unique(plot.data.prop$Freq)) {
        for (k in unique(plot.data.prop$Freq)) {
            if (j != k) {
                fish.p <- fisher.test(
                    matrix(
                        c(
                            plot.data.prop$count[plot.data.prop$Freq == j],
                            plot.data.prop$count[plot.data.prop$Freq == k]
                        ),
                        nrow = 2
                    )
                )$p.value
                pvals <- rbind(
                    pvals,
                    data.frame(
                        `Alteration group 1` = j, `Alteration group 2` = k, P = fish.p,
                        check.names = FALSE
                    )
                )
            }
        }
    }
    pvals <- pvals[c(1, 2, 4), ]
    pvals$P <- p.adjust(pvals$P, method = "bonferroni")
    g2 <- ggplot(plot.data.prop, aes(x = Freq, y = prop, fill = histology_group)) +
        geom_bar(stat = "identity", color = "black", alpha = 0.75) +
        geom_label(
            aes(
                label = paste(count, "\n(", round(prop, 2) * 100, "%)", sep = ""),
                color = histology_group
            ),
            position = position_fill(0.5), size = 7, fill = "white"
        ) +
        geom_signif(
            xmin = c(1, 1, 2), xmax = c(2, 3, 3), y_position = c(1.05, 1.15, 1.25),
            annotations = ifelse(pvals$P < 1E-4, "<0.0001", round(pvals$P, 3)),
            textsize = 7, tip_length = 0
        ) +
        scale_y_continuous(labels = scales::percent, breaks = seq(0, 1, 0.25)) +
        scale_fill_manual(values = c("black", "grey50")) +
        scale_color_manual(values = rep("black", 2)) +
        theme_classic() +
        theme(
            text = element_text(size = 24),
            axis.title.x = element_blank(),
            axis.title.y = element_markdown(size = 24),
            legend.title = element_blank(),
            legend.position = "top"
        ) +
        guides(fill = "none", color = "none") +
        labs(y = paste(
            "Tumors with histology<br>",
            "<b style='color:#000000'>black</b> = Adenocarcinoma<br>",
            "<b style='color:#7F7F7F'>gray</b> = Squamous cell<br>",
            sep = ""
        ))

    # test/plot association with TMB
    summ.stats <- ddply(plot.data, .(Freq), summarize, median = median(na.omit(TMB)))
    g3 <- ggplot(
        plot.data[!is.na(plot.data$TMB), ],
        aes(x = as.character(Freq), y = log(TMB + 1))
    ) +
        geom_violin(alpha = 0.5, scale = "width", fill = "grey50", color = "black") +
        geom_boxplot(width = 0.3, fill = "white", color = "black", outlier.size = 0) +
        geom_point(color = "black", size = 3.5) +
        geom_hline(yintercept = log(10 + 1), color = "black", linetype = "dashed") +
        geom_text(data = summ.stats, aes(label = round(median, 1), y = -0.2), size = 7) +
        stat_pwc(
            y.position = max(na.omit(log(plot.data$TMB + 1))) + 0.25,
            method = "t_test", p.adjust.method = "bonferroni", label = "p.adj.format",
            step.increase = 0.1, label.size = 7, , tip.length = 0
        ) +
        scale_x_discrete(labels = c("No\nvariants", "One\nvariant", "Two or\nmore variants")) +
        scale_y_continuous(
            breaks = c(0, 1, 2, 3, 4, 5),
            labels = c(
                0,
                paste("1 (", round(exp(1) - 1, 1), ")", sep = ""),
                paste("2 (", round(exp(2) - 1, 1), ")", sep = ""),
                paste("3 (", round(exp(3) - 1, 1), ")", sep = ""),
                paste("4 (", round(exp(4) - 1, 1), ")", sep = ""),
                paste("5 (", round(exp(5) - 1, 1), ")", sep = "")
            )
        ) +
        theme_classic() +
        theme(
            text = element_text(size = 24),
            axis.title.x = element_blank()
        ) +
        labs(y = "Log transformed TMB (Mut/Mb)")

    # test/plot association with PD-L1 IHC
    plot.data.prop <- rbind(
        data.frame(
            Freq = "No\nvariants",
            ddply(
                plot.data[plot.data$Freq == "0", ], .(PD_L1_IHC_22C3_interp), summarize,
                count = length(de_id),
                prop = length(de_id) / nrow(plot.data[plot.data$Freq == "0", ])
            )
        ),
        data.frame(
            Freq = "One\nvariant",
            ddply(
                plot.data[plot.data$Freq == "1", ], .(PD_L1_IHC_22C3_interp), summarize,
                count = length(de_id),
                prop = length(de_id) / nrow(plot.data[plot.data$Freq == "1", ])
            )
        ),
        data.frame(
            Freq = "Two or\nmore variants",
            ddply(
                plot.data[plot.data$Freq == "2", ], .(PD_L1_IHC_22C3_interp), summarize,
                count = length(de_id),
                prop = length(de_id) / nrow(plot.data[plot.data$Freq == "2", ])
            )
        )
    )
    plot.data.prop <- plot.data.prop[!is.na(plot.data.prop$PD_L1_IHC_22C3_interp), ]
    pvals <- data.frame()
    for (j in unique(plot.data.prop$Freq)) {
        for (k in unique(plot.data.prop$Freq)) {
            if (j != k) {
                chisq.p <- chisq.test(
                    matrix(
                        c(
                            plot.data.prop$count[plot.data.prop$Freq == j],
                            plot.data.prop$count[plot.data.prop$Freq == k]
                        ),
                        nrow = 3
                    )
                )$p.value
                pvals <- rbind(
                    pvals,
                    data.frame(
                        `Alteration group 1` = j, `Alteration group 2` = k, P = chisq.p,
                        check.names = FALSE
                    )
                )
            }
        }
    }
    pvals <- pvals[c(1, 2, 4), ]
    pvals$P <- p.adjust(pvals$P, method = "bonferroni")
    g4 <- ggplot(plot.data.prop, aes(x = Freq, y = prop, fill = PD_L1_IHC_22C3_interp)) +
        geom_bar(stat = "identity", color = "black", alpha = 0.5) +
        geom_label(
            aes(
                label = paste(count, "\n(", round(prop, 2) * 100, "%)", sep = ""),
                color = PD_L1_IHC_22C3_interp
            ),
            position = position_fill(0.5), size = 7, fill = "white"
        ) +
        geom_signif(
            xmin = c(1, 1, 2), xmax = c(2, 3, 3), y_position = c(1.05, 1.15, 1.25),
            annotations = ifelse(pvals$P < 1E-4, "<0.0001", round(pvals$P, 3)),
            textsize = 7, tip_length = 0
        ) +
        scale_y_continuous(labels = scales::percent, breaks = seq(0, 1, 0.25)) +
        scale_fill_manual(values = c("blue", "purple", "deeppink")) +
        scale_color_manual(values = rep("black", 3)) +
        theme_classic() +
        theme(
            text = element_text(size = 24),
            axis.title.x = element_blank(),
            axis.title.y = element_markdown(size = 24),
            legend.title = element_blank(),
            legend.position = "top"
        ) +
        guides(fill = "none", color = "none") +
        labs(
            y = paste(
                "Tumors with PD-L1 level<br>",
                "<b style='color:#0000FF'>blue</b> = Negative<br>",
                "<b style='color:#A020F0'>purple</b> = Low<br>",
                "<b style='color:#FF1493'>pink</b> = High",
                sep = ""
            )
        )

    # output plots
    g <- ggarrange(
        g1, NULL, g2, NULL, g3, NULL, g4,
        nrow = 1, ncol = 7,
        widths = c(1, 0, 1, 0, 1, 0, 1)
    )
    plot.list[[length(plot.list) + 1]] <- g
    names(plot.list)[length(plot.list)] <- clust
}

# output plots of specific clusters with significant associations
g <- ggarrange(
    plotlist = plot.list[c(2, 3, 4, 8, 9, 11, 12, 13)],
    nrow = 4, ncol = 2, heights = rep(1, 4), widths = rep(1, 2),
    labels = c("A", "B", "C", "D", "E", "F", "G", "H"), font.label = list(size = 40)
)
ggsave(
    "Output/Plots/Gene_module_associations_sigassocs.tiff", g,
    device = "tiff", bg = "white",
    width = 50, height = 30, limitsize = FALSE
)
g

# combine and save other plots
g <- ggarrange(
    plotlist = plot.list[c(1, 5, 6, 7, 10)],
    nrow = 3, ncol = 2, heights = rep(1, 4), widths = rep(1, 4),
    labels = c("A", "B", "C", "D", "E"), font.label = list(size = 40)
)
ggsave(
    "Output/Plots/Gene_module_associations_others.tiff", g,
    device = "tiff", bg = "white",
    width = 50, height = 25, limitsize = FALSE
)
g
Figure 5: Associations between variant number in gene clusters and histology, TMB, and PD-L1 IHC
Figure 6: Associations between variant number in gene clusters and histology, TMB, and PD-L1 IHC

3 R session information

R version 4.4.1 (2024-06-14 ucrt)
Platform: x86_64-w64-mingw32/x64
Running under: Windows 10 x64 (build 19045)

Matrix products: default


locale:
[1] LC_COLLATE=English_United States.utf8 
[2] LC_CTYPE=English_United States.utf8   
[3] LC_MONETARY=English_United States.utf8
[4] LC_NUMERIC=C                          
[5] LC_TIME=English_United States.utf8    

time zone: America/New_York
tzcode source: internal

attached base packages:
[1] stats     graphics  grDevices utils     datasets  methods   base     

other attached packages:
 [1] ggraph_2.2.1     igraph_2.0.3     GGally_2.2.1     plotly_4.10.4   
 [5] ggtext_0.1.2     ggpubr_0.6.0     ggplot2_3.5.1    kableExtra_1.4.0
 [9] foreach_1.5.2    openxlsx_4.2.5.2 tibble_3.2.1     readxl_1.4.3    
[13] plyr_1.8.9      

loaded via a namespace (and not attached):
  [1] gridExtra_2.3        rlang_1.1.4          magrittr_2.0.3      
  [4] compiler_4.4.1       mgcv_1.9-1           systemfonts_1.1.0   
  [7] vctrs_0.6.5          reshape2_1.4.4       stringr_1.5.1       
 [10] shape_1.4.6.1        pkgconfig_2.0.3      fastmap_1.2.0       
 [13] backports_1.5.0      labeling_0.4.3       utf8_1.2.4          
 [16] rmarkdown_2.27       markdown_1.13        nloptr_2.1.1        
 [19] ragg_1.3.2           purrr_1.0.2          jomo_2.7-6          
 [22] xfun_0.45            glmnet_4.1-8         logistf_1.26.0      
 [25] cachem_1.1.0         jsonlite_1.8.8       pan_1.9             
 [28] tweenr_2.0.3         broom_1.0.6          R6_2.5.1            
 [31] stringi_1.8.4        RColorBrewer_1.1-3   rpart_4.1.23        
 [34] boot_1.3-30          car_3.1-2            cellranger_1.1.0    
 [37] Rcpp_1.0.12          iterators_1.0.14     knitr_1.47          
 [40] nnet_7.3-19          Matrix_1.7-0         splines_4.4.1       
 [43] tidyselect_1.2.1     rstudioapi_0.16.0    abind_1.4-5         
 [46] yaml_2.3.8           viridis_0.6.5        codetools_0.2-20    
 [49] lattice_0.22-6       withr_3.0.0          ForceAtlas2_0.1     
 [52] evaluate_0.24.0      survival_3.7-0       ggstats_0.6.0       
 [55] polyclip_1.10-6      zip_2.3.1            xml2_1.3.6          
 [58] pillar_1.9.0         carData_3.0-5        mice_3.16.0         
 [61] generics_0.1.3       munsell_0.5.1        commonmark_1.9.1    
 [64] scales_1.3.0         minqa_1.2.7          glue_1.7.0          
 [67] lazyeval_0.2.2       tools_4.4.1          data.table_1.15.4   
 [70] lme4_1.1-35.4        ggsignif_0.6.4       graphlayouts_1.1.1  
 [73] tidygraph_1.3.1      cowplot_1.1.3        grid_4.4.1          
 [76] tidyr_1.3.1          colorspace_2.1-0     nlme_3.1-165        
 [79] formula.tools_1.7.1  ggforce_0.4.2        cli_3.6.3           
 [82] textshaping_0.4.0    fansi_1.0.6          viridisLite_0.4.2   
 [85] svglite_2.1.3        dplyr_1.1.4          gtable_0.3.5        
 [88] rstatix_0.7.2        digest_0.6.36        operator.tools_1.6.3
 [91] ggrepel_0.9.5        htmlwidgets_1.6.4    farver_2.1.2        
 [94] memoise_2.0.1        htmltools_0.5.8.1    lifecycle_1.0.4     
 [97] httr_1.4.7           mitml_0.4-5          gridtext_0.1.5      
[100] MASS_7.3-61