---
title: 'NSCLC TruSight Oncology 500 (TSO 500) Landscape Study 2024'
author: 'Zachary D Wallen'
date: today
date-format: long
engine: knitr
execute:
echo: true
warning: false
error: false
format:
html:
toc: true
toc-depth: 5
number-sections: true
code-tools: true
embed-resources: true
---
# Setting up the R environment
## Create needed functions
```{r functions}
# Function to convert numbers from strings back to numbers in excel output
# parameters:
# df - the data.frame being outputted into excel
# wb - the excel workbook object
# sheet - the excel sheet name
# colnames - does df have column names, TRUE/FALSE
convertNum <- function(df, wb, sheet, colnames) {
library(foreach)
cn <- expand.grid(seq_len(ncol(df)), seq_len(nrow(df)))[, 1]
rn <- expand.grid(seq_len(ncol(df)), seq_len(nrow(df)))[, 2]
trash <- foreach(cn = cn, rn = rn) %dopar% {
if (!is.numeric(df[rn, cn]) && !is.na(as.numeric(as.character(df[rn, cn])))) {
row.offset <- ifelse(colnames, 1, 0)
if (df[rn, cn] == Inf) {
openxlsx::writeData(wb, sheet, "Inf", startCol = cn, startRow = row.offset + rn)
} else {
openxlsx::writeData(wb, sheet, as.numeric(as.character(df[rn, cn])),
startCol = cn, startRow = row.offset + rn
)
}
}
}
}
# Function to sort a matrix for better visualization of mutual exclusivity
# of alterations across samples
# Retrieved from https://gist.github.com/armish/564a65ab874a770e2c26, and modified
# to work with character, binary, or logical matrices where NA, '', 0, or FALSE
# represents the absence of alterations in a gene
# parameters:
# M - an alteration by sample matrix
memoSort <- function(M) {
geneOrder <- sort(
rowSums(!is.na(M) & M != "" & M > 0 & M != FALSE),
decreasing = TRUE, index.return = TRUE
)$ix
scoreCol <- function(x) {
score <- 0
for (i in 1:length(x)) {
if (!is.na(x[i]) & x[i] != "" & x[i] > 0 & x[i] != FALSE) {
score <- score + 2^(length(x) - i)
}
}
return(score)
}
scores <- apply(M[geneOrder, ], 2, scoreCol)
sampleOrder <- sort(scores, decreasing = TRUE, index.return = TRUE)$ix
return(M[geneOrder, sampleOrder])
}
# Function to perform association testing between each level of a categorical variable and
# one or more binary variables describing different group strata. Performs Fisher's
# exact test when no covariates are specified, and Firth's logistic regression (via the
# logistf functionin the logistf package) when covariates are specified.
# parameters:
# var - column name of the target categorical variable to test
# strata - a vector of categorical variable column names denoting different strata
# ref - a vector specifying the reference level for each group specified with strata
# (only used when performing Firth's logisitic regression)
# covars - a vector of variable names to include as covariates in testing
# trend.var - column name of a variable to use for performing a test of trend across strata
# data.df - a data.frame containing the variables to be included in testing
catStratTest <- function(var, strata, ref = NULL, covars = NULL, trend.var = NULL, data.df) {
# prepare lists for output
n <- list()
stats <- list()
or <- list()
p <- list()
# perform testing and summary statistic calculations for each test variables
for (i in strata) {
i <- as.character(i)
# calculate frequency table and total N
df <- table(data.df[, var], data.df[, i])
n[[i]] <- colSums(df)
# for each level of the target variable...
for (j in rownames(df)) {
j <- as.character(j)
# calculate frequency table for presence and absence of current level
df2 <- table(ifelse(data.df[, var] == j, 0, 1), data.df[, i])
# calculate summary statistics
stats[[i]][j] <- list(apply(df2, 2, function(x) {
paste(x, " ", "(", round(x / sum(x) * 100, 1), "%", ")", sep = "")
}))
if (!is.null(covars)) {
# perform Firth's logistic regression
res <- logistf::logistf(
ifelse(data.df[, var] == j, 1, 0) ~ .,
data = data.frame(
x = ifelse(data.df[, i] == ref[which(strata == i)], " ", data.df[, i]),
data.df[, covars, drop = FALSE]
)
)
or[[i]][j] <- list(paste(
round(exp(res$coefficients[2]), 2),
" [", round(exp(res$ci.lower[2]), 2), "; ",
round(exp(res$ci.upper[2]), 2), "]",
sep = ""
))
p[[i]][j] <- list(formatC(res$prob[2], digits = 1, format = "e"))
} else {
# perform Fisher's exact test
res <- fisher.test(df2)
or[[i]][j] <- list(paste(
round(res$estimate, 2),
" [", round(res$conf.int[1], 2), "; ",
round(res$conf.int[2], 2), "]",
sep = ""
))
p[[i]][j] <- list(formatC(res$p.value, digits = 1, format = "e"))
}
}
}
# if provided, perform test for trend for each variable level using the trend variable
if (!is.null(trend.var)) {
ptrend <- c()
for (j in seq_along(names(table(data.df[, var])))) {
if (!is.null(covars)) {
ptrend <- c(ptrend, as.vector(
anova(
logistf::logistf(
ifelse(
data.df[, var] == names(table(data.df[, var]))[j], 1, 0
) ~ .,
data = data.df[, c(trend.var, covars)]
),
logistf::logistf(
ifelse(
data.df[, var] == names(table(data.df[, var]))[j], 1, 0
) ~ .,
data = data.df[, covars, drop = FALSE]
),
method = "PLR"
)$pval
))
names(ptrend)[j] <- names(table(data.df[, var]))[j]
} else {
ptrend <- c(ptrend, as.vector(
anova(
logistf::logistf(
ifelse(
data.df[, var] == names(table(data.df[, var]))[j], 1, 0
) ~ data.df[, trend.var]
),
logistf::logistf(
ifelse(
data.df[, var] == names(table(data.df[, var]))[j], 1, 0
) ~ 1
),
method = "PLR"
)$pval
))
names(ptrend)[j] <- names(table(data.df[, var]))[j]
}
}
} else {
ptrend <- NULL
}
return(list(n = n, stats = stats, or = or, p = p, ptrend = ptrend))
}
# Function to perform association testing between a quantitative variable and
# one or more binary variables describing different group strata. Performs linear
# regression with or without adjustment for covariates.
# parameters:
# var - column name of the target quantitative variable to test
# strata - a vector of categorical variable column names denoting different strata
# ref - a vector specifying the reference level for each group specified with strata
# covars - a vector of variable names to include as covariates in testing
# trend.var - column name of a variable to use for performing a test of trend across strata
# data.df - a data.frame containing the variables to be included in testing
quantStratTest <- function(var, strata, ref, covars = NULL, trend.var = NULL, data.df) {
# prepare lists for output
n <- list()
stats <- list()
beta <- list()
p <- list()
# perform testing and summary statistic calculations for each test variables
for (i in strata) {
i <- as.character(i)
# calculate summary statistics
df <- ddply(
data.df, i,
.fun = function(x) {
c(
n = length(na.omit(x[, var])),
mean = mean(na.omit(x[, var])),
sd = sd(na.omit(x[, var]))
)
}
)
n[[i]] <- df$n
names(n[[i]]) <- df[, i]
stats[[i]] <- paste(round(df$mean, 1), round(df$sd, 1), sep = "±")
names(stats[[i]]) <- df[, i]
# perform linear regression with or without covariates
if (!is.null(covars)) {
res <- lm(
data.df[, var] ~ .,
data = data.frame(
x = ifelse(data.df[, i] == ref[which(strata == i)], " ", data.df[, i]),
data.df[, covars, drop = FALSE]
)
)
beta[[i]] <- paste(
round(summary(res)$coefficients[2, 1], 2),
" [", round(confint(res)[2, 1], 2), "; ", round(confint(res)[2, 2], 2), "]",
sep = ""
)
p[[i]] <- formatC(summary(res)$coefficients[2, 4], digits = 1, format = "e")
} else {
res <- lm(
data.df[, var] ~ .,
data = data.frame(
x = ifelse(data.df[, i] == ref[which(strata == i)], " ", data.df[, i])
)
)
beta[[i]] <- paste(
round(summary(res)$coefficients[2, 1], 2),
" [", round(confint(res)[2, 1], 2), "; ", round(confint(res)[2, 2], 2), "]",
sep = ""
)
p[[i]] <- formatC(summary(res)$coefficients[2, 4], digits = 1, format = "e")
}
}
# if provided, perform test for trend using the trend variable
if (!is.null(trend.var)) {
if (!is.null(covars)) {
ptrend <- anova(
lm(data.df[, var] ~ ., data = data.df[, c(trend.var, covars)]),
lm(data.df[, var] ~ ., data = data.df[, covars, drop = FALSE])
)$`Pr(>F)`[2]
} else {
ptrend <- anova(
lm(data.df[, var] ~ ., data = data.df[, trend.var, drop = FALSE]),
lm(data.df[, var] ~ 1)
)$`Pr(>F)`[2]
}
} else {
ptrend <- NULL
}
return(list(n = n, stats = stats, beta = beta, p = p, ptrend = ptrend))
}
# Function to perform pairwise Fisher exact tests or Firth's penalized logistic regression
# on an alteration by sample binary matrix for detection of co-occurrence between alterations.
# Will also calculate the proportion of overlapping samples each pair of alteration were detected in together.
# Will perform Fisher exact test if no covariates provided and logistic regression if they
# are provided.
# parameters:
# M - a sample by alteration binary matrix where '1' denotes a detected alteration
# and '0' denotes absence of detected alteration for each sample
# covars - NULL (default) or data.frame of covariates to include in the analysis. Should
# have same rownames (sample IDs) as 'M'.
alterationCoOcurr <- function(M, covars = NULL) {
# create matrices for outputs
or.matrix <- matrix(nrow = ncol(M), ncol = ncol(M), dimnames = list(colnames(M), colnames(M)))
or.lower.matrix <- matrix(nrow = ncol(M), ncol = ncol(M), dimnames = list(colnames(M), colnames(M)))
or.upper.matrix <- matrix(nrow = ncol(M), ncol = ncol(M), dimnames = list(colnames(M), colnames(M)))
p.matrix <- matrix(nrow = ncol(M), ncol = ncol(M), dimnames = list(colnames(M), colnames(M)))
prop.matrix <- matrix(nrow = ncol(M), ncol = ncol(M), dimnames = list(colnames(M), colnames(M)))
# if covariates given, make sure covariate data lines up with alteration matrix
if (!is.null(covars)) {
if (
!(sum(rownames(M) %in% rownames(covars)) != nrow(M)) |
!(sum(rownames(covars) %in% rownames(M)) != nrow(covars))
) {
stop("samples included in feature and sample data are different")
}
covars <- covars[rownames(M), ]
}
# begin calculations and testing
for (x in seq_len(ncol(M))) {
for (y in seq_len(ncol(M))) {
# create testing data for pair of alterations
if (!is.null(covars)) {
test.data <- data.frame(y = M[, y], x = M[, x], covars)
} else {
test.data <- data.frame(y = M[, y], x = M[, x])
}
# perform testing
if (
colnames(M)[x] == colnames(M)[y] |
sum(grepl("TMB", colnames(M)[x]) + grepl("TMB", colnames(M)[y])) == 2 |
sum(grepl("PD.*?L1", colnames(M)[x]) + grepl("PD.*?L1", colnames(M)[y])) == 2
) {
# give null results if testing the same variable or a level of immune marker
# with another level of the same immune marker
or.matrix[x, y] <- 1
or.lower.matrix[x, y] <- NA
or.upper.matrix[x, y] <- NA
p.matrix[x, y] <- 1
prop.matrix[x, y] <- 1
} else if (
sum(test.data$x[!is.na(test.data$x) & !is.na(test.data$y)]) == 0 |
sum(test.data$y[!is.na(test.data$x) & !is.na(test.data$y)]) == 0
) {
# give null results for alterations with no overlapping results due to
# missing data
or.matrix[x, y] <- 1
or.lower.matrix[x, y] <- NA
or.upper.matrix[x, y] <- NA
p.matrix[x, y] <- 1
prop.matrix[x, y] <- 0
} else {
# perform test
if (!is.null(covars)) {
res <- logistf::logistf(y ~ ., data = test.data)
} else {
res <- with(test.data, fisher.test(y, x))
}
# add results to matrices
if (!is.null(covars)) {
or.matrix[x, y] <- exp(res$coefficients["x"])
or.lower.matrix[x, y] <- exp(res$ci.lower["x"])
or.upper.matrix[x, y] <- exp(res$ci.upper["x"])
p.matrix[x, y] <- res$prob["x"]
} else {
or.matrix[x, y] <- res$estimate
or.lower.matrix[x, y] <- res$conf.int[1]
or.upper.matrix[x, y] <- res$conf.int[2]
p.matrix[x, y] <- res$p.value
}
prop.matrix[x, y] <- sum(na.omit(M[, x] + M[, y] == 2)) /
sum(na.omit(M[, x] + M[, y] > 0))
}
}
}
# multiple test correct p-values for unique tests
padj.matrix <- p.matrix
padj.matrix[lower.tri(padj.matrix)] <- p.adjust(
padj.matrix[lower.tri(padj.matrix)],
method = "BH"
)
padj.matrix[upper.tri(padj.matrix)] <- p.adjust(
padj.matrix[upper.tri(padj.matrix)],
method = "BH"
)
# combine results into one data.frame
res <- data.frame(
`Feature 1` = t(combn(rownames(or.matrix), 2))[, 1],
`Feature 2` = t(combn(rownames(or.matrix), 2))[, 2],
`Proportion of co-occurrence` = prop.matrix[lower.tri(prop.matrix)],
OR = or.matrix[lower.tri(or.matrix)],
`OR 95%CI lower` = or.lower.matrix[lower.tri(or.lower.matrix)],
`OR 95%CI upper` = or.upper.matrix[lower.tri(or.upper.matrix)],
`P-value` = p.matrix[lower.tri(p.matrix)],
`Adjusted P-value` = padj.matrix[lower.tri(padj.matrix)],
check.names = FALSE
)
res <- res[order(res$`Adjusted P-value`), ]
# return data
return(list(
result.summary = res,
or.matrix = or.matrix,
or.lower.matrix = or.lower.matrix,
or.upper.matrix = or.upper.matrix,
p.matrix = p.matrix,
padj.matrix = padj.matrix,
prop.matrix = prop.matrix
))
}
```
## Load required R packages
```{r packages}
library(plyr)
library(readxl)
library(tibble)
library(openxlsx)
library(foreach)
library(kableExtra)
library(ggplot2)
library(ggpubr)
library(ggtext)
library(plotly)
library(GGally)
library(igraph)
library(ggraph)
```
## Create excel styles used for formatting output
```{r styles}
bold <- createStyle(textDecoration = "bold")
left_just <- createStyle(halign = "left", valign = "center", wrapText = TRUE)
right_just <- createStyle(halign = "right", valign = "center", wrapText = TRUE)
center <- createStyle(halign = "center", valign = "center", wrapText = TRUE)
horizontal_border_med <- createStyle(border = "top", borderStyle = "medium")
horizontal_border_thin <- createStyle(border = "top", borderStyle = "thin")
percentage <- createStyle(numFmt = "PERCENTAGE")
```
# Statistical analysis
## Import and prepare data
```{r import-data}
# read data
patient.df <- data.frame(read_xlsx(
"Data/NSCLC_TSO500_Landscape_Study_de_id_data.xlsx",
sheet = 1,
na = c("NULL", "NA", "N/A")
))
variant.df <- data.frame(read_xlsx(
"Data/NSCLC_TSO500_Landscape_Study_de_id_data.xlsx",
sheet = 2,
na = c("NULL", "NA", "N/A")
))
therapy.df <- data.frame(read_xlsx(
"Data/NSCLC_TSO500_Landscape_Study_de_id_data.xlsx",
sheet = 3,
na = c("NULL", "NA", "N/A")
))
# remove data for patients without adenocarcinoma or squamous cell carcinoma
# and for hypermutated patients
patient.df <- patient.df[
patient.df$histology_group %in% c("Adenocarcinoma", "Squamous cell carcinoma") &
!(patient.df$de_id %in% patient.df$de_id[patient.df$TMB > 200]),
]
therapy.df <- therapy.df[therapy.df$de_id %in% patient.df$de_id, ]
variant.df <- variant.df[variant.df$de_id %in% patient.df$de_id, ]
# make sure factor levels of age bins are in the correct order
patient.df$patient_age_cat <- factor(
patient.df$patient_age_cat,
levels = c(
sort(unique(patient.df$patient_age_cat))[-1],
sort(unique(patient.df$patient_age_cat))[1]
)
)
# order tissue specimen status
patient.df$tissue_status <- factor(
patient.df$tissue_status,
levels = c("Primary", "Advanced", "Metastatic")
)
# collapse sub-groups of clinical stages
patient.df$stage[grep("Stage IV|Metastatic", patient.df$stage)] <- "Stage IV"
patient.df$stage[grep("Stage III", patient.df$stage)] <- "Stage III"
patient.df$stage[
grepl("Stage II", patient.df$stage) & patient.df$stage != "Stage III"
] <- "Stage II"
patient.df$stage[
grepl("Stage I", patient.df$stage) &
patient.df$stage != "Stage IV" &
patient.df$stage != "Stage III" &
patient.df$stage != "Stage II"
] <- "Stage I"
patient.df$stage[
is.na(patient.df$stage) | !grepl("Stage I", patient.df$stage)
] <- "Unknown"
# re-create TMB interpretation with very high/high/low, log transform TMB,
# and make sure to mask TMB score for those that failed testing
patient.df$TMB_interp <- NA
patient.df$TMB_interp[patient.df$TMB >= 20] <- "Very high (≥20)"
patient.df$TMB_interp[patient.df$TMB < 20 & patient.df$TMB >= 10] <- "High (10-19)"
patient.df$TMB_interp[patient.df$TMB < 10] <- "Not high (<10)"
patient.df$TMB_interp[
patient.df$de_id %in% patient.df$de_id[patient.df$DNA_TMB %in% c("Fail", "Not Performed")]
] <- NA
patient.df$TMB[
patient.df$de_id %in% patient.df$de_id[patient.df$DNA_TMB %in% c("Fail", "Not Performed")]
] <- NA
patient.df$TMB_interp <- factor(
patient.df$TMB_interp,
levels = c("Not high (<10)", "High (10-19)", "Very high (≥20)")
)
# re-create PD-L1 with high/low/negative and make sure to mask PD-L1 score for those
# that failed testing
patient.df$PD_L1_IHC_22C3_interp <- NA
patient.df$PD_L1_IHC_22C3_interp[
patient.df$PD_L1_IHC_22C3_result >= 50
] <- "High (≥50%)"
patient.df$PD_L1_IHC_22C3_interp[
patient.df$PD_L1_IHC_22C3_result < 50 &
patient.df$PD_L1_IHC_22C3_result >= 1
] <- "Low (1-49%)"
patient.df$PD_L1_IHC_22C3_interp[
patient.df$PD_L1_IHC_22C3_result == 0
] <- "Negative (<1%)"
patient.df$PD_L1_IHC_22C3_interp[
patient.df$de_id %in%
patient.df$de_id[patient.df$PD_L1_IHC_22C3 %in% c("Fail", "Not Performed")]
] <- NA
patient.df$PD_L1_IHC_22C3_result[
patient.df$de_id %in%
patient.df$de_id[patient.df$PD_L1_IHC_22C3 %in% c("Fail", "Not Performed")]
] <- NA
patient.df$PD_L1_IHC_22C3_interp <- factor(
patient.df$PD_L1_IHC_22C3_interp,
levels = c("Negative (<1%)", "Low (1-49%)", "High (≥50%)")
)
# recode MSI variable and make those without MSI or failed testing NA for MSI
patient.df$MSI[patient.df$MSI == "MSI_H"] <- "MSI High"
patient.df$MSI[patient.df$MSI == "MSS"] <- "Stable"
patient.df$MSI[patient.df$MSI == "FT"] <- NA
patient.df$MSI[patient.df$MSI == ""] <- NA
# recode SNV type variable to make more simple categories and add CNV and Fusion entries
variant.df$snv_type[grep("^Complex|^Nonstop|^Promoter|Unknown", variant.df$snv_type)] <- "Other SNV"
variant.df$snv_type[grep("^Deletion|^Insertion", variant.df$snv_type)] <- "Indel"
variant.df$snv_type[grep("^Splice", variant.df$snv_type)] <- "Splice site"
variant.df$snv_type[grep("^Substitution", variant.df$snv_type)] <- "Substitution"
variant.df$snv_type[variant.df$variant_type == "CNV"] <- "CNV"
variant.df$snv_type[variant.df$variant_type == "Fusion"] <- "Fusion"
# recode AMP/CAP/ASCO tier variable to only have two categories and be more clear
therapy.df$amp_cap_asco_tier[
grep("1", therapy.df$amp_cap_asco_tier)
] <- "Guideline indicated"
therapy.df$amp_cap_asco_tier[
grep("2", therapy.df$amp_cap_asco_tier)
] <- "Clinical trial or therapy in other tumor type"
# create variables to stratify patients by histology
patient.df$adeno <- ifelse(patient.df$histology_group == "Adenocarcinoma", 1, 2)
patient.df$scc <- ifelse(patient.df$histology_group == "Squamous cell carcinoma", 1, 2)
hist_strata <- c("adeno", "scc")
# add histology groups to all data
variant.df <- merge(
patient.df[, c("de_id", "histology_group")],
variant.df,
sort = FALSE
)
therapy.df <- merge(
patient.df[, c("de_id", "histology_group")],
therapy.df,
sort = FALSE
)
```
## Patient characteristics
```{r patient-summ-stats}
# create result data.frame
results <- data.frame()
results <- rbind(
results,
data.frame(
Variable = "",
N = "All patients", `Summary stats` = "",
N1 = "Adenocarcinoma", `N1 summary stats` = "",
N2 = "Squamous cell carcinoma", `N2 summary stats` = "", OR = "", P = "",
check.names = FALSE
)
)
results <- rbind(
results,
data.frame(
Variable = "Variable",
N = "N", `Summary stats` = "Summary stats",
N1 = "N", `N1 summary stats` = "Summary stats",
N2 = "N", `N2 summary stats` = "Summary stats", OR = "OR/Beta", P = "P",
check.names = FALSE
)
)
# get total number of patients
results <- rbind(
results,
data.frame(
Variable = "Total number of patients",
N = nrow(patient.df), `Summary stats` = "-",
N1 = table(patient.df$histology_group)[1], `N1 summary stats` = "-",
N2 = table(patient.df$histology_group)[2], `N2 summary stats` = "-", OR = "-", P = "-",
check.names = FALSE
)
)
### gender
# perform testing
var <- "patient_gender"
res <- catStratTest(
var = var, strata = "histology_group",
data.df = patient.df[patient.df$patient_gender != "Unspecified", ]
)
# get totals for all patients
data.df <- table(
patient.df[patient.df$patient_gender != "Unspecified", var],
rep("0", nrow(patient.df[patient.df$patient_gender != "Unspecified", ]))
)
# add results to result data.frame
results <- rbind(
results,
data.frame(
Variable = c("Gender (N, %)", rownames(data.df)),
N = c(sum(data.df), rep("", nrow(data.df))),
`Summary stats` = c(
"",
paste(
rowSums(data.df), " ", "(", round(rowSums(data.df) / sum(data.df) * 100, 1), "%", ")",
sep = ""
)
),
N1 = c(res$n[[1]][1], rep("", nrow(data.df))),
`N1 summary stats` = c("", res$stats[[1]][[1]][, 1]),
N2 = c(res$n[[1]][2], rep("", nrow(data.df))),
`N2 summary stats` = c("", res$stats[[1]][[1]][, 2]),
OR = c("", "", res$or[[1]][[2]][1]),
P = c("", "", res$p[[1]][[2]][1]),
check.names = FALSE
)
)
### age
# perform testing
var <- "patient_age"
res <- quantStratTest(
var = var, strata = "histology_group", ref = "Adenocarcinoma", data.df = patient.df
)
# add results to result data.frame
results <- rbind(
results,
data.frame(
Variable = "Age (Mean±SD)",
N = nrow(patient.df[!is.na(patient.df[, var]), ]),
`Summary stats` = paste(
round(mean(na.omit(patient.df[, var])), 1),
round(sd(na.omit(patient.df[, var])), 1),
sep = "±"
),
N1 = res$n[[1]][1],
`N1 summary stats` = res$stats[[1]][1],
N2 = res$n[[1]][2],
`N2 summary stats` = res$stats[[1]][2],
OR = res$beta[[1]],
P = res$p[[1]],
check.names = FALSE
)
)
### age group
# perform testing
var <- "patient_age_cat"
res <- catStratTest(var = var, strata = "histology_group", data.df = patient.df)
# get totals for all patients
data.df <- table(patient.df[, var], rep("0", nrow(patient.df)))
# add results to result data.frame
results <- rbind(
results,
data.frame(
Variable = c("Age group (N, %)", rownames(data.df)),
N = c(sum(data.df), rep("", nrow(data.df))),
`Summary stats` = c(
"",
paste(
rowSums(data.df), " ", "(", round(rowSums(data.df) / sum(data.df) * 100, 1), "%", ")",
sep = ""
)
),
N1 = c(res$n[[1]][1], rep("", nrow(data.df))),
`N1 summary stats` = c("", sapply(res$stats[[1]], function(x) {
x[1, 1]
})),
N2 = c(res$n[[1]][2], rep("", nrow(data.df))),
`N2 summary stats` = c("", sapply(res$stats[[1]], function(x) {
x[1, 2]
})),
OR = c("", sapply(res$or[[1]], function(x) {
x[1]
})),
P = c("", sapply(res$p[[1]], function(x) {
x[1]
})),
check.names = FALSE
)
)
### histology
# get frequencies
data.df <- table(patient.df$histology_group, patient.df$histology_group)
# add results to result data.frame
results <- rbind(
results,
data.frame(
Variable = c("Histology group (N, %)", rownames(data.df)),
N = c(sum(data.df), rep("", nrow(data.df))),
`Summary stats` = c(
"",
paste(
rowSums(data.df), " ", "(", round(rowSums(data.df) / sum(data.df) * 100, 1), "%", ")",
sep = ""
)
),
N1 = c(sum(data.df[, 1]), rep("", nrow(data.df))),
`N1 summary stats` = c(
"",
gsub(
"0 \\(0\\%\\)", "-",
paste(
data.df[, 1], " ", "(", round(data.df[, 1] / sum(data.df[, 1]) * 100, 1), "%", ")",
sep = ""
)
)
),
N2 = c(sum(data.df[, 2]), rep("", nrow(data.df))),
`N2 summary stats` = c(
"",
gsub(
"0 \\(0\\%\\)", "-",
paste(
data.df[, 2], " ", "(", round(data.df[, 2] / sum(data.df[, 2]) * 100, 1), "%", ")",
sep = ""
)
)
),
OR = c("", rep("-", nrow(data.df))),
P = c("", rep("-", nrow(data.df))),
check.names = FALSE
)
)
### tissue specimen location
# perform testing
var <- "tissue_status"
res <- catStratTest(var = var, strata = "histology_group", data.df = patient.df)
# get totals for all patients
data.df <- table(patient.df[, var], rep("0", nrow(patient.df)))
# add results to result data.frame
results <- rbind(
results,
data.frame(
Variable = c("Tissue specimen location (N, %)", rownames(data.df)),
N = c(sum(data.df), rep("", nrow(data.df))),
`Summary stats` = c(
"",
paste(
rowSums(data.df), " ", "(", round(rowSums(data.df) / sum(data.df) * 100, 1), "%", ")",
sep = ""
)
),
N1 = c(res$n[[1]][1], rep("", nrow(data.df))),
`N1 summary stats` = c("", sapply(res$stats[[1]], function(x) {
x[1, 1]
})),
N2 = c(res$n[[1]][2], rep("", nrow(data.df))),
`N2 summary stats` = c("", sapply(res$stats[[1]], function(x) {
x[1, 2]
})),
OR = c("", sapply(res$or[[1]], function(x) {
x[1]
})),
P = c("", sapply(res$p[[1]], function(x) {
x[1]
})),
check.names = FALSE
)
)
### unknown clinical stage
# get frequencies
data.df <- table(
factor(
ifelse(patient.df$stage == "Unknown",
"Unknown clinical stage (N, %)",
"Known clinical stage (N, %)"
),
levels = c("Unknown clinical stage (N, %)", "Known clinical stage (N, %)")
),
patient.df$histology_group
)
# add results to result data.frame
results <- rbind(
results,
data.frame(
Variable = rownames(data.df),
N = rep("", 2),
`Summary stats` = paste(
rowSums(data.df), " ", "(", round(rowSums(data.df) / sum(data.df) * 100, 1), "%", ")",
sep = ""
),
N1 = rep("", 2),
`N1 summary stats` = paste(
data.df[, 1], " ", "(", round(data.df[, 1] / sum(data.df[, 1]) * 100, 1), "%", ")",
sep = ""
),
N2 = rep("", 2),
`N2 summary stats` = paste(
data.df[, 2], " ", "(", round(data.df[, 2] / sum(data.df[, 2]) * 100, 1), "%", ")",
sep = ""
),
OR = rep("-", nrow(data.df)),
P = rep("-", nrow(data.df)),
check.names = FALSE
)
)
### clinical stage
# perform testing
var <- "stage"
res <- catStratTest(
var = var, strata = "histology_group", data.df = patient.df[patient.df[, var] != "Unknown", ]
)
# get totals for all patients
data.df <- table(
patient.df[patient.df[, var] != "Unknown", var],
rep("0", nrow(patient.df[patient.df[, var] != "Unknown", ]))
)
# add results to result data.frame
results <- rbind(
results,
data.frame(
Variable = c("Known clinical stage (N, %)", rownames(data.df)),
N = c(sum(data.df), rep("", nrow(data.df))),
`Summary stats` = c(
"",
paste(
rowSums(data.df), " ", "(", round(rowSums(data.df) / sum(data.df) * 100, 1), "%", ")",
sep = ""
)
),
N1 = c(res$n[[1]][1], rep("", nrow(data.df))),
`N1 summary stats` = c("", sapply(res$stats[[1]], function(x) {
x[1, 1]
})),
N2 = c(res$n[[1]][2], rep("", nrow(data.df))),
`N2 summary stats` = c("", sapply(res$stats[[1]], function(x) {
x[1, 2]
})),
OR = c("", sapply(res$or[[1]], function(x) {
x[1]
})),
P = c("", sapply(res$p[[1]], function(x) {
x[1]
})),
check.names = FALSE
)
)
### Number of detected alterations
# create data for testing
test.data <- rbind(
merge(
patient.df[
!(patient.df$DNA_SNV %in% c("Fail", "Not Performed") |
patient.df$DNA_CNV %in% c("Fail", "Not Performed") |
patient.df$RNA_Rearrangements %in% c("Fail", "Not Performed")),
c("de_id", "histology_group")
],
ddply(variant.df, .(de_id), summarize, var_count = length(de_id)),
by = "de_id"
),
data.frame(
patient.df[
!(patient.df$de_id %in% variant.df$de_id) &
!(patient.df$DNA_SNV %in% c("Fail", "Not Performed") |
patient.df$DNA_CNV %in% c("Fail", "Not Performed") |
patient.df$RNA_Rearrangements %in% c("Fail", "Not Performed")),
c("de_id", "histology_group")
],
var_count = 0
)
)
# perform testing
var <- "var_count"
res <- quantStratTest(
var = var, strata = "histology_group", ref = "Adenocarcinoma", data.df = test.data
)
# add results to result data.frame
results <- rbind(
results,
data.frame(
Variable = "Number of detected alterations (Mean±SD)",
N = nrow(test.data[!is.na(test.data[, var]), ]),
`Summary stats` = paste(
round(mean(na.omit(test.data[, var])), 1),
round(sd(na.omit(test.data[, var])), 1),
sep = "±"
),
N1 = res$n[[1]][1],
`N1 summary stats` = res$stats[[1]][1],
N2 = res$n[[1]][2],
`N2 summary stats` = res$stats[[1]][2],
OR = res$beta[[1]],
P = res$p[[1]],
check.names = FALSE
)
)
### Tier 1 or 2 genomic alterations
# create data for testing
test.data <- data.frame(
patient.df,
`Guideline indicated` = ifelse(
patient.df$de_id %in% therapy.df$de_id[
grepl("Guideline", therapy.df$amp_cap_asco_tier) &
therapy.df$marker_component_marker_type %in% c("SNV", "CNV", "Fusion")
], 0, 1
),
`Clinical trial or therapy in other tumor type` = ifelse(
patient.df$de_id %in% therapy.df$de_id[
grepl("Clinical", therapy.df$amp_cap_asco_tier) &
therapy.df$marker_component_marker_type %in% c("SNV", "CNV", "Fusion")
], 0, 1
),
`Neither above, but known pathogenic` = ifelse(
!(patient.df$de_id %in% therapy.df$de_id[
therapy.df$marker_component_marker_type %in% c("SNV", "CNV", "Fusion")
]), 0, 1
),
check.names = FALSE
)
# perform testing
var <- c(
"Guideline indicated",
"Clinical trial or therapy in other tumor type",
"Neither above, but known pathogenic"
)
res <- list()
for (i in var) {
res[[i]] <- catStratTest(var = i, strata = "histology_group", data.df = test.data)
}
# add results to result data.frame
results <- rbind(
results,
data.frame(
Variable = c("Genomic alterations with known or potential clinical significance (N, %)", var),
N = c(nrow(test.data), rep("", length(var))),
`Summary stats` = c(
"",
paste(
colSums(test.data[, var] == 0 & !is.na(test.data[, var])), " ", "(",
round(
colSums(test.data[, var] == 0 & !is.na(test.data[, var])) /
nrow(test.data) * 100, 1
), "%", ")",
sep = ""
)
),
N1 = c(sapply(res, function(x) {
x$n[[1]][1]
})[1], rep("", length(var))),
`N1 summary stats` = c("", sapply(res, function(x) {
x$stats[[1]][[1]][1, 1]
})),
N2 = c(sapply(res, function(x) {
x$n[[1]][2]
})[1], rep("", length(var))),
`N2 summary stats` = c("", sapply(res, function(x) {
x$stats[[1]][[1]][1, 2]
})),
OR = c("", sapply(res, function(x) {
x$or[[1]][[2]][1]
})),
P = c("", sapply(res, function(x) {
x$p[[1]][[2]][1]
})),
check.names = FALSE
)
)
### TMB score (mut/Mb)
# perform testing
var <- "TMB"
res <- quantStratTest(
var = var, strata = "histology_group", ref = "Adenocarcinoma", data.df = patient.df
)
# add results to result data.frame
results <- rbind(
results,
data.frame(
Variable = "TMB (Mut/Mb) (Mean±SD)",
N = nrow(patient.df[!is.na(patient.df[, var]), ]),
`Summary stats` = paste(
round(mean(na.omit(patient.df[, var])), 1),
round(sd(na.omit(patient.df[, var])), 1),
sep = "±"
),
N1 = res$n[[1]][1],
`N1 summary stats` = res$stats[[1]][1],
N2 = res$n[[1]][2],
`N2 summary stats` = res$stats[[1]][2],
OR = res$beta[[1]],
P = res$p[[1]],
check.names = FALSE
)
)
### TMB very high, high, not high
# create data for testing
total.patients <- length(patient.df[!is.na(patient.df$TMB_interp), "de_id"])
test.data <- data.frame(
patient.df,
`Very high (≥20)` = ifelse(
patient.df$de_id %in% patient.df$de_id[patient.df$TMB_interp == "Very high (≥20)"], 0,
ifelse(
!(patient.df$de_id %in% patient.df$de_id) |
patient.df$de_id %in% patient.df$de_id[is.na(patient.df$TMB_interp)], NA, 1
)
),
`High (10-19)` = ifelse(
patient.df$de_id %in% patient.df$de_id[patient.df$TMB_interp == "High (10-19)"], 0,
ifelse(
!(patient.df$de_id %in% patient.df$de_id) |
patient.df$de_id %in% patient.df$de_id[is.na(patient.df$TMB_interp)], NA, 1
)
),
`Not high (<10)` = ifelse(
patient.df$de_id %in% patient.df$de_id[patient.df$TMB_interp == "Not high (<10)"], 0,
ifelse(
!(patient.df$de_id %in% patient.df$de_id) |
patient.df$de_id %in% patient.df$de_id[is.na(patient.df$TMB_interp)], NA, 1
)
),
check.names = FALSE
)
# perform testing
var <- c("Very high (≥20)", "High (10-19)", "Not high (<10)")
res <- list()
for (i in var) {
res[[i]] <- catStratTest(var = i, strata = "histology_group", data.df = test.data)
}
# add results to result data.frame
results <- rbind(
results,
data.frame(
Variable = c("TMB level (N, %)", var),
N = c(total.patients, rep("", length(var))),
`Summary stats` = c(
"",
paste(
colSums(test.data[, var] == 0 & !is.na(test.data[, var])), " ", "(",
round(
colSums(test.data[, var] == 0 & !is.na(test.data[, var])) / total.patients * 100, 1
), "%", ")",
sep = ""
)
),
N1 = c(sapply(res, function(x) {
x$n[[1]][1]
})[1], rep("", length(var))),
`N1 summary stats` = c("", sapply(res, function(x) {
x$stats[[1]][[1]][1, 1]
})),
N2 = c(sapply(res, function(x) {
x$n[[1]][2]
})[1], rep("", length(var))),
`N2 summary stats` = c("", sapply(res, function(x) {
x$stats[[1]][[1]][1, 2]
})),
OR = c("", sapply(res, function(x) {
x$or[[1]][[2]][1]
})),
P = c("", sapply(res, function(x) {
x$p[[1]][[2]][1]
})),
check.names = FALSE
)
)
### PD-L1 score (% expressing cells)
# perform testing
var <- "PD_L1_IHC_22C3_result"
res <- quantStratTest(
var = var, strata = "histology_group", ref = "Adenocarcinoma", data.df = patient.df
)
# add results to result data.frame
results <- rbind(
results,
data.frame(
Variable = "PD-L1 22C3 TPS (Mean±SD)",
N = nrow(patient.df[!is.na(patient.df[, var]), ]),
`Summary stats` = paste(
round(mean(na.omit(patient.df[, var])), 1),
round(sd(na.omit(patient.df[, var])), 1),
sep = "±"
),
N1 = res$n[[1]][1],
`N1 summary stats` = res$stats[[1]][1],
N2 = res$n[[1]][2],
`N2 summary stats` = res$stats[[1]][2],
OR = res$beta[[1]],
P = res$p[[1]],
check.names = FALSE
)
)
### PD-L1 high, low, negative
# create data for testing
total.patients <- length(patient.df[!is.na(patient.df$PD_L1_IHC_22C3_interp), "de_id"])
test.data <- data.frame(
patient.df,
`High (≥50%)` = ifelse(
patient.df$de_id %in%
patient.df$de_id[patient.df$PD_L1_IHC_22C3_interp == "High (≥50%)"], 0,
ifelse(
!(patient.df$de_id %in% patient.df$de_id) |
patient.df$de_id %in%
patient.df$de_id[is.na(patient.df$PD_L1_IHC_22C3_interp)], NA, 1
)
),
`Low (1-49%)` = ifelse(
patient.df$de_id %in%
patient.df$de_id[patient.df$PD_L1_IHC_22C3_interp == "Low (1-49%)"], 0,
ifelse(
!(patient.df$de_id %in% patient.df$de_id) |
patient.df$de_id %in%
patient.df$de_id[is.na(patient.df$PD_L1_IHC_22C3_interp)], NA, 1
)
),
`Negative (<1%)` = ifelse(
patient.df$de_id %in%
patient.df$de_id[patient.df$PD_L1_IHC_22C3_interp == "Negative (<1%)"], 0,
ifelse(
!(patient.df$de_id %in% patient.df$de_id) |
patient.df$de_id %in%
patient.df$de_id[is.na(patient.df$PD_L1_IHC_22C3_interp)], NA, 1
)
),
check.names = FALSE
)
# perform testing
var <- c("High (≥50%)", "Low (1-49%)", "Negative (<1%)")
res <- list()
for (i in var) {
res[[i]] <- catStratTest(var = i, strata = "histology_group", data.df = test.data)
}
# add results to result data.frame
results <- rbind(
results,
data.frame(
Variable = c("PD-L1 level (N, %)", var),
N = c(total.patients, rep("", length(var))),
`Summary stats` = c(
"",
paste(
colSums(test.data[, var] == 0 & !is.na(test.data[, var])), " ", "(",
round(
colSums(test.data[, var] == 0 & !is.na(test.data[, var])) / total.patients * 100, 1
), "%", ")",
sep = ""
)
),
N1 = c(sapply(res, function(x) {
x$n[[1]][1]
})[1], rep("", length(var))),
`N1 summary stats` = c("", sapply(res, function(x) {
x$stats[[1]][[1]][1, 1]
})),
N2 = c(sapply(res, function(x) {
x$n[[1]][2]
})[1], rep("", length(var))),
`N2 summary stats` = c("", sapply(res, function(x) {
x$stats[[1]][[1]][1, 2]
})),
OR = c("", sapply(res, function(x) {
x$or[[1]][[2]][1]
})),
P = c("", sapply(res, function(x) {
x$p[[1]][[2]][1]
})),
check.names = FALSE
)
)
### MSI level high or stable
# perform testing
var <- "MSI"
res <- catStratTest(var = var, strata = "histology_group", data.df = patient.df)
# get totals for all patients
data.df <- table(patient.df[, var], rep("0", nrow(patient.df)))
# add results to result data.frame
results <- rbind(
results,
data.frame(
Variable = c("MSI level (N, %)", rownames(data.df)),
N = c(sum(data.df), rep("", nrow(data.df))),
`Summary stats` = c(
"",
paste(
rowSums(data.df), " ", "(", round(rowSums(data.df) / sum(data.df) * 100, 1), "%", ")",
sep = ""
)
),
N1 = c(res$n[[1]][1], rep("", nrow(data.df))),
`N1 summary stats` = c("", res$stats[[1]][[1]][, 1]),
N2 = c(res$n[[1]][2], rep("", nrow(data.df))),
`N2 summary stats` = c("", res$stats[[1]][[1]][, 2]),
OR = c("", "", res$or[[1]][[2]][1]),
P = c("", "", res$p[[1]][[2]][1]),
check.names = FALSE
)
)
### write out results
# create workbook
wb <- createWorkbook()
# add worksheet, write data, and format output
addWorksheet(wb, "Patient characteristics")
writeData(wb, "Patient characteristics", results, keepNA = TRUE, colNames = FALSE)
setColWidths(
wb, "Patient characteristics",
cols = seq_len(ncol(results)),
widths = c(41, 5, 15, rep(c(5, 14), 2), 19.5, 11)
)
addStyle(
wb, "Patient characteristics",
cols = seq_len(ncol(results)), rows = 1:(nrow(results) + 1),
gridExpand = TRUE, style = left_just, stack = TRUE
)
addStyle(
wb, "Patient characteristics",
cols = 1,
rows = c(
which(results$Variable == "Female"):which(results$Variable == "Male"),
which(results$Variable == "≤40"):which(results$Variable == ">90"),
which(results$Variable == "Adenocarcinoma"):which(results$Variable == "Squamous cell carcinoma"),
which(results$Variable == "Primary"):which(results$Variable == "Metastatic"),
which(results$Variable == "Stage I"):which(results$Variable == "Stage IV"),
which(results$Variable == "Guideline indicated"):which(results$Variable == "Neither above, but known pathogenic"),
which(results$Variable == "Very high (≥20)"):which(results$Variable == "Not high (<10)"),
which(results$Variable == "High (≥50%)"):which(results$Variable == "Negative (<1%)"),
which(results$Variable == "MSI High"):which(results$Variable == "Stable")
),
gridExpand = TRUE, style = center, stack = TRUE
)
mergeCells(wb, "Patient characteristics", cols = c(2, 3), rows = 1)
mergeCells(wb, "Patient characteristics", cols = c(4, 5), rows = 1)
mergeCells(wb, "Patient characteristics", cols = c(6, 7), rows = 1)
addStyle(
wb, "Patient characteristics",
cols = seq_len(ncol(results)), rows = 1:2,
gridExpand = TRUE, style = bold, stack = TRUE
)
addStyle(
wb, "Patient characteristics",
cols = seq_len(ncol(results)),
rows = c(1, 3, (nrow(results) + 1)), gridExpand = TRUE,
style = horizontal_border_med, stack = TRUE
)
addStyle(
wb, "Patient characteristics",
cols = seq_len(ncol(results)),
rows = c(
grep("Gender", results$Variable),
grep("Age \\(", results$Variable),
grep("Age group", results$Variable),
grep("Histology", results$Variable),
grep("Unknown", results$Variable),
grep("Known", results$Variable)[2],
grep("Tissue", results$Variable),
grep("Number", results$Variable),
grep("Genomic", results$Variable),
grep("TMB", results$Variable),
grep("PD-L1", results$Variable),
grep("MSI level", results$Variable)
),
gridExpand = TRUE, style = horizontal_border_thin, stack = TRUE
)
# convert numbers from strings back to numbers
convertNum(results, wb, "Patient characteristics", FALSE)
# save workbook
saveWorkbook(wb, "Output/Tables/Patient_characteristics.xlsx", overwrite = TRUE)
```
## Genomic alteration characterization
### Genomic alteration prevalence: Gene-level
```{r gene-level-prevalence}
# create result data.frame
results <- data.frame()
results <- rbind(
results,
data.frame(
Variable = "",
N = "All patients", Frequency = "",
N1 = "Adenocarcinoma", `N1 summary stats` = "",
N2 = "Squamous cell carcinoma", `N2 summary stats` = "", OR = "", P = "",
check.names = FALSE
)
)
results <- rbind(
results,
data.frame(
Variable = "Gene",
N = "N", Frequency = "Summary stats",
N1 = "N", `N1 summary stats` = "Summary stats",
N2 = "N", `N2 summary stats` = "Summary stats", OR = "OR", P = "FDR q-value",
check.names = FALSE
)
)
# get total number of patients
results <- rbind(
results,
data.frame(
Variable = "Total number of patients",
N = nrow(patient.df), Frequency = "-",
N1 = table(patient.df$histology_group)[1], `N1 summary stats` = "-",
N2 = table(patient.df$histology_group)[2], `N2 summary stats` = "-", OR = "-", P = "-",
check.names = FALSE
)
)
### SNVs
# create data for testing
total.patients <- sum(!(patient.df$DNA_SNV %in% c("Fail", "Not Performed")))
var <- c()
test.data <- patient.df
ref.data <- variant.df[variant.df$variant_type == "SNV", ]
for (i in unique(ref.data$gene)) {
test.data$gene <- ifelse(
patient.df$de_id %in% ref.data$de_id[ref.data$gene == i], 0,
ifelse(
!(patient.df$DNA_SNV %in% c("Fail", "Not Performed")), 1, NA
)
)
if (sum(na.omit(test.data$gene == 0)) / total.patients >= 0.001) {
var <- append(var, i)
colnames(test.data)[ncol(test.data)] <- i
} else {
test.data <- test.data[, -ncol(test.data)]
}
}
var <- colnames(t(memoSort(t(abs(test.data[, var] - 1)))))
test.data <- test.data[, c(colnames(patient.df), var)]
# perform testing
res <- list()
for (i in var) {
res[[i]] <- catStratTest(var = i, strata = "histology_group", data.df = test.data)
}
# add results to result data.frame
results <- rbind(
results,
data.frame(
Variable = c("Single nucleotide variants (N, %)", var),
N = c(total.patients, rep("", length(var))),
Frequency = c(
"",
paste(
colSums(test.data[, var] == 0 & !is.na(test.data[, var])), " ", "(",
round(
colSums(test.data[, var] == 0 & !is.na(test.data[, var])) / total.patients * 100, 1
), "%", ")",
sep = ""
)
),
N1 = c(sapply(res, function(x) {
x$n[[1]][1]
})[1], rep("", length(var))),
`N1 summary stats` = c("", sapply(res, function(x) {
x$stats[[1]][[1]][1, 1]
})),
N2 = c(sapply(res, function(x) {
x$n[[1]][2]
})[1], rep("", length(var))),
`N2 summary stats` = c("", sapply(res, function(x) {
x$stats[[1]][[1]][1, 2]
})),
OR = c("", sapply(res, function(x) {
x$or[[1]][[2]][1]
})),
P = c("", sapply(res, function(x) {
x$p[[1]][[2]][1]
})),
check.names = FALSE
)
)
### CNVs
# create data for testing
total.patients <- sum(!(patient.df$DNA_CNV %in% c("Fail", "Not Performed")))
var <- c()
test.data <- patient.df
ref.data <- variant.df[variant.df$variant_type == "CNV", ]
for (i in unique(ref.data$gene)) {
test.data$gene <- ifelse(
patient.df$de_id %in% ref.data$de_id[ref.data$gene == i], 0,
ifelse(
!(patient.df$DNA_CNV %in% c("Fail", "Not Performed")), 1, NA
)
)
if (sum(na.omit(test.data$gene == 0)) / total.patients >= 0.001) {
var <- append(var, i)
colnames(test.data)[ncol(test.data)] <- i
} else {
test.data <- test.data[, -ncol(test.data)]
}
}
var <- colnames(t(memoSort(t(abs(test.data[, var] - 1)))))
test.data <- test.data[, c(colnames(patient.df), var)]
# perform testing
res <- list()
for (i in var) {
res[[i]] <- catStratTest(var = i, strata = "histology_group", data.df = test.data)
}
# add results to result data.frame
results <- rbind(
results,
data.frame(
Variable = c("Copy number variants (N, %)", var),
N = c(total.patients, rep("", length(var))),
Frequency = c(
"",
paste(
colSums(test.data[, var] == 0 & !is.na(test.data[, var])), " ", "(",
round(
colSums(test.data[, var] == 0 & !is.na(test.data[, var])) / total.patients * 100, 1
), "%", ")",
sep = ""
)
),
N1 = c(sapply(res, function(x) {
x$n[[1]][1]
})[1], rep("", length(var))),
`N1 summary stats` = c("", sapply(res, function(x) {
x$stats[[1]][[1]][1, 1]
})),
N2 = c(sapply(res, function(x) {
x$n[[1]][2]
})[1], rep("", length(var))),
`N2 summary stats` = c("", sapply(res, function(x) {
x$stats[[1]][[1]][1, 2]
})),
OR = c("", sapply(res, function(x) {
x$or[[1]][[2]][1]
})),
P = c("", sapply(res, function(x) {
x$p[[1]][[2]][1]
})),
check.names = FALSE
)
)
### Fusions
# create data for testing
total.patients <- sum(!(patient.df$RNA_Rearrangements %in% c("Fail", "Not Performed")))
var <- c()
test.data <- patient.df
ref.data <- variant.df[variant.df$variant_type == "Fusion", ]
ref.data$gene[
grep("Skip", ref.data$variant)
] <- grep("Skip", ref.data$variant, value = TRUE)
for (i in unique(ref.data$gene)) {
test.data$gene <- ifelse(
patient.df$de_id %in% ref.data$de_id[ref.data$gene == i], 0,
ifelse(
!(patient.df$RNA_Rearrangements %in% c("Fail", "Not Performed")), 1, NA
)
)
if (sum(na.omit(test.data$gene == 0)) / total.patients >= 0.001) {
var <- append(var, i)
colnames(test.data)[ncol(test.data)] <- i
} else {
test.data <- test.data[, -ncol(test.data)]
}
}
var <- colnames(t(memoSort(t(abs(test.data[, var] - 1)))))
test.data <- test.data[, c(colnames(patient.df), var)]
# perform testing
res <- list()
for (i in var) {
res[[i]] <- catStratTest(var = i, strata = "histology_group", data.df = test.data)
}
# add results to result data.frame
results <- rbind(
results,
data.frame(
Variable = c("Fusions/Skipping Variants (N, %)", var),
N = c(total.patients, rep("", length(var))),
Frequency = c(
"",
paste(
colSums(test.data[, var] == 0 & !is.na(test.data[, var])), " ", "(",
round(
colSums(test.data[, var] == 0 & !is.na(test.data[, var])) / total.patients * 100, 1
), "%", ")",
sep = ""
)
),
N1 = c(sapply(res, function(x) {
x$n[[1]][1]
})[1], rep("", length(var))),
`N1 summary stats` = c("", sapply(res, function(x) {
x$stats[[1]][[1]][1, 1]
})),
N2 = c(sapply(res, function(x) {
x$n[[1]][2]
})[1], rep("", length(var))),
`N2 summary stats` = c("", sapply(res, function(x) {
x$stats[[1]][[1]][1, 2]
})),
OR = c("", sapply(res, function(x) {
x$or[[1]][[2]][1]
})),
P = c("", sapply(res, function(x) {
x$p[[1]][[2]][1]
})),
check.names = FALSE
)
)
### multiple testing correction
# adjust p-values for multiple testing
for (i in grep("P", colnames(results))) {
results[!(results[, i] %in% c("FDR q-value", "-", "")), i] <- formatC(
p.adjust(
as.numeric(results[!(results[, i] %in% c("FDR q-value", "-", "")), i]),
method = "BH"
),
digits = 1, format = "e"
)
}
### write out results
# create workbook
wb <- createWorkbook()
# add worksheet, write data, and format output
addWorksheet(wb, "Alteration prevalence")
writeData(wb, "Alteration prevalence", results, keepNA = TRUE, colNames = FALSE)
setColWidths(
wb, "Alteration prevalence",
cols = seq_len(ncol(results)),
widths = c(33, 5, 15, rep(c(5, 14), 2), 19.5, 11)
)
addStyle(
wb, "Alteration prevalence",
cols = seq_len(ncol(results)), rows = 1:(nrow(results) + 1),
gridExpand = TRUE, style = left_just, stack = TRUE
)
addStyle(
wb, "Alteration prevalence",
cols = 1,
rows = c(
(grep("^Single", results$Variable) + 1):(grep("^Copy", results$Variable) - 1),
(grep("^Copy", results$Variable) + 1):(grep("^Fusions", results$Variable) - 1),
(grep("^Fusions", results$Variable) + 1):nrow(results)
),
gridExpand = TRUE, style = center, stack = TRUE
)
mergeCells(wb, "Alteration prevalence", cols = c(2, 3), rows = 1)
mergeCells(wb, "Alteration prevalence", cols = c(4, 5), rows = 1)
mergeCells(wb, "Alteration prevalence", cols = c(6, 7), rows = 1)
addStyle(
wb, "Alteration prevalence",
cols = seq_len(ncol(results)), rows = 1:2,
gridExpand = TRUE, style = bold, stack = TRUE
)
addStyle(
wb, "Alteration prevalence",
cols = seq_len(ncol(results)),
rows = c(1, 3, (nrow(results) + 1)), gridExpand = TRUE,
style = horizontal_border_med, stack = TRUE
)
addStyle(
wb, "Alteration prevalence",
cols = seq_len(ncol(results)),
rows = c(
grep("^Single", results$Variable),
grep("^Copy", results$Variable),
grep("^Fusions", results$Variable)
),
gridExpand = TRUE, style = horizontal_border_thin, stack = TRUE
)
# convert numbers from strings back to numbers
convertNum(results, wb, "Alteration prevalence", FALSE)
# save workbook and results
saveWorkbook(
wb, "Output/Tables/Patient_alteration_prevalence_by_gene.xlsx",
overwrite = TRUE
)
gene.results <- results
```
### Genomic alteration prevalence: Variant-level
```{r variant-level-prevalence}
# create result data.frame
results <- data.frame()
results <- rbind(
results,
data.frame(
Variable = "",
Type = "",
N = "All patients", Frequency = "",
N1 = "Adenocarcinoma", `N1 summary stats` = "",
N2 = "Squamous cell carcinoma", `N2 summary stats` = "", OR = "", P = "",
check.names = FALSE
)
)
results <- rbind(
results,
data.frame(
Variable = "Variant",
Type = "SNV type",
N = "N", Frequency = "Summary stats",
N1 = "N", `N1 summary stats` = "Summary stats",
N2 = "N", `N2 summary stats` = "Summary stats", OR = "OR", P = "FDR q-value",
check.names = FALSE
)
)
# get total number of patients
results <- rbind(
results,
data.frame(
Variable = "Total number of patients",
Type = "",
N = nrow(patient.df), Frequency = "-",
N1 = table(patient.df$histology_group)[1], `N1 summary stats` = "-",
N2 = table(patient.df$histology_group)[2], `N2 summary stats` = "-", OR = "-", P = "-",
check.names = FALSE
)
)
### SNVs
# create data for testing
total.patients <- sum(!(patient.df$DNA_SNV %in% c("Fail", "Not Performed")))
var <- c()
test.data <- patient.df
ref.data <- variant.df[variant.df$variant_type == "SNV", ]
for (i in unique(ref.data$variant_go_name)) {
test.data$gene <- ifelse(
patient.df$de_id %in% ref.data$de_id[ref.data$variant_go_name == i], 0,
ifelse(
!(patient.df$DNA_SNV %in% c("Fail", "Not Performed")), 1, NA
)
)
if (sum(na.omit(test.data$gene == 0)) / total.patients >= 0.001) {
var <- append(var, i)
colnames(test.data)[ncol(test.data)] <- i
} else {
test.data <- test.data[, -ncol(test.data)]
}
}
var <- colnames(t(memoSort(t(abs(test.data[, var] - 1)))))
snv.type <- variant.df$snv_type
names(snv.type) <- variant.df$variant_go_name
test.data <- test.data[, c(colnames(patient.df), var)]
# perform testing
res <- list()
for (i in var) {
res[[i]] <- catStratTest(var = i, strata = "histology_group", data.df = test.data)
}
# add results to result data.frame
results <- rbind(
results,
data.frame(
Variable = c("Single nucleotide variants (N, %)", var),
Type = c("", snv.type[var]),
N = c(total.patients, rep("", length(var))),
Frequency = c(
"",
paste(
colSums(test.data[, var] == 0 & !is.na(test.data[, var])), " ", "(",
round(
colSums(test.data[, var] == 0 & !is.na(test.data[, var])) / total.patients * 100, 1
), "%", ")",
sep = ""
)
),
N1 = c(sapply(res, function(x) {
x$n[[1]][1]
})[1], rep("", length(var))),
`N1 summary stats` = c("", sapply(res, function(x) {
x$stats[[1]][[1]][1, 1]
})),
N2 = c(sapply(res, function(x) {
x$n[[1]][2]
})[1], rep("", length(var))),
`N2 summary stats` = c("", sapply(res, function(x) {
x$stats[[1]][[1]][1, 2]
})),
OR = c("", sapply(res, function(x) {
x$or[[1]][[2]][1]
})),
P = c("", sapply(res, function(x) {
x$p[[1]][[2]][1]
})),
check.names = FALSE
)
)
### CNVs
# create data for testing
total.patients <- sum(!(patient.df$DNA_CNV %in% c("Fail", "Not Performed")))
var <- c()
test.data <- patient.df
ref.data <- variant.df[variant.df$variant_type == "CNV", ]
for (i in unique(ref.data$variant_go_name)) {
test.data$gene <- ifelse(
patient.df$de_id %in% ref.data$de_id[ref.data$variant_go_name == i], 0,
ifelse(
!(patient.df$DNA_CNV %in% c("Fail", "Not Performed")), 1, NA
)
)
if (sum(na.omit(test.data$gene == 0)) / total.patients >= 0.001) {
var <- append(var, i)
colnames(test.data)[ncol(test.data)] <- i
} else {
test.data <- test.data[, -ncol(test.data)]
}
}
var <- colnames(t(memoSort(t(abs(test.data[, var] - 1)))))
test.data <- test.data[, c(colnames(patient.df), var)]
# perform testing
res <- list()
for (i in var) {
res[[i]] <- catStratTest(var = i, strata = "histology_group", data.df = test.data)
}
# add results to result data.frame
results <- rbind(
results,
data.frame(
Variable = c("Copy number variants (N, %)", var),
Type = "",
N = c(total.patients, rep("", length(var))),
Frequency = c(
"",
paste(
colSums(test.data[, var] == 0 & !is.na(test.data[, var])), " ", "(",
round(
colSums(test.data[, var] == 0 & !is.na(test.data[, var])) / total.patients * 100, 1
), "%", ")",
sep = ""
)
),
N1 = c(sapply(res, function(x) {
x$n[[1]][1]
})[1], rep("", length(var))),
`N1 summary stats` = c("", sapply(res, function(x) {
x$stats[[1]][[1]][1, 1]
})),
N2 = c(sapply(res, function(x) {
x$n[[1]][2]
})[1], rep("", length(var))),
`N2 summary stats` = c("", sapply(res, function(x) {
x$stats[[1]][[1]][1, 2]
})),
OR = c("", sapply(res, function(x) {
x$or[[1]][[2]][1]
})),
P = c("", sapply(res, function(x) {
x$p[[1]][[2]][1]
})),
check.names = FALSE
)
)
### Fusions
# create data for testing
total.patients <- sum(!(patient.df$RNA_Rearrangements %in% c("Fail", "Not Performed")))
var <- c()
test.data <- patient.df
ref.data <- variant.df[variant.df$variant_type == "Fusion", ]
ref.data$variant_go_name[
grep("Skip", ref.data$variant)
] <- grep("Skip", ref.data$variant, value = TRUE)
for (i in unique(ref.data$variant_go_name)) {
test.data$gene <- ifelse(
patient.df$de_id %in% ref.data$de_id[ref.data$variant_go_name == i], 0,
ifelse(
!(patient.df$RNA_Rearrangements %in% c("Fail", "Not Performed")), 1, NA
)
)
if (sum(na.omit(test.data$gene == 0)) / total.patients >= 0.001) {
var <- append(var, i)
colnames(test.data)[ncol(test.data)] <- i
} else {
test.data <- test.data[, -ncol(test.data)]
}
}
var <- colnames(t(memoSort(t(abs(test.data[, var] - 1)))))
test.data <- test.data[, c(colnames(patient.df), var)]
# perform testing
res <- list()
for (i in var) {
res[[i]] <- catStratTest(var = i, strata = "histology_group", data.df = test.data)
}
# add results to result data.frame
results <- rbind(
results,
data.frame(
Variable = c("Fusions/Skipping Variants (N, %)", var),
Type = "",
N = c(total.patients, rep("", length(var))),
Frequency = c(
"",
paste(
colSums(test.data[, var] == 0 & !is.na(test.data[, var])), " ", "(",
round(
colSums(test.data[, var] == 0 & !is.na(test.data[, var])) / total.patients * 100, 1
), "%", ")",
sep = ""
)
),
N1 = c(sapply(res, function(x) {
x$n[[1]][1]
})[1], rep("", length(var))),
`N1 summary stats` = c("", sapply(res, function(x) {
x$stats[[1]][[1]][1, 1]
})),
N2 = c(sapply(res, function(x) {
x$n[[1]][2]
})[1], rep("", length(var))),
`N2 summary stats` = c("", sapply(res, function(x) {
x$stats[[1]][[1]][1, 2]
})),
OR = c("", sapply(res, function(x) {
x$or[[1]][[2]][1]
})),
P = c("", sapply(res, function(x) {
x$p[[1]][[2]][1]
})),
check.names = FALSE
)
)
### multiple testing correction
# adjust p-values for multiple testing
for (i in grep("P", colnames(results))) {
results[!(results[, i] %in% c("FDR q-value", "-", "")), i] <- formatC(
p.adjust(
as.numeric(results[!(results[, i] %in% c("FDR q-value", "-", "")), i]),
method = "BH"
),
digits = 1, format = "e"
)
}
### write out results
# create workbook
wb <- createWorkbook()
# add worksheet, write data, and format output
addWorksheet(wb, "Alteration prevalence")
writeData(wb, "Alteration prevalence", results, keepNA = TRUE, colNames = FALSE)
setColWidths(
wb, "Alteration prevalence",
cols = seq_len(ncol(results)),
widths = c(33, 12, 5, 15, rep(c(5, 14), 2), 19.5, 11)
)
addStyle(
wb, "Alteration prevalence",
cols = seq_len(ncol(results)), rows = 1:(nrow(results) + 1),
gridExpand = TRUE, style = left_just, stack = TRUE
)
addStyle(
wb, "Alteration prevalence",
cols = 1,
rows = c(
(grep("^Single", results$Variable) + 1):(grep("^Copy", results$Variable) - 1),
(grep("^Copy", results$Variable) + 1):(grep("^Fusions", results$Variable) - 1),
(grep("^Fusions", results$Variable) + 1):nrow(results)
),
gridExpand = TRUE, style = center, stack = TRUE
)
mergeCells(wb, "Alteration prevalence", cols = c(3, 4), rows = 1)
mergeCells(wb, "Alteration prevalence", cols = c(5, 6), rows = 1)
mergeCells(wb, "Alteration prevalence", cols = c(7, 8), rows = 1)
addStyle(
wb, "Alteration prevalence",
cols = seq_len(ncol(results)), rows = 1:2,
gridExpand = TRUE, style = bold, stack = TRUE
)
addStyle(
wb, "Alteration prevalence",
cols = seq_len(ncol(results)),
rows = c(1, 3, (nrow(results) + 1)), gridExpand = TRUE,
style = horizontal_border_med, stack = TRUE
)
addStyle(
wb, "Alteration prevalence",
cols = seq_len(ncol(results)),
rows = c(
grep("^Single", results$Variable),
grep("^Copy", results$Variable),
grep("^Fusions", results$Variable)
),
gridExpand = TRUE, style = horizontal_border_thin, stack = TRUE
)
# convert numbers from strings back to numbers
convertNum(results, wb, "Alteration prevalence", FALSE)
# save workbook and results
saveWorkbook(
wb, "Output/Tables/Patient_alteration_prevalence_by_variant.xlsx",
overwrite = TRUE
)
var.results <- results
```
### Visualization of genomic alterations and alteration prevalence
```{r alteration-overview}
#| label: fig-alteration-overview
#| fig-cap: "Overview of genomic alterations detected by the TSO500 in NSCLC cases"
#| fig-width: 17
#| fig-height: 20
# extract data needed for alteration matrix, isolating unique instances
totals.list <- c()
totals.list <- append(totals.list, nrow(patient.df))
plot.data <- unique(variant.df[, c("de_id", "snv_type", "gene")])
# extract data needed for stacked bar plotting, isolating unique instances and
# calculate number of alterations per gene
plot.data <- ddply(
unique(variant.df[, c("de_id", "variant_type", "snv_type", "gene")]),
.(de_id, gene, variant_type), summarize,
snv_type = snv_type[1]
)
plot.data <- ddply(plot.data, .(gene, snv_type), summarize, var_count = length(de_id))
plot.data$snv_type <- factor(
plot.data$snv_type,
levels = c("Substitution", "Indel", "Splice site", "Other SNV", "Fusion", "CNV")
)
# convert alteration counts to frequencies using appropriate denominators
for (j in names(table(plot.data$snv_type))) {
total.patients <- ifelse(
j == "CNV", sum(!(patient.df$DNA_CNV %in% c("Fail", "Not Performed"))),
ifelse(
j == "Fusion", sum(!(patient.df$RNA_Rearrangement %in% c("Fail", "Not Performed"))),
sum(!(patient.df$DNA_SNV %in% c("Fail", "Not Performed")))
)
)
plot.data$var_freq[plot.data$snv_type == j] <-
plot.data$var_count[plot.data$snv_type == j] / total.patients
totals.list <- append(totals.list, total.patients)
}
# calculate combined frequencies of alterations per gene and sort
# from highest to lowest
gene.freq <- ddply(
plot.data, .(gene), summarize,
freq = sum(var_freq),
perc = paste(round(sum(var_freq), 4) * 100, "%", sep = "")
)
gene.freq <- gene.freq[order(gene.freq$freq, decreasing = TRUE), ]
plot.data$gene <- factor(plot.data$gene, levels = gene.freq$gene)
# create color vector to color genes that have tier 1 or 2 alterations
gene.col <- ifelse(
levels(plot.data$gene) %in%
therapy.df$marker_component_marker[grep("Guideline", therapy.df$amp_cap_asco_tier)],
"red",
ifelse(
levels(plot.data$gene) %in%
therapy.df$marker_component_marker[grep("Clinical", therapy.df$amp_cap_asco_tier)],
"orange", "black"
)
)
names(gene.col) <- levels(plot.data$gene)
# create color vector for filling in plots based on alteration type
col <- RColorBrewer::brewer.pal(12, name = "Paired")[c(1:4, 9:10)]
names(col) <- levels(plot.data$snv_type)
# tag genes with alterations that have different frequencies between histology
gene.freq$sub_diff[
gene.freq$gene %in%
sapply(
strsplit(var.results$Variable[
var.results$Type == "Substitution" &
as.numeric(var.results$P) < 0.05
], " "),
function(x) {
x[1]
}
)
] <- col["Substitution"]
gene.freq$indel_diff[
gene.freq$gene %in%
sapply(
strsplit(var.results$Variable[
var.results$Type == "Indel" &
as.numeric(var.results$P) < 0.05
], " "),
function(x) {
x[1]
}
)
] <- col["Indel"]
gene.freq$splice_diff[
gene.freq$gene %in%
sapply(
strsplit(var.results$Variable[
var.results$Type == "Splice site" &
as.numeric(var.results$P) < 0.05
], " "),
function(x) {
x[1]
}
)
] <- col["Splice site"]
gene.freq$other_diff[
gene.freq$gene %in%
sapply(
strsplit(var.results$Variable[
var.results$Type == "Other SNV" &
as.numeric(var.results$P) < 0.05
], " "),
function(x) {
x[1]
}
)
] <- col["Other SNV"]
gene.freq$cnv_diff[
gene.freq$gene %in%
gene.results$Variable[
(grep("^Copy", gene.results$Variable) + 1):(grep("^Fusions", gene.results$Variable) - 1)
][
as.numeric(gene.results$P[
(grep("^Copy", gene.results$Variable) + 1):(grep("^Fusions", gene.results$Variable) - 1)
]) < 0.05
]
] <- col["CNV"]
gene.freq$fusion_diff[
gene.freq$gene %in%
gsub(
" Exon 14 Skipping", "",
gene.results$Variable[
(grep("^Fusions", gene.results$Variable) + 1):nrow(gene.results)
][
as.numeric(gene.results$P[
(grep("^Fusions", gene.results$Variable) + 1):nrow(gene.results)
]) < 0.05
]
)
] <- col["Fusion"]
# create bar plot of patients with SNVs, CNV, and fusions/rearrangements, and
# guide-line indicated alterations and alterations with clinical trial/treatment
# in other tumor type
plot.data.sub <- ddply(
unique(variant.df[, c("de_id", "snv_type")]), .(snv_type), summarize,
prop = round(length(de_id) / totals.list[1], 2) * 100
)
plot.data.sub$snv_type <- factor(
plot.data.sub$snv_type,
levels = plot.data.sub[order(plot.data.sub$prop, decreasing = TRUE), "snv_type"]
)
g1 <- ggplot(plot.data.sub, aes(x = snv_type, y = prop, fill = snv_type)) +
geom_bar(stat = "identity", color = "black", alpha = 0.75) +
geom_text(aes(label = paste(prop, "%", sep = "")), size = 8, vjust = -1) +
scale_y_continuous(limits = c(0, max(plot.data.sub$prop) + 5)) +
scale_fill_manual(
breaks = levels(plot.data$snv_type), values = col, na.value = NA,
labels = levels(plot.data$snv_type)
) +
coord_cartesian(clip = "off") +
theme_classic() +
theme(
text = element_text(size = 24),
axis.title.x = element_blank(),
legend.position = "none"
) +
labs(y = "Percent of tumors\nwith alteration type")
plot.data.sub <- ddply(
unique(
therapy.df[
therapy.df$marker_component_marker_type %in% c("SNV", "CNV", "Fusion"),
c("de_id", "amp_cap_asco_tier")
]
),
.(amp_cap_asco_tier), summarize,
prop = round(length(de_id) / totals.list[1], 2) * 100
)
plot.data.sub$amp_cap_asco_tier <- factor(
plot.data.sub$amp_cap_asco_tier,
levels = c("Guideline indicated", "Clinical trial or therapy in other tumor type")
)
g2 <- ggplot(plot.data.sub, aes(x = amp_cap_asco_tier, y = prop, fill = amp_cap_asco_tier)) +
geom_bar(stat = "identity", color = "black", alpha = 0.75) +
geom_text(aes(label = paste(prop, "%", sep = "")), size = 8, vjust = -1) +
scale_x_discrete(labels = stringr::str_wrap(levels(plot.data.sub$amp_cap_asco_tier), 10)) +
scale_y_continuous(limits = c(0, max(plot.data.sub$prop) + 5)) +
scale_fill_manual(values = c("red", "orange"), na.value = NA) +
coord_cartesian(clip = "off") +
theme_classic() +
theme(
text = element_text(size = 24),
axis.title.x = element_blank(),
axis.title.y = element_blank(),
axis.ticks.y = element_blank(),
axis.text.y = element_blank(),
axis.line.y = element_blank(),
axis.line.x = element_line(),
legend.position = "none"
)
g1 <- ggarrange(g1, NULL, g2, nrow = 1, ncol = 3, widths = c(0.75, 0, 0.25), align = "h")
# create stacked bar plot of all genes with alterations detected in >=2% of patients
target.genes <- gene.freq$gene[round(gene.freq$freq, 2) >= 0.02]
plot.data.sub <- plot.data[plot.data$gene %in% target.genes, ]
g2 <- ggplot(plot.data.sub, aes(x = gene, y = var_freq, fill = snv_type)) +
geom_bar(position = "stack", stat = "identity") +
geom_text(
inherit.aes = FALSE,
data = gene.freq[gene.freq$gene %in% target.genes, ],
aes(x = gene, y = freq + 0.01, label = perc),
size = 6, angle = 90, hjust = 0
) +
geom_point(
inherit.aes = FALSE,
data = gene.freq[gene.freq$gene %in% target.genes, ],
aes(x = gene, y = freq + 0.01 + 0.1),
color = gene.freq$sub_diff[gene.freq$gene %in% target.genes],
size = 5
) +
geom_point(
inherit.aes = FALSE,
data = gene.freq[gene.freq$gene %in% target.genes, ],
aes(x = gene, y = freq + 0.01 + 0.11),
color = gene.freq$indel_diff[gene.freq$gene %in% target.genes],
size = 5
) +
geom_point(
inherit.aes = FALSE,
data = gene.freq[gene.freq$gene %in% target.genes, ],
aes(x = gene, y = freq + 0.01 + 0.12),
color = gene.freq$splice_diff[gene.freq$gene %in% target.genes],
size = 5
) +
geom_point(
inherit.aes = FALSE,
data = gene.freq[gene.freq$gene %in% target.genes, ],
aes(x = gene, y = freq + 0.01 + 0.12),
color = gene.freq$fusion_diff[gene.freq$gene %in% target.genes],
size = 5
) +
geom_point(
inherit.aes = FALSE,
data = gene.freq[gene.freq$gene %in% target.genes, ],
aes(x = gene, y = freq + 0.01 + 0.12),
color = gene.freq$cnv_diff[gene.freq$gene %in% target.genes],
size = 5
) +
geom_point(
inherit.aes = FALSE,
data = gene.freq[gene.freq$gene %in% target.genes, ],
aes(x = gene, y = freq + 0.01 + 0.12),
color = gene.freq$other_diff[gene.freq$gene %in% target.genes],
size = 5
) +
geom_bar(
inherit.aes = FALSE,
data = gene.freq[gene.freq$gene %in% target.genes, ],
aes(x = gene, y = freq),
position = "stack", stat = "identity", color = "grey50", fill = NA
) +
scale_discrete_manual("fill",
values = col[names(table(plot.data.sub$snv_type)
[table(plot.data.sub$snv_type) > 0])],
name = ""
) +
scale_y_continuous(limits = c(0, max(gene.freq$freq) + 0.15), expand = c(0, 0)) +
coord_cartesian(clip = "off") +
guides(fill = "none") +
theme_classic() +
theme(
text = element_text(size = 24),
axis.text.x = element_text(angle = 45, hjust = 1, color = gene.col[target.genes]),
axis.title.x = element_markdown(size = 20),
plot.margin = margin(20, 5, 5, 5)
) +
labs(
y = paste(
"Alteration prevalence\n",
"(number of SNVs, CNVs or fusions ÷\ntotal patients passing assay)"
),
x = paste("Genes with cumulative SNV, CNV, and/or fusion prevalence of 2%<br>",
"<b style='color:#E74C3C'>red</b> = Guideline-indicated alteration(s) detected<br>",
"<b style='color:#F39C12'>orange</b> = Alteration(s) detected with clinical trial or therapy in other tumor type",
sep = ""
)
)
plot.data.sub <- plot.data[
plot.data$gene %in% target.genes &
plot.data$gene %in% gene.freq$gene[round(gene.freq$freq, 2) < 0.1],
]
g2 <- g2 + annotation_custom(
grob = ggplotGrob(
ggplot(plot.data.sub, aes(x = gene, y = var_freq, fill = snv_type)) +
geom_bar(position = "stack", stat = "identity") +
geom_text(
inherit.aes = FALSE,
data = gene.freq[round(gene.freq$freq, 2) >= 0.02 &
round(gene.freq$freq, 2) < 0.1, ],
aes(x = gene, y = freq + 0.001, label = perc),
size = 6, angle = 90, hjust = 0
) +
geom_bar(
inherit.aes = FALSE,
data = gene.freq[round(gene.freq$freq, 2) >= 0.02 &
round(gene.freq$freq, 2) < 0.1, ],
aes(x = gene, y = freq),
position = "stack", stat = "identity", color = "grey50", fill = NA
) +
scale_discrete_manual("fill",
values = col[names(table(plot.data.sub$snv_type)
[table(plot.data.sub$snv_type) > 0])],
name = ""
) +
scale_y_continuous(
breaks = seq(0, 0.1, 0.02), labels = round(seq(0, 0.1, 0.02), 2),
limits = c(0, 0.11), expand = c(0, 0)
) +
coord_cartesian(clip = "off") +
theme_classic() +
guides(fill = "none") +
theme(
text = element_text(size = 24),
axis.title.y = element_blank(),
axis.text.x = element_text(
angle = 45, hjust = 1,
color = gene.col[-seq_len(nrow(gene.freq[round(gene.freq$freq, 2) >= 0.1, ]))]
),
axis.title.x = element_blank(),
plot.margin = margin(0, 0, 0, 0)
)
),
xmin = nrow(gene.freq[round(gene.freq$freq, 2) >= 0.1, ]) - 0.5,
xmax = length(unique(plot.data[plot.data$gene %in% target.genes, ]$gene)) + 0.5,
ymin = round(max(gene.freq$freq[round(gene.freq$freq, 2) < 0.1]) * 3.4, 2),
ymax = max(gene.freq$freq) + 0.15
)
# extract data needed for stacked bar plotting of RNA-based results
# isolating unique instances and calculate number of alterations per gene
plot.data <- variant.df[
variant.df$snv_type == "Fusion",
c("de_id", "snv_type", "gene", "variant")
]
plot.data$gene[
grep("Skip", plot.data$variant)
] <- grep("Skip", plot.data$variant, value = TRUE)
plot.data <- ddply(
unique(plot.data[, c("de_id", "snv_type", "gene")]),
.(gene, snv_type), summarize,
var_count = length(de_id)
)
plot.data$snv_type[grepl("Skip", plot.data$gene)] <- "Exon skipping variant"
plot.data$gene <- sapply(strsplit(plot.data$gene, " "), function(x) {
x[1]
})
# convert alteration counts to frequencies using appropriate denominators
total.patients <- sum(
!(patient.df$RNA_Rearrangements %in% c("Fail", "Not Performed"))
)
plot.data$var_freq <- plot.data$var_count / total.patients
# calculate combined frequencies of alterations per gene and sort
# from highest to lowest
gene.freq <- ddply(
plot.data, .(gene), summarize,
freq = sum(var_freq),
perc = paste(round(sum(var_freq), 4) * 100, "%", sep = "")
)
gene.freq <- gene.freq[order(gene.freq$freq, decreasing = TRUE), ]
# make sure levels of variables are in correct order
plot.data$gene <- factor(plot.data$gene, levels = gene.freq$gene)
# create color vector to color genes that have tier 1 alterations or
# alterations matched to a clinical trial
gene.col <- ifelse(
levels(plot.data$gene) %in%
therapy.df$marker_component_marker[grep("Guideline", therapy.df$amp_cap_asco_tier)],
"red",
ifelse(
levels(plot.data$gene) %in%
therapy.df$marker_component_marker[grep("Clinical", therapy.df$amp_cap_asco_tier)],
"orange", "black"
)
)
names(gene.col) <- levels(plot.data$gene)
# tag genes with alterations that have different frequencies between histology
gene.freq$fusion_diff[
gene.freq$gene %in%
gsub(
" Exon 14 Skipping", "",
gene.results$Variable[
(grep("^Fusions", gene.results$Variable) + 1):nrow(gene.results)
][
as.numeric(gene.results$P[
(grep("^Fusions", gene.results$Variable) + 1):nrow(gene.results)
]) < 0.05
]
)
] <- "#CAB2D6"
gene.freq$fusion_diff[gene.freq$gene == "MET"] <- "#ABEBC6"
# create stacked bar plot of genes with fusions/rearrangements detected in >= 0.1%
# of patients
col <- c("#ABEBC6", "#CAB2D6")
names(col) <- c("Exon skipping variant", "Fusion")
target.genes <- gene.freq$gene[gene.freq$freq >= 0.001]
plot.data.sub <- plot.data[plot.data$gene %in% target.genes, ]
g3 <- ggplot(plot.data.sub, aes(x = gene, y = var_freq, fill = snv_type)) +
geom_bar(position = "stack", stat = "identity") +
geom_text(
inherit.aes = FALSE,
data = gene.freq[gene.freq$gene %in% target.genes, ],
aes(x = gene, y = freq + 0.001, label = perc),
size = 7, angle = 90, hjust = 0
) +
geom_point(
inherit.aes = FALSE,
data = gene.freq[gene.freq$gene %in% target.genes, ],
aes(x = gene, y = freq + 0.001 + 0.02),
color = gene.freq$fusion_diff[gene.freq$gene %in% target.genes],
size = 5
) +
geom_bar(
inherit.aes = FALSE,
data = gene.freq[gene.freq$gene %in% target.genes, ],
aes(x = gene, y = freq),
position = "stack", stat = "identity", color = "grey50", fill = NA
) +
scale_discrete_manual("fill",
values = col[names(table(plot.data.sub$snv_type)
[table(plot.data.sub$snv_type) > 0])],
name = ""
) +
scale_y_continuous(limits = c(0, max(plot.data$var_freq) + 0.03), expand = c(0, 0)) +
coord_cartesian(clip = "off") +
theme_classic() +
theme(
text = element_text(size = 24),
axis.text.x = element_text(angle = 45, hjust = 1, color = gene.col[target.genes]),
axis.title.x = element_markdown(size = 20),
legend.title = element_blank(),
legend.text = element_text(size = 24),
legend.position = "top",
plot.margin = margin(20, 5, 5, 5)
) +
labs(
y = paste(
"Alteration prevalence\n",
"(number of fusions/variants ÷\ntotal patients passing assay)"
),
x = paste("Fusions or skipping variants detected in 0.1% of patients by RNA-seq<br>",
"<b style='color:#E74C3C'>red</b> = Guideline-indicated alteration(s) detected<br>",
"<b style='color:#F39C12'>orange</b> = Alteration(s) detected with clinical trial or therapy in other tumor type",
sep = ""
)
)
# put all plots together
g <- ggarrange(
NULL, g1, NULL, g2, NULL, g3,
nrow = 6, ncol = 1, heights = c(0.03, 0.5, 0, 1, 0, 0.6),
labels = c("", "A", "", "B", "", "C"), font.label = list(size = 30), label.y = 1.08
)
ggsave(
"Output/Plots/Genomic_alteration_overview.tiff", g,
device = "tiff", bg = "white", width = 17, height = 20
)
g
```
## Association of TMB and PD-L1 IHC with known driver mutations
### Distribution of TMB and PD-L1 in tumors with non-KRAS driver mutations
```{r tmb-pdl1-dist-non-KRAS}
#| label: fig-tmb-pdl1-dist-non-KRAS
#| fig-cap: "TMB and PD-L1 22C3 IHC TPS in non-KRAS driver mutated NSCLC"
#| fig-width: 37
#| fig-height: 22
# construct data needed for plotting and testing, isolating unique instances
target.genes <- c("ALK", "EGFR", "MET", "BRAF", "ROS1", "ERBB2")
plot.data <- patient.df[, c("de_id", "TMB", "PD_L1_IHC_22C3_interp", "histology_group")]
for (gene in target.genes) {
plot.data <- cbind(
plot.data,
ifelse(
patient.df$de_id %in% therapy.df$de_id[
therapy.df$marker_component_marker == gene &
grepl("Guideline", therapy.df$amp_cap_asco_tier) &
therapy.df$marker_component_marker_type %in% c("SNV", "CNV", "Fusion")
],
"Driver mutation",
ifelse(
patient.df$de_id %in% variant.df$de_id[
variant.df$gene == gene &
!(variant.df$de_id %in% therapy.df$de_id[
therapy.df$marker_component_marker == gene &
grepl("Guideline", therapy.df$amp_cap_asco_tier) &
therapy.df$marker_component_marker_type %in% c("SNV", "CNV", "Fusion")
])
],
paste("Non-guideline indicated ", gene, " alterations", sep = ""),
ifelse(
!(patient.df$de_id %in% variant.df$de_id[
variant.df$gene %in% therapy.df$marker_component_marker[
grepl("Guideline", therapy.df$amp_cap_asco_tier) &
therapy.df$marker_component_marker_type %in% c("SNV", "CNV", "Fusion")
]
]),
"No driver gene alterations detected", NA
)
)
)
)
colnames(plot.data)[ncol(plot.data)] <- gene
}
# format data
plot.data <- reshape2::melt(plot.data, id = c("de_id", "PD_L1_IHC_22C3_interp", "TMB", "histology_group"))
plot.data <- plot.data[
!is.na(plot.data$TMB) &
!is.na(plot.data$PD_L1_IHC_22C3_interp) &
!is.na(plot.data$value),
]
colnames(plot.data)[ncol(plot.data)] <- "status"
colnames(plot.data)[colnames(plot.data) == "TMB"] <- "value"
# add exact driver mutation details
extra.plot.data <- data.frame()
for (j in seq_len(nrow(plot.data))) {
if (plot.data$status[j] == "Driver mutation") {
variant <- unique(therapy.df$marker_component[
therapy.df$de_id == plot.data$de_id[j] &
therapy.df$marker_component_marker == plot.data$variable[j] &
grepl("Guideline", therapy.df$amp_cap_asco_tier) &
therapy.df$marker_component_marker_type %in% c("SNV", "CNV", "Fusion")
])
if (length(variant) > 1) {
plot.data$status[j] <- variant[1]
extra.plot.data <- rbind(
extra.plot.data, data.frame(plot.data[j, -ncol(plot.data)], status = variant[-1])
)
} else {
plot.data$status[j] <- variant
}
}
}
plot.data <- rbind(plot.data, extra.plot.data)
# group less prevalent mutations together if they are found in <0.5% of patients
plot.data$status_grouped <- plot.data$status
threshold <- 0.005 * nrow(patient.df)
for (gene in target.genes) {
if (
(length(plot.data$status_grouped[plot.data$variable == gene &
grepl("Fusion", plot.data$status_grouped)]) ==
length(plot.data$status_grouped[plot.data$variable == gene &
!grepl("^No", plot.data$status_grouped)])) &
length(table(plot.data$status_grouped[plot.data$variable == gene &
!grepl("^No", plot.data$status_grouped)])[
table(plot.data$status_grouped[plot.data$variable == gene &
!grepl("^No", plot.data$status_grouped)]) < threshold
]) ==
length(table(plot.data$status_grouped[plot.data$variable == gene &
!grepl("^No", plot.data$status_grouped)]))
) {
plot.data$status_grouped[
plot.data$variable == gene & !grepl("^No", plot.data$status_grouped) &
plot.data$status_grouped %in%
names(table(plot.data$status_grouped)[table(plot.data$status_grouped) < threshold])
] <- paste(gene, " Fusions", sep = "")
} else if (
length(table(plot.data$status_grouped[plot.data$variable == gene &
!grepl("^No", plot.data$status_grouped)])[
table(plot.data$status_grouped[plot.data$variable == gene &
!grepl("^No", plot.data$status_grouped)]) < threshold
]) ==
length(table(plot.data$status_grouped[plot.data$variable == gene &
!grepl("^No", plot.data$status_grouped)]))
) {
plot.data$status_grouped[
plot.data$variable == gene & !grepl("^No", plot.data$status_grouped) &
plot.data$status_grouped %in%
names(table(plot.data$status_grouped)[table(plot.data$status_grouped) < threshold])
] <- paste(gene, " guideline indicated alterations", sep = "")
} else {
plot.data$status_grouped[
plot.data$variable == gene & !grepl("^No", plot.data$status_grouped) &
plot.data$status_grouped %in%
names(table(plot.data$status_grouped)[table(plot.data$status_grouped) < threshold])
] <- paste("Other ", gene, " guideline indicated alterations", sep = "")
}
}
# group all EGFR driver mutations together
plot.data$status_grouped[
plot.data$variable == "EGFR" &
!(plot.data$status_grouped %in%
c("Non-guideline indicated EGFR alterations", "No driver gene alterations detected"))
] <- "EGFR guideline indicated alterations"
# re-label other ALK guideline indicated as other ALK fusions
plot.data$status_grouped[grep("Other ALK", plot.data$status_grouped)] <- "Other ALK fusions"
# re-label other MET guideline indicated as MET amplification
plot.data$status_grouped[grep("Other MET", plot.data$status_grouped)] <- "MET amplification"
# calculate PD-L1 group totals and percents
totals <- data.frame()
for (gene in target.genes) {
for (
status_grouped in levels(factor(plot.data[plot.data$variable == gene, ]$status_grouped))
) {
totals <- rbind(totals, ddply(
plot.data[plot.data$variable == gene & plot.data$status_grouped == status_grouped, ],
.(variable, status_grouped, PD_L1_IHC_22C3_interp), summarize,
N = length(de_id),
value = length(de_id) /
nrow(plot.data[plot.data$variable == gene & plot.data$status_grouped == status_grouped, ])
))
}
}
# calculate/collate summary stats and write out results
summ.stats <- ddply(
plot.data, .(variable, status_grouped), summarize,
`Alterations in group` = paste(unique(status), collapse = ", "),
`Median TMB` = median(value)
)
for (gene in target.genes) {
summ.stats$`Alterations in group`[
summ.stats$variable == gene & grepl("Non-guideline", summ.stats$status_grouped)
] <- paste(
unique(variant.df$variant_go_name[
variant.df$gene == gene &
!(variant.df$variant_go_name %in%
therapy.df$marker_component[grep("Guideline", therapy.df$amp_cap_asco_tier)])
]),
collapse = ", "
)
}
summ.stats$`Alterations in group`[grep("No ", summ.stats$`Alterations in group`)] <- "-"
colnames(summ.stats)[1:2] <- c("Gene", "Alteration group")
summ.stats$`Alteration group` <- gsub(
"Non-guideline indicated", "Other", summ.stats$`Alteration group`
)
wb1 <- createWorkbook()
addWorksheet(wb1, "Alteration groups")
writeData(
wb1, "Alteration groups", summ.stats[, -ncol(summ.stats)],
keepNA = TRUE, colNames = TRUE
)
setColWidths(
wb1, "Alteration groups",
cols = seq_len(ncol(summ.stats) - 1),
widths = c(6, 30, 30)
)
addStyle(
wb1, "Alteration groups",
cols = seq_len(ncol(summ.stats) - 1), rows = 1,
gridExpand = TRUE, style = bold, stack = TRUE
)
addStyle(
wb1, "Alteration groups",
cols = seq_len(ncol(summ.stats) - 1),
rows = c(1, 2, (nrow(summ.stats) + 2)), gridExpand = TRUE,
style = horizontal_border_med, stack = TRUE
)
convertNum(summ.stats, wb1, "Alteration groups", TRUE)
saveWorkbook(
wb1, "Output/Tables/Alteration_groups_non-KRAS_drivers.xlsx",
overwrite = TRUE
)
# test for differences in frequencies of PD-L1 levels and TMB
# and write out the results
pvals <- data.frame()
for (gene in target.genes) {
for (j in unique(plot.data$status_grouped[plot.data$variable == gene])) {
for (k in unique(plot.data$status_grouped[plot.data$variable == gene])) {
if (j != k) {
pdl1.p <- logistf::logistftest(logistf::logistf(
ifelse(
status_grouped == names(table(status_grouped))[1], 1, 0
) ~ PD_L1_IHC_22C3_interp + histology_group,
data = plot.data[plot.data$variable == gene & plot.data$status_grouped %in% c(j, k), ]
))$prob
tmb.p <- summary(lm(
log(value + 1) ~ status_grouped + histology_group,
data = plot.data[plot.data$variable == gene & plot.data$status_grouped %in% c(j, k), ]
))$coefficients[2, 4]
pvals <- rbind(
pvals,
data.frame(
Gene = gene, `Alteration group 1` = j, `Alteration group 2` = k,
`PD-L1 Adjusted P-value` = pdl1.p, `TMB Adjusted P-value` = tmb.p,
check.names = FALSE
)
)
}
}
}
pvals <- pvals[!duplicated(pvals$`TMB Adjusted P-value`), ]
pvals$`PD-L1 Adjusted P-value`[pvals$variable == gene] <- p.adjust(
pvals$`PD-L1 Adjusted P-value`[pvals$variable == gene],
method = "bonferroni"
)
pvals$`TMB Adjusted P-value`[pvals$variable == gene] <- p.adjust(
pvals$`TMB Adjusted P-value`[pvals$variable == gene],
method = "bonferroni"
)
}
pvals$`Alteration group 1` <- gsub(
"Non-guideline indicated", "Other", pvals$`Alteration group 1`
)
pvals$`Alteration group 2` <- gsub(
"Non-guideline indicated", "Other", pvals$`Alteration group 2`
)
wb2 <- createWorkbook()
addWorksheet(wb2, "TMB and PD-L1")
writeData(wb2, "TMB and PD-L1", pvals, keepNA = TRUE, colNames = TRUE)
setColWidths(
wb2, "TMB and PD-L1",
cols = seq_len(ncol(pvals)),
widths = c(6, 30, 30, 21, 21)
)
addStyle(
wb2, "TMB and PD-L1",
cols = seq_len(ncol(pvals)), rows = 1,
gridExpand = TRUE, style = bold, stack = TRUE
)
addStyle(
wb2, "TMB and PD-L1",
cols = seq_len(ncol(pvals)),
rows = c(1, 2, (nrow(pvals) + 2)), gridExpand = TRUE,
style = horizontal_border_med, stack = TRUE
)
convertNum(pvals, wb2, "TMB and PD-L1", TRUE)
saveWorkbook(
wb2, "Output/Tables/Testing_diff_in_TMB_PD_L1_non-KRAS_drivers.xlsx",
overwrite = TRUE
)
pvals$PD_L1_labels <- ifelse(
pvals$`PD-L1 Adjusted P-value` < 0.0001, "****",
ifelse(
pvals$`PD-L1 Adjusted P-value` < 0.001, "***",
ifelse(
pvals$`PD-L1 Adjusted P-value` < 0.01, "**",
ifelse(pvals$`PD-L1 Adjusted P-value` < 0.05, "*", "ns")
)
)
)
pvals$TMB_labels <- ifelse(
pvals$`TMB Adjusted P-value` < 0.0001, "****",
ifelse(
pvals$`TMB Adjusted P-value` < 0.001, "***",
ifelse(
pvals$`TMB Adjusted P-value` < 0.01, "**",
ifelse(pvals$`TMB Adjusted P-value` < 0.05, "*", "ns")
)
)
)
# combine data for plotting
plot.data <- data.frame(
variable = c(plot.data$variable, totals$variable),
status_grouped = c(plot.data$status_grouped, totals$status_grouped),
PD_L1_IHC_22C3_interp = c(plot.data$PD_L1_IHC_22C3_interp, totals$PD_L1_IHC_22C3_interp),
N = c(rep(NA, nrow(plot.data)), totals$N),
value = c(plot.data$value, totals$value),
plot = c(
rep("Log transformed TMB (Mut/Mb)", nrow(plot.data)),
rep("PD-L1 TPS level", nrow(totals))
)
)
plot.data$status_grouped <- gsub(
"Non-guideline indicated", "Other", plot.data$status_grouped
)
plot.data$status_grouped <- factor(
plot.data$status_grouped,
levels = c(
sort(unique(plot.data$status_grouped)[
!(unique(plot.data$status_grouped) %in%
c(
unique(grep("Other.*guideline", plot.data$status_grouped, value = TRUE)),
grep(
"Other.*alterations",
unique(grep(
"Other.*guideline", plot.data$status_grouped,
value = TRUE, invert = TRUE
)),
value = TRUE
),
"No driver gene alterations detected"
))
]),
unique(grep("Other.*guideline", plot.data$status_grouped, value = TRUE)),
grep(
"Other.*alterations",
unique(grep(
"Other.*guideline", plot.data$status_grouped,
value = TRUE, invert = TRUE
)),
value = TRUE
),
"No driver gene alterations detected"
)
)
# plot distributions of TMB and PD-L1 for driver genes
plot.list <- list()
for (gene in target.genes) {
if (nrow(pvals[pvals$Gene == gene & pvals$TMB_labels != "ns", ]) > 0) {
g1 <- ggplot(
plot.data[
plot.data$variable == gene & plot.data$plot == "Log transformed TMB (Mut/Mb)",
],
aes(y = log(value + 1), x = status_grouped)
) +
geom_violin(alpha = 0.5, scale = "width", fill = "grey50", color = "black") +
geom_boxplot(width = 0.3, fill = "white", color = "black", outlier.size = 0) +
geom_point(color = "black", size = 3.5) +
geom_hline(yintercept = log(10 + 1), color = "black", linetype = "dashed") +
geom_text(
data = summ.stats[summ.stats$Gene == gene, ],
aes(x = `Alteration group`, label = round(`Median TMB`, 1)), y = -0.5, size = 8
) +
geom_signif(
data = data.frame(
pvals[pvals$Gene == gene & pvals$TMB_labels != "ns", ],
y = sapply(
seq(1, nrow(pvals[pvals$Gene == gene & pvals$TMB_labels != "ns", ]), 1) / 4,
function(x) {
x + max(log(
plot.data$value[plot.data$variable == gene &
plot.data$plot == "Log transformed TMB (Mut/Mb)"] + 1
))
}
),
check.names = FALSE
),
aes(
xmin = `Alteration group 1`, xmax = `Alteration group 2`, y_position = y,
annotations = TMB_labels
),
manual = TRUE, tip_length = 0, textsize = 10, vjust = 0.8
) +
scale_y_continuous(
breaks = c(0, 1, 2, 3, 4, 5),
labels = c(
0,
paste("1 (", round(exp(1) - 1, 1), ")", sep = ""),
paste("2 (", round(exp(2) - 1, 1), ")", sep = ""),
paste("3 (", round(exp(3) - 1, 1), ")", sep = ""),
paste("4 (", round(exp(4) - 1, 1), ")", sep = ""),
paste("5 (", round(exp(5) - 1, 1), ")", sep = "")
)
) +
coord_cartesian(ylim = c(-0.5, 5), clip = "off") +
theme_classic() +
labs(y = "Log transformed TMB\n(Mut/Mb)", x = "") +
theme(
text = element_text(size = 22),
axis.text.x = element_blank(),
axis.ticks.x = element_blank(),
axis.title.x = element_blank(),
axis.title.y = element_text(vjust = 1)
)
} else {
g1 <- ggplot(
plot.data[
plot.data$variable == gene & plot.data$plot == "Log transformed TMB (Mut/Mb)",
],
aes(y = log(value + 1), x = status_grouped)
) +
geom_violin(alpha = 0.5, scale = "width", fill = "grey50", color = "black") +
geom_boxplot(width = 0.3, fill = "white", color = "black", outlier.size = 0) +
geom_point(color = "black", size = 3.5) +
geom_hline(yintercept = log(10 + 1), color = "black", linetype = "dashed") +
geom_text(
data = summ.stats[summ.stats$Gene == gene, ],
aes(x = `Alteration group`, label = round(`Median TMB`, 1)), y = -0.5, size = 8
) +
scale_y_continuous(
breaks = c(0, 1, 2, 3, 4, 5),
labels = c(
0,
paste("1 (", round(exp(1) - 1, 1), ")", sep = ""),
paste("2 (", round(exp(2) - 1, 1), ")", sep = ""),
paste("3 (", round(exp(3) - 1, 1), ")", sep = ""),
paste("4 (", round(exp(4) - 1, 1), ")", sep = ""),
paste("5 (", round(exp(5) - 1, 1), ")", sep = "")
)
) +
coord_cartesian(ylim = c(-0.5, 5), clip = "off") +
theme_classic() +
labs(y = "Log transformed TMB\n(Mut/Mb)", x = "") +
theme(
text = element_text(size = 22),
axis.text.x = element_blank(),
axis.ticks.x = element_blank(),
axis.title.x = element_blank(),
axis.title.y = element_text(vjust = 1)
)
}
if (nrow(pvals[pvals$Gene == gene & pvals$PD_L1_labels != "ns", ]) > 0) {
g2 <- ggplot(
plot.data[plot.data$variable == gene & plot.data$plot == "PD-L1 TPS level", ],
aes(y = value, x = status_grouped, fill = PD_L1_IHC_22C3_interp)
) +
geom_bar(stat = "identity", color = "black", alpha = 0.5) +
geom_label(
aes(
label = paste(N, " (", round(value, 2) * 100, "%)", sep = ""),
color = PD_L1_IHC_22C3_interp
),
position = position_stack(vjust = 0.5), size = 8, fill = "white"
) +
geom_signif(
inherit.aes = FALSE,
data = data.frame(
pvals[pvals$Gene == gene & pvals$PD_L1_labels != "ns", ],
y = sapply(
seq(1, nrow(pvals[pvals$Gene == gene & pvals$PD_L1_labels != "ns", ]), 1) / 20,
function(x) {
x + 1.05
}
),
check.names = FALSE
),
aes(
xmin = `Alteration group 1`, xmax = `Alteration group 2`, y_position = y,
annotations = PD_L1_labels
),
manual = TRUE, tip_length = 0, textsize = 10, vjust = 0.8
) +
scale_fill_manual(
labels = c(
levels(plot.data$PD_L1_IHC_22C3_interp)[1:2],
expression("High (" >= "50%)")
),
values = c("blue", "purple", "deeppink")
) +
scale_color_manual(values = rep("black", 3)) +
scale_y_continuous(labels = scales::percent, breaks = seq(0, 1, 0.25)) +
scale_x_discrete(labels = scales::wrap_format(10)) +
coord_cartesian(ylim = c(0, 1), clip = "off") +
theme_classic() +
labs(
y = paste(
"Tumors with PD-L1 level<br>",
"<b style='color:#0000FF'>blue</b> = Negative<br>",
"<b style='color:#A020F0'>purple</b> = Low<br>",
"<b style='color:#FF1493'>pink</b> = High",
sep = ""
),
x = ""
) +
guides(
fill = "none",
color = "none"
) +
theme(
text = element_text(size = 20),
axis.title.y = element_markdown(size = 22),
axis.text.x = element_text(size = 22)
)
} else {
g2 <- ggplot(
plot.data[plot.data$variable == gene & plot.data$plot == "PD-L1 TPS level", ],
aes(y = value, x = status_grouped, fill = PD_L1_IHC_22C3_interp)
) +
geom_bar(stat = "identity", color = "black", alpha = 0.5) +
geom_label(
aes(
label = paste(N, " (", round(value, 2) * 100, "%)", sep = ""),
color = PD_L1_IHC_22C3_interp
),
position = position_stack(vjust = 0.5), size = 8, fill = "white"
) +
scale_fill_manual(
labels = c(
levels(plot.data$PD_L1_IHC_22C3_interp)[1:2],
expression("High (" >= "50%)")
),
values = c("blue", "purple", "deeppink")
) +
scale_color_manual(values = rep("black", 3)) +
scale_y_continuous(labels = scales::percent, breaks = seq(0, 1, 0.25)) +
scale_x_discrete(labels = scales::wrap_format(10)) +
coord_cartesian(ylim = c(0, 1), clip = "off") +
theme_classic() +
labs(
y = paste(
"Tumors with PD-L1 level<br>",
"<b style='color:#0000FF'>blue</b> = Negative<br>",
"<b style='color:#A020F0'>purple</b> = Low<br>",
"<b style='color:#FF1493'>pink</b> = High",
sep = ""
),
x = ""
) +
guides(
fill = "none",
color = "none"
) +
theme(
text = element_text(size = 22),
axis.title.y = element_markdown(size = 22),
axis.text.x = element_text(size = 22)
)
}
g <- ggarrange(
NULL, g1, NULL, g2,
nrow = 4, ncol = 1,
heights = c(
(nrow(pvals[pvals$Gene == gene & pvals$TMB_labels != "ns", ]) / 2) / 10, 1,
(nrow(pvals[pvals$Gene == gene & pvals$PD_L1_labels != "ns", ]) / 2) / 10, 1.5
),
align = "v"
)
plot.list[[length(plot.list) + 1]] <- g
}
# combine and save plots
g <- ggarrange(
plotlist = plot.list, nrow = 2, ncol = 3, heights = rep(1, 2), widths = rep(1, 3),
labels = c("A", "B", "C", "D", "E", "F"), font.label = list(size = 35)
)
ggsave(
"Output/Plots/TMB_PDL1_dist_non-KRAS_drivers.tiff", g,
device = "tiff", bg = "white",
width = 37, height = 22, limitsize = FALSE
)
g
```
### Distribution of TMB and PD-L1 in tumors with key KRAS driver mutations
```{r tmb-pdl1-dist-KRAS-muts}
#| label: fig-tmb-pdl1-dist-KRAS-muts
#| fig-cap: "TMB and PD-L1 22C3 IHC TPS in KRAS mutated NSCLC"
#| fig-width: 20
#| fig-height: 20
# construct data needed for plotting and testing, isolating unique instances
gene <- "KRAS"
plot.data <- patient.df[, c("de_id", "TMB", "PD_L1_IHC_22C3_interp", "histology_group")]
plot.data <- cbind(
plot.data,
ifelse(
patient.df$de_id %in% therapy.df$de_id[
therapy.df$marker_component_marker == gene &
grepl("Guideline", therapy.df$amp_cap_asco_tier) &
therapy.df$marker_component_marker_type %in% c("SNV", "CNV", "Fusion")
],
"Driver mutation",
ifelse(
patient.df$de_id %in% variant.df$de_id[
variant.df$gene == gene &
!(variant.df$de_id %in% therapy.df$de_id[
therapy.df$marker_component_marker == gene &
grepl("Guideline", therapy.df$amp_cap_asco_tier) &
therapy.df$marker_component_marker_type %in% c("SNV", "CNV", "Fusion")
])
],
paste("Non-guideline indicated ", gene, " alterations", sep = ""),
ifelse(
!(patient.df$de_id %in% variant.df$de_id[
variant.df$gene %in% therapy.df$marker_component_marker[
grepl("Guideline", therapy.df$amp_cap_asco_tier) &
therapy.df$marker_component_marker_type %in% c("SNV", "CNV", "Fusion")
]
]),
"No driver gene alterations detected", NA
)
)
)
)
colnames(plot.data)[ncol(plot.data)] <- gene
# format data
plot.data <- reshape2::melt(plot.data, id = c("de_id", "PD_L1_IHC_22C3_interp", "TMB", "histology_group"))
plot.data <- plot.data[
!is.na(plot.data$TMB) &
!is.na(plot.data$PD_L1_IHC_22C3_interp) &
!is.na(plot.data$value),
]
colnames(plot.data)[ncol(plot.data)] <- "status"
colnames(plot.data)[colnames(plot.data) == "TMB"] <- "value"
# add exact driver mutation details
extra.plot.data <- data.frame()
for (j in seq_len(nrow(plot.data))) {
if (plot.data$status[j] == "Driver mutation") {
variant <- unique(therapy.df$marker_component[
therapy.df$de_id == plot.data$de_id[j] &
therapy.df$marker_component_marker == plot.data$variable[j] &
grepl("Guideline", therapy.df$amp_cap_asco_tier) &
therapy.df$marker_component_marker_type %in% c("SNV", "CNV", "Fusion")
])
if (length(variant) > 1) {
plot.data$status[j] <- variant[1]
extra.plot.data <- rbind(
extra.plot.data, data.frame(plot.data[j, -ncol(plot.data)], status = variant[-1])
)
} else {
plot.data$status[j] <- variant
}
}
}
plot.data <- rbind(plot.data, extra.plot.data)
# group less prevalent mutations together if they are found in <0.5% of patients
plot.data$status_grouped <- plot.data$status
threshold <- 0.005 * nrow(patient.df)
if (
(length(plot.data$status_grouped[plot.data$variable == gene &
grepl("Fusion", plot.data$status_grouped)]) ==
length(plot.data$status_grouped[plot.data$variable == gene &
!grepl("^No", plot.data$status_grouped)])) &
length(table(plot.data$status_grouped[plot.data$variable == gene &
!grepl("^No", plot.data$status_grouped)])[
table(plot.data$status_grouped[plot.data$variable == gene &
!grepl("^No", plot.data$status_grouped)]) < threshold
]) ==
length(table(plot.data$status_grouped[plot.data$variable == gene &
!grepl("^No", plot.data$status_grouped)]))
) {
plot.data$status_grouped[
plot.data$variable == gene & !grepl("^No", plot.data$status_grouped) &
plot.data$status_grouped %in%
names(table(plot.data$status_grouped)[table(plot.data$status_grouped) < threshold])
] <- paste(gene, " Fusions", sep = "")
} else if (
length(table(plot.data$status_grouped[plot.data$variable == gene &
!grepl("^No", plot.data$status_grouped)])[
table(plot.data$status_grouped[plot.data$variable == gene &
!grepl("^No", plot.data$status_grouped)]) < threshold
]) ==
length(table(plot.data$status_grouped[plot.data$variable == gene &
!grepl("^No", plot.data$status_grouped)]))
) {
plot.data$status_grouped[
plot.data$variable == gene & !grepl("^No", plot.data$status_grouped) &
plot.data$status_grouped %in%
names(table(plot.data$status_grouped)[table(plot.data$status_grouped) < threshold])
] <- paste(gene, " guideline indicated alterations", sep = "")
} else {
plot.data$status_grouped[
plot.data$variable == gene & !grepl("^No", plot.data$status_grouped) &
plot.data$status_grouped %in%
names(table(plot.data$status_grouped)[table(plot.data$status_grouped) < threshold])
] <- paste("Other ", gene, " guideline indicated alterations", sep = "")
}
# calculate PD-L1 group totals and percents
totals <- data.frame()
for (
status_grouped in levels(factor(plot.data[plot.data$variable == gene, ]$status_grouped))
) {
totals <- rbind(totals, ddply(
plot.data[plot.data$variable == gene & plot.data$status_grouped == status_grouped, ],
.(variable, status_grouped, PD_L1_IHC_22C3_interp), summarize,
N = length(de_id),
value = length(de_id) /
nrow(plot.data[plot.data$variable == gene & plot.data$status_grouped == status_grouped, ])
))
}
# calculate/collate summary stats
summ.stats <- ddply(
plot.data, .(variable, status_grouped), summarize,
`Alterations in group` = paste(unique(status), collapse = ", "),
`Median TMB` = median(value)
)
summ.stats$`Alterations in group`[
summ.stats$variable == gene & grepl("Non-guideline", summ.stats$status_grouped)
] <- paste(
unique(variant.df$variant_go_name[
variant.df$gene == gene &
!(variant.df$variant_go_name %in%
therapy.df$marker_component[grep("Guideline", therapy.df$amp_cap_asco_tier)])
]),
collapse = ", "
)
summ.stats$`Alterations in group`[grep("No ", summ.stats$`Alterations in group`)] <- "-"
colnames(summ.stats)[1:2] <- c("Gene", "Alteration group")
summ.stats$`Alteration group` <- gsub(
"Non-guideline indicated", "Other", summ.stats$`Alteration group`
)
# test for differences in frequencies of PD-L1 levels and TMB
# and write out the results
pvals <- data.frame()
for (j in unique(plot.data$status_grouped[plot.data$variable == gene])) {
for (k in unique(plot.data$status_grouped[plot.data$variable == gene])) {
if (j != k) {
pdl1.p <- logistf::logistftest(logistf::logistf(
ifelse(
status_grouped == names(table(status_grouped))[1], 1, 0
) ~ PD_L1_IHC_22C3_interp + histology_group,
data = plot.data[plot.data$variable == gene & plot.data$status_grouped %in% c(j, k), ]
))$prob
tmb.p <- summary(lm(
log(value + 1) ~ status_grouped + histology_group,
data = plot.data[plot.data$variable == gene & plot.data$status_grouped %in% c(j, k), ]
))$coefficients[2, 4]
pvals <- rbind(
pvals,
data.frame(
Gene = gene, `Alteration group 1` = j, `Alteration group 2` = k,
`PD-L1 Adjusted P-value` = pdl1.p, `TMB Adjusted P-value` = tmb.p,
check.names = FALSE
)
)
}
}
}
pvals <- pvals[!duplicated(pvals$`TMB Adjusted P-value`), ]
pvals$`PD-L1 Adjusted P-value`[pvals$variable == gene] <- p.adjust(
pvals$`PD-L1 Adjusted P-value`[pvals$variable == gene],
method = "bonferroni"
)
pvals$`TMB Adjusted P-value`[pvals$variable == gene] <- p.adjust(
pvals$`TMB Adjusted P-value`[pvals$variable == gene],
method = "bonferroni"
)
pvals$`Alteration group 1` <- gsub(
"Non-guideline indicated", "Other", pvals$`Alteration group 1`
)
pvals$`Alteration group 2` <- gsub(
"Non-guideline indicated", "Other", pvals$`Alteration group 2`
)
wb <- createWorkbook()
addWorksheet(wb, "KRAS mutations")
writeData(wb, "KRAS mutations", pvals, keepNA = TRUE, colNames = TRUE)
setColWidths(
wb, "KRAS mutations",
cols = seq_len(ncol(pvals)),
widths = c(6, 30, 30, 21, 21)
)
addStyle(
wb, "KRAS mutations",
cols = seq_len(ncol(pvals)), rows = 1,
gridExpand = TRUE, style = bold, stack = TRUE
)
addStyle(
wb, "KRAS mutations",
cols = seq_len(ncol(pvals)),
rows = c(1, 2, (nrow(pvals) + 2)), gridExpand = TRUE,
style = horizontal_border_med, stack = TRUE
)
convertNum(pvals, wb, "KRAS mutations", TRUE)
saveWorkbook(
wb, "Output/Tables/Testing_diff_in_TMB_PD_L1_KRAS_drivers.xlsx",
overwrite = TRUE
)
pvals$PD_L1_labels <- ifelse(
pvals$`PD-L1 Adjusted P-value` < 0.0001, "****",
ifelse(
pvals$`PD-L1 Adjusted P-value` < 0.001, "***",
ifelse(
pvals$`PD-L1 Adjusted P-value` < 0.01, "**",
ifelse(pvals$`PD-L1 Adjusted P-value` < 0.05, "*", "ns")
)
)
)
pvals$TMB_labels <- ifelse(
pvals$`TMB Adjusted P-value` < 0.0001, "****",
ifelse(
pvals$`TMB Adjusted P-value` < 0.001, "***",
ifelse(
pvals$`TMB Adjusted P-value` < 0.01, "**",
ifelse(pvals$`TMB Adjusted P-value` < 0.05, "*", "ns")
)
)
)
# combine data for plotting
plot.data <- data.frame(
variable = c(plot.data$variable, totals$variable),
status_grouped = c(plot.data$status_grouped, totals$status_grouped),
PD_L1_IHC_22C3_interp = c(plot.data$PD_L1_IHC_22C3_interp, totals$PD_L1_IHC_22C3_interp),
N = c(rep(NA, nrow(plot.data)), totals$N),
value = c(plot.data$value, totals$value),
plot = c(
rep("Log transformed TMB (Mut/Mb)", nrow(plot.data)),
rep("PD-L1 TPS level", nrow(totals))
)
)
plot.data$status_grouped <- gsub(
"Non-guideline indicated", "Other", plot.data$status_grouped
)
plot.data$status_grouped <- factor(
plot.data$status_grouped,
levels = c(
sort(unique(plot.data$status_grouped)[
!(unique(plot.data$status_grouped) %in%
c(
unique(grep("Other.*guideline", plot.data$status_grouped, value = TRUE)),
grep(
"Other.*alterations",
unique(grep(
"Other.*guideline", plot.data$status_grouped,
value = TRUE, invert = TRUE
)),
value = TRUE
),
"No driver gene alterations detected"
))
]),
unique(grep("Other.*guideline", plot.data$status_grouped, value = TRUE)),
grep(
"Other.*alterations",
unique(grep(
"Other.*guideline", plot.data$status_grouped,
value = TRUE, invert = TRUE
)),
value = TRUE
),
"No driver gene alterations detected"
)
)
# plot distributions of TMB and PD-L1 for prevalent KRAS mutations
if (nrow(pvals[pvals$Gene == gene & pvals$TMB_labels != "ns", ]) > 0) {
g1 <- ggplot(
plot.data[
plot.data$variable == gene & plot.data$plot == "Log transformed TMB (Mut/Mb)",
],
aes(y = log(value + 1), x = status_grouped)
) +
geom_violin(alpha = 0.5, scale = "width", fill = "grey50", color = "black") +
geom_boxplot(width = 0.3, fill = "white", color = "black", outlier.size = 0) +
geom_point(color = "black", size = 3.5) +
geom_hline(yintercept = log(10 + 1), color = "black", linetype = "dashed") +
geom_text(
data = summ.stats[summ.stats$Gene == gene, ],
aes(x = `Alteration group`, label = round(`Median TMB`, 1)), y = -0.5, size = 8
) +
geom_signif(
data = data.frame(
pvals[pvals$Gene == gene & pvals$TMB_labels != "ns", ],
y = sapply(
seq(1, nrow(pvals[pvals$Gene == gene & pvals$TMB_labels != "ns", ]), 1) / 4,
function(x) {
x + max(log(
plot.data$value[plot.data$variable == gene &
plot.data$plot == "Log transformed TMB (Mut/Mb)"] + 1
))
}
),
check.names = FALSE
),
aes(
xmin = `Alteration group 1`, xmax = `Alteration group 2`, y_position = y,
annotations = TMB_labels
),
manual = TRUE, tip_length = 0, textsize = 10, vjust = 0.8
) +
scale_y_continuous(
breaks = c(0, 1, 2, 3, 4, 5),
labels = c(
0,
paste("1 (", round(exp(1) - 1, 1), ")", sep = ""),
paste("2 (", round(exp(2) - 1, 1), ")", sep = ""),
paste("3 (", round(exp(3) - 1, 1), ")", sep = ""),
paste("4 (", round(exp(4) - 1, 1), ")", sep = ""),
paste("5 (", round(exp(5) - 1, 1), ")", sep = "")
)
) +
coord_cartesian(ylim = c(-0.5, 5), clip = "off") +
theme_classic() +
labs(y = "Log transformed TMB\n(Mut/Mb)", x = "") +
theme(
text = element_text(size = 22),
axis.text.x = element_blank(),
axis.ticks.x = element_blank(),
axis.title.x = element_blank(),
axis.title.y = element_text(vjust = 1)
)
} else {
g1 <- ggplot(
plot.data[
plot.data$variable == gene & plot.data$plot == "Log transformed TMB (Mut/Mb)",
],
aes(y = log(value + 1), x = status_grouped)
) +
geom_violin(alpha = 0.5, scale = "width", fill = "grey50", color = "black") +
geom_boxplot(width = 0.3, fill = "white", color = "black", outlier.size = 0) +
geom_point(color = "black", size = 3.5) +
geom_hline(yintercept = log(10 + 1), color = "black", linetype = "dashed") +
geom_text(
data = summ.stats[summ.stats$Gene == gene, ],
aes(x = `Alteration group`, label = round(`Median TMB`, 1)), y = -0.5, size = 8
) +
scale_y_continuous(
breaks = c(0, 1, 2, 3, 4, 5),
labels = c(
0,
paste("1 (", round(exp(1) - 1, 1), ")", sep = ""),
paste("2 (", round(exp(2) - 1, 1), ")", sep = ""),
paste("3 (", round(exp(3) - 1, 1), ")", sep = ""),
paste("4 (", round(exp(4) - 1, 1), ")", sep = ""),
paste("5 (", round(exp(5) - 1, 1), ")", sep = "")
)
) +
coord_cartesian(ylim = c(-0.5, 5), clip = "off") +
theme_classic() +
labs(y = "Log transformed TMB\n(Mut/Mb)", x = "") +
theme(
text = element_text(size = 22),
axis.text.x = element_blank(),
axis.ticks.x = element_blank(),
axis.title.x = element_blank(),
axis.title.y = element_text(vjust = 1)
)
}
if (nrow(pvals[pvals$Gene == gene & pvals$PD_L1_labels != "ns", ]) > 0) {
g2 <- ggplot(
plot.data[plot.data$variable == gene & plot.data$plot == "PD-L1 TPS level", ],
aes(y = value, x = status_grouped, fill = PD_L1_IHC_22C3_interp)
) +
geom_bar(stat = "identity", color = "black", alpha = 0.5) +
geom_label(
aes(
label = paste(N, " (", round(value, 2) * 100, "%)", sep = ""),
color = PD_L1_IHC_22C3_interp
),
position = position_stack(vjust = 0.5), size = 8, fill = "white"
) +
geom_signif(
inherit.aes = FALSE,
data = data.frame(
pvals[pvals$Gene == gene & pvals$PD_L1_labels != "ns", ],
y = sapply(
seq(1, nrow(pvals[pvals$Gene == gene & pvals$PD_L1_labels != "ns", ]), 1) / 20,
function(x) {
x + 1.05
}
),
check.names = FALSE
),
aes(
xmin = `Alteration group 1`, xmax = `Alteration group 2`, y_position = y,
annotations = PD_L1_labels
),
manual = TRUE, tip_length = 0, textsize = 10, vjust = 0.8
) +
scale_fill_manual(
labels = c(
levels(plot.data$PD_L1_IHC_22C3_interp)[1:2],
expression("High (" >= "50%)")
),
values = c("blue", "purple", "deeppink")
) +
scale_color_manual(values = rep("black", 3)) +
scale_y_continuous(labels = scales::percent, breaks = seq(0, 1, 0.25)) +
scale_x_discrete(labels = scales::wrap_format(5)) +
coord_cartesian(ylim = c(0, 1), clip = "off") +
theme_classic() +
labs(
y = paste(
"Tumors with PD-L1 level<br>",
"<b style='color:#0000FF'>blue</b> = Negative<br>",
"<b style='color:#A020F0'>purple</b> = Low<br>",
"<b style='color:#FF1493'>pink</b> = High",
sep = ""
),
x = ""
) +
guides(
fill = "none",
color = "none"
) +
theme(
text = element_text(size = 20),
axis.title.y = element_markdown(size = 22),
axis.text.x = element_text(size = 22)
)
} else {
g2 <- ggplot(
plot.data[plot.data$variable == gene & plot.data$plot == "PD-L1 TPS level", ],
aes(y = value, x = status_grouped, fill = PD_L1_IHC_22C3_interp)
) +
geom_bar(stat = "identity", color = "black", alpha = 0.5) +
geom_label(
aes(
label = paste(N, " (", round(value, 2) * 100, "%)", sep = ""),
color = PD_L1_IHC_22C3_interp
),
position = position_stack(vjust = 0.5), size = 8, fill = "white"
) +
scale_fill_manual(
labels = c(
levels(plot.data$PD_L1_IHC_22C3_interp)[1:2],
expression("High (" >= "50%)")
),
values = c("blue", "purple", "deeppink")
) +
scale_color_manual(values = rep("black", 3)) +
scale_y_continuous(labels = scales::percent, breaks = seq(0, 1, 0.25)) +
scale_x_discrete(labels = scales::wrap_format(5)) +
coord_cartesian(ylim = c(0, 1), clip = "off") +
theme_classic() +
labs(
y = paste(
"Tumors with PD-L1 level<br>",
"<b style='color:#0000FF'>blue</b> = Negative<br>",
"<b style='color:#A020F0'>purple</b> = Low<br>",
"<b style='color:#FF1493'>pink</b> = High",
sep = ""
),
x = ""
) +
guides(
fill = "none",
color = "none"
) +
theme(
text = element_text(size = 22),
axis.title.y = element_markdown(size = 22),
axis.text.x = element_text(size = 22)
)
}
g <- ggarrange(
NULL, g1, NULL, g2,
nrow = 4, ncol = 1,
heights = c(
(nrow(pvals[pvals$Gene == gene & pvals$TMB_labels != "ns", ]) / 2) / 10, 1,
(nrow(pvals[pvals$Gene == gene & pvals$PD_L1_labels != "ns", ]) / 2) / 10, 1.5
),
align = "v", labels = c("", "A", "", "B"), font.label = list(size = 30), label.y = 1.5
)
ggsave(
"Output/Plots/TMB_PDL1_dist_KRAS_drivers.tiff", g,
device = "tiff", bg = "white",
width = 20, height = 20, limitsize = FALSE
)
g
```
## Co-occurrence and network analysis
### Co-occurrence and network analysis of gene-level alterations and associations with immunotherapy markers
```{r gene-level-co-occurrence}
#| label: fig-gene-level-co-occurrence
#| fig-cap: "Gene-level co-occurrence and network analysis of genomic alterations in NSCLC"
#| fig-width: 30
#| fig-height: 13
### Co-occurrence analysis
# extract data needed for alteration matrix, isolating unique instances
plot.data <- unique(variant.df[, c("de_id", "variant_type", "gene")])
# create binary matrix of genes and detected alterations for each patient
alt.matrix <- t(
reshape2::acast(
plot.data, de_id ~ gene,
value.var = "variant_type",
fun.aggregate = function(x) {
ifelse(length(x) > 0, 1, 0)
}
)
)
# calculate number of alterations per gene
plot.data <- ddply(plot.data, .(gene, variant_type), summarize, var_count = length(de_id))
# convert alteration counts to frequencies using appropriate denominators
for (j in names(table(plot.data$variant_type))) {
total.patients <- ifelse(
j == "CNV",
sum(!(patient.df$DNA_CNV %in% c("Fail", "Not Performed"))),
ifelse(
j == "Fusion",
sum(!(patient.df$RNA_Rearrangement %in% c("Fail", "Not Performed"))),
sum(!(patient.df$DNA_SNV %in% c("Fail", "Not Performed")))
)
)
plot.data$var_freq[plot.data$variant_type == j] <-
plot.data$var_count[plot.data$variant_type == j] / total.patients
}
# calculate combined frequencies of alterations per gene and sort
# from highest to lowest
gene.freq <- ddply(
plot.data, .(gene), summarize,
freq = sum(var_freq),
perc = paste(round(sum(var_freq), 4) * 100, "%", sep = "")
)
gene.freq <- gene.freq[order(gene.freq$freq, decreasing = TRUE), ]
# remove genes that have <1% prevalence when rounded using calculated gene frequencies
alt.matrix <- alt.matrix[gene.freq$gene[round(gene.freq$freq, 3) >= 0.01], ]
# sort alteration matrix based on mutual exclusivity of alterations
alt.matrix <- memoSort(alt.matrix)
# create color vector to color genes that have tier 1 or 2 alterations
gene.col <- ifelse(
gene.freq$gene %in%
therapy.df$marker_component_marker[grep("Guideline", therapy.df$amp_cap_asco_tier)],
"red",
ifelse(
gene.freq$gene %in%
therapy.df$marker_component_marker[grep("Clinical", therapy.df$amp_cap_asco_tier)],
"orange", "black"
)
)
names(gene.col) <- gene.freq$gene
# calculate pairwise odds ratios, 95% CI, and p-values from Fisher's exact test
# and calculate the proportion of patient overlap between two genes out of all
# the patients with a detected alterations for those genes
co.occur.out <- alterationCoOcurr(t(alt.matrix))
results <- co.occur.out$result.summary
# write out results for significant alteration co-occurrences
sub.results <- rbind(
data.frame(
`Feature 1` = "Gene 1",
`Feature 2` = "Gene 2",
`Proportion of co-occurrence` = "Proportion of co-occurrence",
OR = "OR",
`OR 95%CI lower` = "OR 95%CI lower",
`OR 95%CI upper` = "OR 95%CI upper",
`P-value` = "P-value",
`Adjusted P-value` = "FDR q-value",
check.names = FALSE
),
data.frame(
`Feature 1` = "Significantly co-occurring alterations (OR > 1 and FDR q-value < 0.05)",
`Feature 2` = "",
`Proportion of co-occurrence` = "",
OR = "",
`OR 95%CI lower` = "",
`OR 95%CI upper` = "",
`P-value` = "",
`Adjusted P-value` = "",
check.names = FALSE
),
results[
results$OR > 1 &
results$`Adjusted P-value` < 0.05,
],
data.frame(
`Feature 1` = "Significantly exclusive alterations (OR < 1 and FDR q-value < 0.05)",
`Feature 2` = "",
`Proportion of co-occurrence` = "",
OR = "",
`OR 95%CI lower` = "",
`OR 95%CI upper` = "",
`P-value` = "",
`Adjusted P-value` = "",
check.names = FALSE
),
results[
results$OR < 1 &
results$`Adjusted P-value` < 0.05,
]
)
wb <- createWorkbook()
addWorksheet(wb, "Co-occurrence")
writeData(
wb, "Co-occurrence", sub.results,
keepNA = TRUE, colNames = FALSE
)
setColWidths(
wb, "Co-occurrence",
cols = seq_len(ncol(sub.results)),
widths = rep(15, ncol(sub.results))
)
addStyle(
wb, "Co-occurrence",
cols = seq_len(ncol(sub.results)), rows = 1:(nrow(sub.results) + 1),
gridExpand = TRUE, style = left_just, stack = TRUE
)
mergeCells(
wb, "Co-occurrence",
cols = c(1:ncol(sub.results)),
rows = grep("co-occurring", sub.results[, 1])
)
mergeCells(
wb, "Co-occurrence",
cols = seq_len(ncol(sub.results)),
rows = grep("exclusive", sub.results[, 1])
)
addStyle(
wb, "Co-occurrence",
cols = seq_len(ncol(sub.results)),
rows = c(
1, grep("co-occurring", sub.results[, 1]),
grep("exclusive", sub.results[, 1])
),
gridExpand = TRUE, style = bold, stack = TRUE
)
addStyle(
wb, "Co-occurrence",
cols = seq_len(ncol(sub.results)),
rows = c(1, 2, (nrow(sub.results) + 1)), gridExpand = TRUE,
style = horizontal_border_med, stack = TRUE
)
addStyle(
wb, "Co-occurrence",
cols = seq_len(ncol(sub.results)),
rows = c(
grep("co-occurring", sub.results[, 1]) + 1,
grep("exclusive", sub.results[, 1]),
grep("exclusive", sub.results[, 1]) + 1
),
gridExpand = TRUE, style = horizontal_border_thin, stack = TRUE
)
convertNum(sub.results, wb, "Co-occurrence", FALSE)
saveWorkbook(
wb, "Output/Tables/Co_occurrence_results_alterations.xlsx",
overwrite = TRUE
)
# convert odds ratios < 1 to negative inverse to make color scales more equal among
# values and cap odds ratios to reduce effect of extreme outliers on color scale
co.occur.out$or.matrix[
co.occur.out$or.matrix < 1 & !is.na(co.occur.out$or.matrix)
] <- -(1 / co.occur.out$or.matrix[
co.occur.out$or.matrix < 1 & !is.na(co.occur.out$or.matrix)
])
co.occur.out$or.matrix[co.occur.out$or.matrix > 10] <- 10
co.occur.out$or.matrix[co.occur.out$or.matrix < -10] <- -10
# mask half of the matrix for plotting
co.occur.out$or.matrix[upper.tri(co.occur.out$or.matrix)] <- NA
co.occur.out$padj.matrix[upper.tri(co.occur.out$padj.matrix)] <- NA
co.occur.out$prop.matrix[upper.tri(co.occur.out$prop.matrix)] <- NA
# combine odds ratios and p-values for plotting
plot.data <- merge(
merge(
reshape2::melt(
co.occur.out$or.matrix[
rev(rownames(co.occur.out$or.matrix)),
rev(colnames(co.occur.out$or.matrix))
]
),
reshape2::melt(
co.occur.out$padj.matrix[
rev(rownames(co.occur.out$padj.matrix)),
rev(colnames(co.occur.out$padj.matrix))
]
),
by = c(1, 2),
suffix = c(".or", ".p"),
sort = FALSE
),
reshape2::melt(
co.occur.out$prop.matrix[
rev(rownames(co.occur.out$prop.matrix)),
rev(colnames(co.occur.out$prop.matrix))
]
),
by = c(1, 2),
sort = FALSE
)
# create heatmap of results
g1 <- ggplot(plot.data, aes(y = Var1, x = Var2, fill = value.or)) +
geom_tile(color = "white") +
geom_tile(
color = "white",
fill = ifelse(plot.data$value == 0 &
!is.na(plot.data$value) &
plot.data$value.p >= 0.05, "white", NA)
) +
geom_point(
aes(shape = ifelse(value == 0 & !is.na(value),
"No overlapping\nalterations", ""
)),
size = 2, color = "black"
) +
geom_point(aes(size = ifelse(value.p < 0.05, "FDR q-value\n< 0.05", ""))) +
scale_y_discrete(position = "right") +
scale_fill_gradient2(
low = "#075AFF", mid = "white", high = "#FF0000", midpoint = 0,
name = "Fisher's exact test\nodds ratio",
labels = c(expression("" <= 0.1), "0.2", "1", "5", expression("" >= 10)),
na.value = "white"
) +
scale_size_manual(
values = c(2, NA), na.value = NA, breaks = "FDR q-value\n< 0.05",
name = NULL
) +
scale_shape_manual(
values = c(4, NA), na.value = NA, breaks = "No overlapping\nalterations",
name = NULL
) +
coord_fixed() +
theme(
text = element_text(size = 12),
axis.text.x = element_text(
angle = 90, hjust = 1, vjust = 0.5,
color = rev(gene.col[rownames(alt.matrix)])
),
axis.title.x = element_blank(),
axis.text.y = element_text(color = rev(gene.col[rownames(alt.matrix)])),
axis.title.y = element_blank(),
legend.position = c(0.1, 0.5),
legend.text.align = 0,
legend.key = element_blank(),
legend.title = element_text(size = 20),
legend.text = element_text(size = 20),
panel.grid.major = element_blank(),
panel.grid.minor = element_blank(),
panel.border = element_blank(),
panel.background = element_rect(color = "white"),
plot.margin = margin(1, 1, 1, 40)
)
### Network analysis
# get results with FDR < 0.05 to include in network analysis
net.data <- plot.data[plot.data$value.p < 0.05 & !is.na(plot.data$value.p), ]
# create variable with reverted ORs
net.data$revert.or <- net.data$value.or
net.data$revert.or[net.data$revert.or < 0] <-
1 / abs(net.data$revert.or[net.data$revert.or < 0])
# create column to designate effect direction, then make absolute OR variable
net.data$or.direction <- ifelse(net.data$value.or < 0, "-", "+")
net.data$abs.value.or <- abs(net.data$value.or)
# import data into igraph and make ORs the weight
net.igraph <- graph_from_data_frame(net.data, directed = FALSE)
E(net.igraph)$weight <- net.data$revert.or
# calculate degree and weighted degree of each node
V(net.igraph)$degree <- degree(net.igraph)
V(net.igraph)$weighted.degree <- strength(net.igraph, weights = E(net.igraph)$abs.value.or)
# detect sub-communities within the network
set.seed(1234)
net.clust <- cluster_infomap(
net.igraph,
e.weights = E(net.igraph)$revert.or,
v.weights = V(net.igraph)$weighted.degree
)
V(net.igraph)$cluster <- net.clust$membership
# mark which genes had Tier 1 or 2 alterations detected
V(net.igraph)$amp_cap_asco_tier <- ifelse(
V(net.igraph)$name %in%
therapy.df$marker_component_marker[grep("Guideline", therapy.df$amp_cap_asco_tier)],
"Guideline indicated",
ifelse(
V(net.igraph)$name %in%
therapy.df$marker_component_marker[grep("Clinical", therapy.df$amp_cap_asco_tier)],
"Clinical trial or therapy in other tumor type", "Neither, but known\npathogenic"
)
)
# output network data
net.results <- data.frame(
`Feature` = V(net.igraph)$name,
Degree = V(net.igraph)$degree,
`Weighted degree` = V(net.igraph)$weighted.degree,
`Network cluster` = V(net.igraph)$cluster,
check.names = FALSE
)
wb <- createWorkbook()
addWorksheet(wb, "Network results")
writeData(wb, "Network results", net.results, keepNA = TRUE, colNames = TRUE)
setColWidths(
wb, "Network results",
cols = seq_len(ncol(net.results)),
widths = c(rep(15, ncol(net.results)))
)
addStyle(
wb, "Network results",
cols = seq_len(ncol(net.results)), rows = 1,
gridExpand = TRUE, style = bold, stack = TRUE
)
addStyle(
wb, "Network results",
cols = seq_len(ncol(net.results)),
rows = c(1, 2, (nrow(net.results) + 2)), gridExpand = TRUE,
style = horizontal_border_med, stack = TRUE
)
convertNum(net.results, wb, "Network results", TRUE)
saveWorkbook(wb, "Output/Tables/Network_results.xlsx", overwrite = TRUE)
# plot network using force atlas 2 algorithm to place nodes
set.seed(1234)
layout <- ForceAtlas2::layout.forceatlas2(
data.frame(from = net.data$Var1, to = net.data$Var2, weights = net.data$revert.or),
directed = FALSE, iterations = 2000, k = 1000, gravity = 400, plotstep = 0
)
V(net.igraph)$amp_cap_asco_tier[
V(net.igraph)$amp_cap_asco_tier == "Clinical trial or therapy in other tumor type"
] <- "Clinical trial or therapy\nin other tumor type"
g2 <- ggraph(net.igraph, layout = as.matrix(layout[, -1])) +
geom_edge_fan(aes(color = value.or), alpha = 0.75, width = 1.5) +
geom_node_point(
aes(
color = factor(as.character(cluster), levels = paste(1:max(cluster))),
shape = factor(
amp_cap_asco_tier,
levels = c(
"Guideline indicated",
"Clinical trial or therapy\nin other tumor type",
"Neither, but known\npathogenic"
)
)
),
size = log(V(net.igraph)$weighted.degree) * 6, alpha = 0.5
) +
geom_node_point(
aes(
color = factor(as.character(cluster), levels = paste(1:max(cluster))),
shape = factor(
amp_cap_asco_tier,
levels = c(
"Guideline indicated",
"Clinical trial or therapy\nin other tumor type",
"Neither, but known\npathogenic"
)
)
),
size = log(V(net.igraph)$weighted.degree) * 5
) +
geom_node_text(aes(label = name), size = 4) +
geom_node_text(aes(label = name), size = 4) +
scale_edge_colour_gradient2(
low = "#075AFF", mid = "white", high = "#FF0000", midpoint = 0,
name = "Fisher's exact test\nodds ratio (edges)",
breaks = c(-10, -5, 0, 5, 10),
labels = c(expression("" <= 0.1), "0.2", "1", "5", expression("" >= 10)),
na.value = "white"
) +
scale_shape_manual(values = c(16, 15, 17)) +
guides(
color = guide_legend(
title = "Algorithm detected\ncluster (nodes)",
override.aes = list(size = 8)
),
shape = guide_legend(
title = "Clinical evidence (nodes)",
override.aes = list(size = 8),
keyheight = 3
),
size = "none"
) +
theme_void() +
theme(
legend.title = element_text(size = 20),
legend.text = element_text(size = 20),
legend.spacing.y = unit(10, "mm")
)
# output both co-occurrence results and network
g <- ggarrange(
NULL, g1, NULL, g2,
ncol = 4, widths = c(-0.01, 0.8, -0.01, 1.2), labels = c("", "A", "", "B"),
font.label = list(size = 35)
)
ggsave(
"Output/Plots/Co_occurrence_network_plots.tiff", g,
device = "tiff", bg = "white", width = 30, height = 13
)
g
```
### Association of gene clusters with histology, TMB, and PD-L1 IHC
```{r co-occuring-clust-assoc}
#| label: fig-co-occuring-clust-assoc
#| fig-cap: "Associations between variant number in gene clusters and histology, TMB, and PD-L1 IHC"
#| fig-width: 50
#| fig-height: 30
### Co-occurring cluster associations
# test/plot association of each cluster with histology, TMB, and PD-L1
plot.list <- list()
for (clust in unique(net.results$`Network cluster`)) {
# calculate number of variants in the cluster and get gene combos
n.alts <- data.frame(
table(variant.df$de_id[
variant.df$gene %in% net.results$Feature[net.results$`Network cluster` == clust] &
(variant.df$gene %in% results$`Feature 1`[results$OR > 1] |
variant.df$gene %in% results$`Feature 2`[results$OR > 1])
])
)
for (j in n.alts$Var1) {
n.alts$gene.combos[n.alts$Var1 == j] <- paste(
unique(variant.df$gene[
variant.df$de_id == j &
variant.df$gene %in% net.results$Feature[net.results$`Network cluster` == clust]
]),
collapse = " + "
)
}
plot.data <- merge(
patient.df[, c("de_id", "histology_group", "TMB", "PD_L1_IHC_22C3_interp")],
n.alts,
by = 1, all.x = TRUE
)
plot.data$Freq[plot.data$Freq > 2] <- 2
plot.data$Freq[
is.na(plot.data$Freq) &
plot.data$de_id %in% patient.df$de_id[
patient.df$DNA_SNV %in% c("Pass", "Limited") |
patient.df$DNA_CNV %in% c("Pass", "Limited") |
patient.df$RNA_Rearrangements %in% c("Pass", "Limited")
]
] <- 0
plot.data <- plot.data[!is.na(plot.data$Freq), ]
plot.data$histology_group[grep("Squamous", plot.data$histology_group)] <- "Squamous cell"
# plot breakdown of most common gene combinations
plot.data.prop <- ddply(
plot.data[
!is.na(plot.data$gene.combos) &
plot.data$gene.combos %in%
names(table(plot.data$gene.combos)[table(plot.data$gene.combos) > 3]),
],
.(gene.combos), summarize,
count = length(de_id),
prop = length(de_id) / nrow(plot.data[!is.na(plot.data$gene.combos), ])
)
plot.data.prop <- plot.data.prop[order(plot.data.prop$prop, decreasing = TRUE), ]
g1 <- ggplot() +
annotate(
"text",
x = 1, y = 1,
size = ifelse(nrow(plot.data.prop) <= 15, 7.5, 130 / nrow(plot.data.prop)),
label = paste(
plot.data.prop$gene.combo, " (N=", plot.data.prop$count,
", ", round(plot.data.prop$prop, 3) * 100, "%)",
sep = "", collapse = "\n"
)
) +
theme_void()
# test/plot association with histology group
plot.data.prop <- rbind(
data.frame(
Freq = "No\nvariants",
ddply(
plot.data[plot.data$Freq == "0", ], .(histology_group), summarize,
count = length(de_id),
prop = length(de_id) / nrow(plot.data[plot.data$Freq == "0", ])
)
),
data.frame(
Freq = "One\nvariant",
ddply(
plot.data[plot.data$Freq == "1", ], .(histology_group), summarize,
count = length(de_id),
prop = length(de_id) / nrow(plot.data[plot.data$Freq == "1", ])
)
),
data.frame(
Freq = "Two or\nmore variants",
ddply(
plot.data[plot.data$Freq == "2", ], .(histology_group), summarize,
count = length(de_id),
prop = length(de_id) / nrow(plot.data[plot.data$Freq == "2", ])
)
)
)
pvals <- data.frame()
for (j in unique(plot.data.prop$Freq)) {
for (k in unique(plot.data.prop$Freq)) {
if (j != k) {
fish.p <- fisher.test(
matrix(
c(
plot.data.prop$count[plot.data.prop$Freq == j],
plot.data.prop$count[plot.data.prop$Freq == k]
),
nrow = 2
)
)$p.value
pvals <- rbind(
pvals,
data.frame(
`Alteration group 1` = j, `Alteration group 2` = k, P = fish.p,
check.names = FALSE
)
)
}
}
}
pvals <- pvals[c(1, 2, 4), ]
pvals$P <- p.adjust(pvals$P, method = "bonferroni")
g2 <- ggplot(plot.data.prop, aes(x = Freq, y = prop, fill = histology_group)) +
geom_bar(stat = "identity", color = "black", alpha = 0.75) +
geom_label(
aes(
label = paste(count, "\n(", round(prop, 2) * 100, "%)", sep = ""),
color = histology_group
),
position = position_fill(0.5), size = 7, fill = "white"
) +
geom_signif(
xmin = c(1, 1, 2), xmax = c(2, 3, 3), y_position = c(1.05, 1.15, 1.25),
annotations = ifelse(pvals$P < 1E-4, "<0.0001", round(pvals$P, 3)),
textsize = 7, tip_length = 0
) +
scale_y_continuous(labels = scales::percent, breaks = seq(0, 1, 0.25)) +
scale_fill_manual(values = c("black", "grey50")) +
scale_color_manual(values = rep("black", 2)) +
theme_classic() +
theme(
text = element_text(size = 24),
axis.title.x = element_blank(),
axis.title.y = element_markdown(size = 24),
legend.title = element_blank(),
legend.position = "top"
) +
guides(fill = "none", color = "none") +
labs(y = paste(
"Tumors with histology<br>",
"<b style='color:#000000'>black</b> = Adenocarcinoma<br>",
"<b style='color:#7F7F7F'>gray</b> = Squamous cell<br>",
sep = ""
))
# test/plot association with TMB
summ.stats <- ddply(plot.data, .(Freq), summarize, median = median(na.omit(TMB)))
g3 <- ggplot(
plot.data[!is.na(plot.data$TMB), ],
aes(x = as.character(Freq), y = log(TMB + 1))
) +
geom_violin(alpha = 0.5, scale = "width", fill = "grey50", color = "black") +
geom_boxplot(width = 0.3, fill = "white", color = "black", outlier.size = 0) +
geom_point(color = "black", size = 3.5) +
geom_hline(yintercept = log(10 + 1), color = "black", linetype = "dashed") +
geom_text(data = summ.stats, aes(label = round(median, 1), y = -0.2), size = 7) +
stat_pwc(
y.position = max(na.omit(log(plot.data$TMB + 1))) + 0.25,
method = "t_test", p.adjust.method = "bonferroni", label = "p.adj.format",
step.increase = 0.1, label.size = 7, , tip.length = 0
) +
scale_x_discrete(labels = c("No\nvariants", "One\nvariant", "Two or\nmore variants")) +
scale_y_continuous(
breaks = c(0, 1, 2, 3, 4, 5),
labels = c(
0,
paste("1 (", round(exp(1) - 1, 1), ")", sep = ""),
paste("2 (", round(exp(2) - 1, 1), ")", sep = ""),
paste("3 (", round(exp(3) - 1, 1), ")", sep = ""),
paste("4 (", round(exp(4) - 1, 1), ")", sep = ""),
paste("5 (", round(exp(5) - 1, 1), ")", sep = "")
)
) +
theme_classic() +
theme(
text = element_text(size = 24),
axis.title.x = element_blank()
) +
labs(y = "Log transformed TMB (Mut/Mb)")
# test/plot association with PD-L1 IHC
plot.data.prop <- rbind(
data.frame(
Freq = "No\nvariants",
ddply(
plot.data[plot.data$Freq == "0", ], .(PD_L1_IHC_22C3_interp), summarize,
count = length(de_id),
prop = length(de_id) / nrow(plot.data[plot.data$Freq == "0", ])
)
),
data.frame(
Freq = "One\nvariant",
ddply(
plot.data[plot.data$Freq == "1", ], .(PD_L1_IHC_22C3_interp), summarize,
count = length(de_id),
prop = length(de_id) / nrow(plot.data[plot.data$Freq == "1", ])
)
),
data.frame(
Freq = "Two or\nmore variants",
ddply(
plot.data[plot.data$Freq == "2", ], .(PD_L1_IHC_22C3_interp), summarize,
count = length(de_id),
prop = length(de_id) / nrow(plot.data[plot.data$Freq == "2", ])
)
)
)
plot.data.prop <- plot.data.prop[!is.na(plot.data.prop$PD_L1_IHC_22C3_interp), ]
pvals <- data.frame()
for (j in unique(plot.data.prop$Freq)) {
for (k in unique(plot.data.prop$Freq)) {
if (j != k) {
chisq.p <- chisq.test(
matrix(
c(
plot.data.prop$count[plot.data.prop$Freq == j],
plot.data.prop$count[plot.data.prop$Freq == k]
),
nrow = 3
)
)$p.value
pvals <- rbind(
pvals,
data.frame(
`Alteration group 1` = j, `Alteration group 2` = k, P = chisq.p,
check.names = FALSE
)
)
}
}
}
pvals <- pvals[c(1, 2, 4), ]
pvals$P <- p.adjust(pvals$P, method = "bonferroni")
g4 <- ggplot(plot.data.prop, aes(x = Freq, y = prop, fill = PD_L1_IHC_22C3_interp)) +
geom_bar(stat = "identity", color = "black", alpha = 0.5) +
geom_label(
aes(
label = paste(count, "\n(", round(prop, 2) * 100, "%)", sep = ""),
color = PD_L1_IHC_22C3_interp
),
position = position_fill(0.5), size = 7, fill = "white"
) +
geom_signif(
xmin = c(1, 1, 2), xmax = c(2, 3, 3), y_position = c(1.05, 1.15, 1.25),
annotations = ifelse(pvals$P < 1E-4, "<0.0001", round(pvals$P, 3)),
textsize = 7, tip_length = 0
) +
scale_y_continuous(labels = scales::percent, breaks = seq(0, 1, 0.25)) +
scale_fill_manual(values = c("blue", "purple", "deeppink")) +
scale_color_manual(values = rep("black", 3)) +
theme_classic() +
theme(
text = element_text(size = 24),
axis.title.x = element_blank(),
axis.title.y = element_markdown(size = 24),
legend.title = element_blank(),
legend.position = "top"
) +
guides(fill = "none", color = "none") +
labs(
y = paste(
"Tumors with PD-L1 level<br>",
"<b style='color:#0000FF'>blue</b> = Negative<br>",
"<b style='color:#A020F0'>purple</b> = Low<br>",
"<b style='color:#FF1493'>pink</b> = High",
sep = ""
)
)
# output plots
g <- ggarrange(
g1, NULL, g2, NULL, g3, NULL, g4,
nrow = 1, ncol = 7,
widths = c(1, 0, 1, 0, 1, 0, 1)
)
plot.list[[length(plot.list) + 1]] <- g
names(plot.list)[length(plot.list)] <- clust
}
# output plots of specific clusters with significant associations
g <- ggarrange(
plotlist = plot.list[c(2, 3, 4, 8, 9, 11, 12, 13)],
nrow = 4, ncol = 2, heights = rep(1, 4), widths = rep(1, 2),
labels = c("A", "B", "C", "D", "E", "F", "G", "H"), font.label = list(size = 40)
)
ggsave(
"Output/Plots/Gene_module_associations_sigassocs.tiff", g,
device = "tiff", bg = "white",
width = 50, height = 30, limitsize = FALSE
)
g
# combine and save other plots
g <- ggarrange(
plotlist = plot.list[c(1, 5, 6, 7, 10)],
nrow = 3, ncol = 2, heights = rep(1, 4), widths = rep(1, 4),
labels = c("A", "B", "C", "D", "E"), font.label = list(size = 40)
)
ggsave(
"Output/Plots/Gene_module_associations_others.tiff", g,
device = "tiff", bg = "white",
width = 50, height = 25, limitsize = FALSE
)
g
```
# R session information
```{r r-session, echo=FALSE}
sessionInfo()
```