This script prints Figure 2a: Barplot representing the number of CMD patients receiving each drug (on the horizontal axis) singly or in combination with a specified number of other drugs.
Required file in the input_data folder:
Supplementary Tables 1 to 4 include information on the cohort characteristics and drug intake with source metadata. Source metadata for the MetaCardis cohort includes disease group, gender, age, study center, body mass index (BMI, kg/m2), alternative healthy eating index (aHEI), diet diversity score (DDS), and dietary approaches to stop hypertension (DASH), physical activity and manual work levels, and smoking status. Source data includes information on the drug intake, drug combinations, dosage and antibiotic use analyzed in the MetaCardis cohort.
File containing the mapping between drug IDs in the metadata table and drug names and variable type
File containing the mapping between drug name and indication.
Load necessary libraries.
set.seed(0)
library(reshape2)
library(stringr)
library(dplyr)
library(arules)
library(arulesViz)
library(igraph)
library(ggplot2)
library(RColorBrewer)
library(readxl)
Load data required for plotting from files.
folderName <- './input_data/'
d.out <- "./"
drug.indication <- paste(folderName, "drug.indication.tsv", sep="")
dic.anno <- paste(folderName, "data_drugs_dict.txt", sep="")
dir.create(d.out, showWarnings = F)
infilename <- 'Supplementary_Tables_1_to_4_2019-09-13434.xlsx'
# load patient drug uptake information
sampleData_drug <- as.matrix(read_excel(paste(folderName,infilename,sep=''),
sheet = "ST 2b"))#,
# load patient metadata information
sampleData_meta <- as.matrix(read_excel(paste(folderName,infilename,sep=''),
sheet = "ST 1b"))#,
# load drug combination intake information
sampleData_comb <- as.matrix(read_excel(paste(folderName,infilename,sep=''),
sheet = "ST 3b"))#,
Prepare data for processing.
# prepare drug table
rownames(sampleData_drug) = sampleData_drug[, "SampleID"]
drop <- c( "SampleID" ,"PatientGroup","DRUGTOTAL")
sampleData_drug = sampleData_drug[,!(colnames(sampleData_drug) %in% drop)]
# prepare drug comb table
rownames(sampleData_comb) <- sampleData_comb[, "SampleID"]
drop <- c( "SampleID" ,"PatientGroup")
sampleData_comb = sampleData_comb[,!(colnames(sampleData_comb) %in% drop)]
# prepare metadata
colnames(sampleData_meta)[2:6]<-c("PATGROUPFINAL_C", "GENDER", "BMI_C", "AGE", "CENTER_C")
rownames(sampleData_meta) <- sampleData_meta[, "SampleID"]
sample_extended_meta <- merge(sampleData_meta, sampleData_comb, by="row.names", all=TRUE)
drop <- c( "Row.names")
sample_extended_meta = sample_extended_meta[,!(colnames(sample_extended_meta) %in% drop)]
Load data files for indication and drug abbreviations.
# load drug abbreviation dictionary
dic.anno <- read.delim(dic.anno, stringsAsFactors = F)
# load rug indication info
drug.indication = read.delim(drug.indication, stringsAsFactors = F)
drugs <- sampleData_drug
metadata <- sample_extended_meta
Edit Dictionary.
dic.anno <- dic.anno %>%
mutate(DisplayName = str_replace(DisplayName, " intake", ""))
Edit input data.
## Filter out healthy patients
metadata <- metadata[ metadata$PATGROUPFINAL_C != "8",]
drugs <- drugs[match(metadata$SampleID, rownames(drugs)) ,]
identical(metadata$SampleID %>% as.character(), rownames(drugs))
## [1] TRUE
## Remove collective drugs
drugs <- drugs [, !colnames(drugs) %in%
dic.anno[dic.anno$DisplayName %in%
c("Antibiotics total",
"Unknown antibiotic",
"Any antilipid treatment",
"Any antihypertensive treatment",
"Any antidiabetic treatment"),
"VariableID"]]
Filter out healthy patients.
metadata <- metadata[ metadata$PATGROUPFINAL_C != "8",]
drugs <- drugs[match(metadata$SampleID, rownames(drugs)) ,]
# check consistency between drug variable names
identical(metadata$SampleID %>% as.character(), rownames(drugs))
## [1] TRUE
Remove collective drugs.
drugs <- drugs [, !colnames(drugs) %in%
dic.anno[dic.anno$DisplayName %in%
c("Antibiotics total",
"Unknown antibiotic",
"Any antilipid treatment",
"Any antihypertensive treatment",
"Any antidiabetic treatment"),
"VariableID"]]
Sync data with dictionary.
drugs <- drugs[,colnames(drugs) %in% dic.anno$VariableID]
dic.anno <- dic.anno[match(colnames(drugs), dic.anno$VariableID),]
# check consistency between drug variable names
identical(dic.anno$VariableID, colnames(drugs))
## [1] TRUE
Convert from factor to numeric.
for (i in 1:ncol(drugs)){
drugs[,i] <- as.numeric(drugs[,i])-1
}
Get frequent items.
## Format data input
drugs.d <- drugs
drugs.d <- as.data.frame(drugs.d >= 0)
## Mine frequent itemsets with the Eclat algorithm
freq.all=eclat(drugs.d,parameter=list(supp = 0.05))
## Get clean data
freq.list <- as.data.frame(inspect(freq.all))
freq.list$items <- gsub("\\{", "", freq.list$items)
freq.list$items <- gsub("\\}", "", freq.list$items)
freq.list$items <- gsub("\\s+", " ", freq.list$items)
freq.list$items <- gsub(", ", ",", freq.list$items)
freq.list$support <- as.numeric(as.character(freq.list$support))
freq.list$transIdenticalToItemsets <- as.numeric(as.character(freq.list$transIdenticalToItemsets))
freq.list$count <- as.numeric(as.character(freq.list$count))
# Get most frequently used drugs ----
freq.list <- freq.list %>%
filter(!str_detect(items, ","))
Format to plot.
## Add display name to the frequency list
freq.list <- freq.list %>%
left_join(dic.anno %>% select(VariableID, DisplayName), by = c("items" = "VariableID"))
# convert drugs to dataframe
drugs <- as.data.frame((drugs>=0)*1)
## Count co-used drugs
co.usage = NULL
for (i in colnames(drugs)){
d.temp <- drugs %>%
filter(get(i) == 1) %>%
select(-i) %>%
rowSums(na.rm = T) %>%
plyr::count() %>%
#add.coding
mutate(code = if_else(x == 0,
"single",
if_else(x < 8,
paste0("+",x),
"+8 or more"))) %>%
group_by(code) %>%
summarise(frequency = sum(freq)) %>%
mutate(drug = i)
co.usage <- rbind(co.usage, d.temp)
rm(d.temp)
}
## Note: Using an external vector in selections is ambiguous.
## i Use `all_of(i)` instead of `i` to silence this message.
## i See <https://tidyselect.r-lib.org/reference/faq-external-vector.html>.
## This message is displayed once per session.
## Format to plot -----
co.usage <- co.usage %>%
#add annotation
left_join(dic.anno %>% select(VariableID, DisplayName), by = c("drug" = "VariableID")) %>%
filter(DisplayName %in% freq.list$DisplayName) %>%
#do some ordering for the plot
mutate(usage = factor(code, levels = c("single",
"+1",
"+2",
"+3",
"+4",
"+5",
"+6",
"+7",
"+8 or more"))) %>%
mutate(DisplayName = factor(DisplayName, levels = freq.list$DisplayName)) %>%
mutate(drug = DisplayName)
Plot.
ggplot(co.usage, aes(x = drug, y = frequency, fill = usage)) +
geom_bar(stat = "identity") +
scale_fill_brewer(palette = "Paired") +
theme_classic(base_size = 14) +
theme(axis.text.x = element_text(size = 14),
axis.text.y = element_text(size = 14)) +
ggpubr::rotate_x_text() +
labs(x = "", y = "Number of patients")
# UNCOMMENT TO PRINT TO FILE
#f.name <- paste(d.out, "fig2a_frequent.list.barplot.pdf", sep = "/")
#pdf(f.name)
# ggplot(co.usage, aes(x = drug, y = frequency, fill = usage)) +
# geom_bar(stat = "identity") +
# scale_fill_brewer(palette = "Paired") +
# theme_classic(base_size = 14) +
# theme(axis.text.x = element_text(size = 14),
# axis.text.y = element_text(size = 14)) +
# ggpubr::rotate_x_text() +
# labs(x = "", y = "Number of patients")
#dev.off()
sessionInfo()
## R version 3.6.1 (2019-07-05)
## Platform: x86_64-w64-mingw32/x64 (64-bit)
## Running under: Windows 10 x64 (build 17763)
##
## Matrix products: default
##
## locale:
## [1] LC_COLLATE=English_United States.1252
## [2] LC_CTYPE=English_United States.1252
## [3] LC_MONETARY=English_United States.1252
## [4] LC_NUMERIC=C
## [5] LC_TIME=English_United States.1252
##
## attached base packages:
## [1] stats graphics grDevices utils datasets methods base
##
## other attached packages:
## [1] readxl_1.3.1 RColorBrewer_1.1-2 ggplot2_3.3.3 igraph_1.2.6
## [5] arulesViz_1.4-0 arules_1.6-7 Matrix_1.3-2 dplyr_1.0.5
## [9] stringr_1.4.0 reshape2_1.4.4
##
## loaded via a namespace (and not attached):
## [1] Rcpp_1.0.6 lattice_0.20-41 tidyr_1.1.3 assertthat_0.2.1
## [5] digest_0.6.27 foreach_1.5.1 utf8_1.2.1 R6_2.5.0
## [9] cellranger_1.1.0 plyr_1.8.6 backports_1.2.1 evaluate_0.14
## [13] highr_0.9 pillar_1.6.0 rlang_0.4.10 curl_4.3
## [17] rstudioapi_0.13 data.table_1.14.0 car_3.0-10 jquerylib_0.1.4
## [21] rmarkdown_2.7 labeling_0.4.2 foreign_0.8-76 munsell_0.5.0
## [25] broom_0.7.6 compiler_3.6.1 xfun_0.22 pkgconfig_2.0.3
## [29] htmltools_0.5.1.1 tidyselect_1.1.0 tibble_3.1.0 seriation_1.2-9
## [33] rio_0.5.26 codetools_0.2-18 fansi_0.4.2 crayon_1.4.1
## [37] withr_2.4.2 ggpubr_0.4.0 grid_3.6.1 jsonlite_1.7.2
## [41] gtable_0.3.0 lifecycle_1.0.0 registry_0.5-1 DBI_1.1.1
## [45] magrittr_2.0.1 scales_1.1.1 zip_2.1.1 carData_3.0-4
## [49] cli_2.5.0 stringi_1.5.3 farver_2.1.0 ggsignif_0.6.1
## [53] bslib_0.2.4 ellipsis_0.3.1 generics_0.1.0 vctrs_0.3.6
## [57] openxlsx_4.2.3 iterators_1.0.13 tools_3.6.1 forcats_0.5.1
## [61] glue_1.4.2 purrr_0.3.4 hms_1.0.0 abind_1.4-5
## [65] yaml_2.2.0 colorspace_2.0-0 TSP_1.1-10 rstatix_0.7.0
## [69] knitr_1.33 haven_2.4.1 sass_0.3.1