library(readxl) # version 1.4.2
library(dplyr) # version 1.1.2
library(lubridate) # version 1.9.2
library(RColorBrewer) # version 1.1-3
library(FSA) # version 0.9.4
library(ggh4x) # version 0.2.4
library(fixest) # version 0.11.1
library(ggplot2) # version 3.4.2
library(qiime2R) # version 0.99.6
library(cowplot) # version 1.1.1
library(readr) # version 2.1.4
library(ggpubr) # version 0.6.0
library(ggrepel) # version 0.9.3
library(vegan) # version 2.6-4
library(ANCOMBC) # version 2.0.2
library(phyloseq) # version 1.42.0
library(tidyr) # version 1.3.0
library(microViz) # version 0.10.8
library(jsonlite) # version 1.8.4
library(MicrobiotaProcess) # version 1.12.3
library(gtools) # version 3.9.4
library(igraph) # version 1.4.2
library(stringr) # version 1.5.0
library(multcomp) # version 1.4-25
set.seed(5)
Metadata281 <- read_excel("data/Metadata281.xlsx")
# Rename some columns
colnames(Metadata281)[colnames(Metadata281) == 'CD4+_count'] <- 'CD4_count'
colnames(Metadata281)[colnames(Metadata281) == 'Cohort'] <- 'Cohort_Description'
colnames(Metadata281)[colnames(Metadata281) == 'HIV-1_viral_load'] <- 'HIV1_viral_load'
colnames(Metadata281)[colnames(Metadata281) == 'IL-6_pg_mL'] <- 'IL6_pg_mL'
# Create viral outcome column
Metadata281$HIV1_viral_load[Metadata281$HIV1_viral_load == 'ND'] <- 0
Metadata281$HIV1_viral_load <- as.numeric(Metadata281$HIV1_viral_load)
Metadata281$HIV1_viral_outcome <- 'Suppressed'
Metadata281$HIV1_viral_outcome[Metadata281$Cohort_Short == 'A' &
Metadata281$Visit == 3 &
Metadata281$HIV1_viral_load > 200] <- 'Viremic'
Metadata281$HIV1_viral_outcome[Metadata281$Cohort_Short == 'B' &
Metadata281$Visit == 2 &
Metadata281$HIV1_viral_load > 200] <- 'Viremic'
Metadata281$HIV1_viral_outcome <- as.factor(Metadata281$HIV1_viral_outcome)
# Calculate length on ART column
Metadata281$ARTStart[Metadata281$Cohort_Short == "B"] <- Metadata281$ART_Start_Date[Metadata281$Cohort_Short == "B"]
Metadata281$ARTStart <- as.Date(Metadata281$ARTStart, "%Y")
lubridate::day(Metadata281$ARTStart) <- 1
lubridate::month(Metadata281$ARTStart) <- 6
Metadata281$Visit_date <- as.Date(Metadata281$Visit_date, "%m/%d/%y")
Metadata281$LenghtOnArt_yrs[Metadata281$Cohort_Short == "A"] <- 0.5
Metadata281$LenghtOnArt_yrs[Metadata281$Cohort_Short == "C"] <- 0.0
Metadata281$LenghtOnArt_yrs[Metadata281$Cohort_Short == "B"] <- time_length(Metadata281$Visit_date[Metadata281$Cohort_Short == "B"] - Metadata281$ARTStart[Metadata281$Cohort_Short == "B"], "years")
# Make cleaner columns for visualizations
Metadata281$Location[Metadata281$Arm_Short == "MA"] <- "Urban"
Metadata281$Location[Metadata281$Arm_Short == "MT"] <- "Rural"
Metadata281$Cohort[Metadata281$Cohort_Short == "A"] <- "Naïve"
Metadata281$Cohort[Metadata281$Cohort_Short == "B"] <- "Exp"
Metadata281$Cohort[Metadata281$Cohort_Short == "C"] <- "HC"
Metadata281$Week[Metadata281$Visit == 2] <- 0
Metadata281$Week[Metadata281$Visit == 3] <- 24
Metadata281$Location <- as.factor(Metadata281$Location)
Metadata281$Cohort <- as.factor(Metadata281$Cohort)
Metadata281$Cohort <- ordered(Metadata281$Cohort, levels = c("Naïve", "Exp", "HC"))
Metadata281$Week <- as.factor(Metadata281$Week)
# make correct factored variables
fact_cols <- c("Gender", "HIV_Status", "Bristol_Score", "Water_Source_Num",
"Cotrimoxazole_HIV", "Manual_work_job", "Manual_chores_home",
"Head_of_household", "Education_Level", "Normal_work_transportation")
Metadata281[fact_cols] <- lapply(Metadata281[fact_cols], factor)
Metadata281$BMI_Cat <- ordered(Metadata281$BMI_Cat, levels = c("Severe_thinness", "Moderate_thinnes", "Mild_thinness", "Normal", "Overweight", "Obese"))
# make sure immune markers are numeric
immune_markers <- c("CD4_Percent",
"CD4_HLADRpos_CD38pos_Percent",
"CD8_HLADRpos_CD38pos_Percent",
"CD4_PD1_Percent",
"CD8_PD1_Percent",
"CD4_CD103_Percent",
"CD8_CD103_Percent",
"IL6_pg_mL",
"CRP_mg_L")
for (marker in immune_markers){
Metadata281[[marker]] <- as.numeric(Metadata281[[marker]])
}
SupplementalFig3 <- Metadata281
# Impute means for immune markers where needed
for(marker in immune_markers){
Metadata281[[marker]][is.na(Metadata281[[marker]])] <- mean(Metadata281[[marker]], na.rm = TRUE)
}
# Make sure to remove sample with no immune data
Metadata281 <- Metadata281 %>%
filter(SampleID != "ZIM033.3")
# Keep only samples with two visits
Metadata281 <- Metadata281[Metadata281$PID %in% Metadata281$PID[duplicated(Metadata281$PID)],]
# Remove patients that were viremic
vir_PID <- Metadata281 %>%
filter(HIV1_viral_outcome == "Viremic") %>%
dplyr::select(SampleID, PID, Cohort_Short, HIV1_viral_outcome)
PIDs <- unique(vir_PID$PID)
Metadata281$HIV1_viral_outcome[Metadata281$PID %in% PIDs] <- 'Viremic'
# get figure 1 inter data
inter_fig1 <- Metadata281 %>%
filter(!(HIV1_viral_outcome == "Viremic" & Cohort_Short == "B")) %>%
filter(Visit == 2)
# remove all viremmic patients from metadata
Metadata281 <- subset(Metadata281, !(PID %in% PIDs))
# Get dataframe of just baseline measurements
baseline <- Metadata281 %>% filter(Week == 0)
# Creating a color palettes
visitColors <- palette(brewer.pal(n = 5, name = "BrBG"))
cohortColors <- palette(brewer.pal(n = 5, name = "PiYG"))
# Figure 1
sig_df <- data.frame(matrix(ncol = 5, nrow = 0))
colnames(sig_df) <- c("marker", "comparison", "pval", "direction", "patients")
categories <- c("All", "Urban", "Rural")
markers <- c("CD4_HLADRpos_CD38pos_Percent",
"CD8_HLADRpos_CD38pos_Percent",
"CD4_PD1_Percent",
"CD8_PD1_Percent",
"CD4_CD103_Percent",
"CD8_CD103_Percent",
"IL6_pg_mL",
"CRP_mg_L")
for (category in categories){
if(category != "All"){
heatmap_df <- Metadata281 %>% filter(Location == category)
heatmap_baseline <- inter_fig1 %>% filter(Location == category)
}else{
heatmap_df <- Metadata281
heatmap_baseline <- inter_fig1
}
for (marker in markers){
patients <- category
# inter cohort tests
dt <- dunnTest(heatmap_baseline[[marker]], heatmap_baseline$Cohort_Short, method = "bonferroni")
c1 <- "Naïve - Exp"
c2 <- "Naïve - HC"
c3 <- "Exp - HC"
p1 <- dt$res$P.adj[1]
p2 <- dt$res$P.adj[2]
p3 <- dt$res$P.adj[3]
mean <- heatmap_baseline %>%
group_by(Cohort_Short) %>%
summarise(avg = mean(eval(as.name(marker))))
avgA <- mean[mean$Cohort_Short == "A",2]$avg
avgB <- mean[mean$Cohort_Short == "B",2]$avg
avgC <- mean[mean$Cohort_Short == "C",2]$avg
diffAB <- avgA - avgB
diffAC <- avgA - avgC
diffBC <- avgB - avgC
r1 <- c(marker, c1, p1, diffAB, patients)
r2 <- c(marker, c2, p2, diffAC, patients)
r3 <- c(marker, c3, p3, diffBC, patients)
sig_df[nrow(sig_df)+1, ] <- r1
sig_df[nrow(sig_df)+1, ] <- r2
sig_df[nrow(sig_df)+1, ] <- r3
# intra cohort tests
sub <- heatmap_df %>% filter(Cohort_Short == "A")
comparison <- "Naïve: 0 - 24"
t <- wilcox.test(sub[[marker]] ~ Visit, data = sub, paired = T)
mean <- sub %>%
group_by(Visit) %>%
summarise(avg = mean(eval(as.name(marker))))
avg1 <- mean[mean$Visit == 2,2]$avg
avg2 <- mean[mean$Visit ==3,2]$avg
mean_diff <- avg1 - avg2
r <- c(marker, comparison, t$p.value, mean_diff, patients)
sig_df[nrow(sig_df)+1, ] <- r
sub <- heatmap_df %>% filter(Cohort_Short == "B")
comparison <- "Exp: 0 - 24"
t <- wilcox.test(sub[[marker]] ~ Visit, data = sub, paired = T)
mean <- sub %>%
group_by(Visit) %>%
summarise(avg = mean(eval(as.name(marker))))
avg1 <- mean[mean$Visit == 2,2]$avg
avg2 <- mean[mean$Visit ==3,2]$avg
mean_diff <- avg1 - avg2
r <- c(marker, comparison, t$p.value, mean_diff, patients)
sig_df[nrow(sig_df)+1, ] <- r
sub <- heatmap_df %>% filter(Cohort_Short == "C")
comparison <- "HC: 0 - 24"
t <- wilcox.test(sub[[marker]] ~ Visit, data = sub, paired = T)
mean <- sub %>%
group_by(Visit) %>%
summarise(avg = mean(eval(as.name(marker))))
avg1 <- mean[mean$Visit == 2,2]$avg
avg2 <- mean[mean$Visit ==3,2]$avg
mean_diff <- avg1 - avg2
r <- c(marker, comparison, t$p.value, mean_diff, patients)
sig_df[nrow(sig_df)+1, ] <- r
}
}
sig_df$direction <- as.numeric(sig_df$direction)
sig_df$pval <- as.numeric(sig_df$pval)
sig_df$patients <- as.factor(sig_df$patients)
sig_df$Significance <- 'NS'
sig_df$Significance[sig_df$pval <= 0.05] <- '*'
sig_df$Significance[sig_df$pval <= 0.01] <- '**'
sig_df$Significance[sig_df$pval <= 0.001] <- '***'
sig_df$Significance[sig_df$pval <= 0.0001] <- '****'
sig_df$Significance <- as.factor(sig_df$Significance)
sig_df$direction <- sig_df$direction * -1
sig_df$direction[sig_df$pval > 0.05] <- NA
markers_labels <- c("CD4+CD38+HLA-DR+ (%)",
"CD8+CD38+HLA-DR+ (%)",
"CD4+PD1+ (%)",
"CD8+PD1+ (%)",
"CD4+CD103+ (%)",
"CD8+CD103+ (%)",
"IL-6 (pg/mL)",
"CRP (mg/L)")
for (i in 1:length(markers)){
sig_df$marker[sig_df$marker == markers[i]] <- markers_labels[i]
}
sig_df$category[sig_df$comparison == "Naïve - Exp" |
sig_df$comparison == "Naïve - HC" |
sig_df$comparison == "Exp - HC"] <- "Inter"
sig_df$category[is.na(sig_df$category)] <- "Intra"
comp <- c("Naïve - Exp","Naïve - HC","Exp - HC", "Naïve: 0 - 24", "Exp: 0 - 24", "HC: 0 - 24")
ggplot(sig_df, aes(x = factor(comparison, levels = comp),
y = factor(marker, levels = markers_labels))) +
geom_raster(aes(fill=direction))+
scale_fill_gradient2(high = "red2", mid = "white", low = "deepskyblue3", na.value = "lightgrey", midpoint = 0,
limits = c(-25, 25))+
geom_text(aes(label=Significance), size = 6) +
theme_bw() +
xlab("") +
ylab("") +
theme(text = element_text(size = 18),
plot.title = element_text(hjust = 0.5),
axis.text.x = element_text(angle = 45, vjust = 1, hjust=1),
legend.key = element_rect(colour = "black", size = 1),
legend.title = element_text(size = 14),
legend.text=element_text(size=12)) +
guides(fill=guide_legend(title="Percent \nDelta", reverse = T)) +
facet_nested(~patients + category, scales = "free_x")
# Supplemental Figure S3 but not stratified by location
SupplementalFig3$Cohort_Short <- factor(SupplementalFig3$Cohort_Short, levels = c("C", "A", "B"))
SupplementalFig3$Visit <- as.factor(SupplementalFig3$Visit)
linear_methods_df <- as.data.frame(matrix(ncol = 4, nrow = 0))
colnames(linear_methods_df) <- c('ImmuneMarker', 'ExplanatoryVariable', 'Coefficient', 'pvalue')
explanatory_vars <- c('Naïve vs HC', 'Exp vs HC', 'COT (HC)',
'COT (Naïve, \nrelative to HC)', 'COT (Exp, \nrelative to HC)')
for (marker in immune_markers) {
# Model M1
fm <- as.formula(paste0('rank(', marker,') ~ Cohort_Short + Visit + Cohort_Short*Visit'))
results <- feols(fm, data = SupplementalFig3, cluster = ~PID)
r1 <- c(marker, explanatory_vars[1], results$coeftable[2,1], round(results$coeftable[2,4], 2))
r2 <- c(marker, explanatory_vars[2], results$coeftable[3,1], round(results$coeftable[3,4], 2))
r3 <- c(marker, explanatory_vars[3], results$coeftable[4,1], round(results$coeftable[4,4], 2))
r4 <- c(marker, explanatory_vars[4], results$coeftable[5,1], round(results$coeftable[5,4], 2))
r5 <- c(marker, explanatory_vars[5], results$coeftable[6,1], round(results$coeftable[6,4], 2))
linear_methods_df[nrow(linear_methods_df)+1, ] <- r1
linear_methods_df[nrow(linear_methods_df)+1, ] <- r2
linear_methods_df[nrow(linear_methods_df)+1, ] <- r3
linear_methods_df[nrow(linear_methods_df)+1, ] <- r4
linear_methods_df[nrow(linear_methods_df)+1, ] <- r5
}
linear_methods_df$Coefficient <- as.numeric(linear_methods_df$Coefficient)
linear_methods_df$pvalue <- as.numeric(linear_methods_df$pvalue)
linear_methods_df$Coefficient[linear_methods_df$pvalue > 0.05] <- NA
linear_methods_df$Significance <- 'NS'
linear_methods_df$Significance[linear_methods_df$pvalue <= 0.05] <- '*'
linear_methods_df$Significance[linear_methods_df$pvalue <= 0.01] <- '**'
linear_methods_df$Significance[linear_methods_df$pvalue <= 0.001] <- '***'
linear_methods_df$Significance[linear_methods_df$pvalue <= 0.0001] <- '****'
linear_methods_df$Significance <- as.factor(linear_methods_df$Significance)
markers_labels_1 <- c("CD4+ (%)",
"CD4+CD38+HLA-DR+ (%)",
"CD8+CD38+HLA-DR+ (%)",
"CD4+PD1+ (%)",
"CD8+PD1+ (%)",
"CD4+CD103+ (%)",
"CD8+CD103+ (%)",
"IL-6 (pg/mL)",
"CRP (mg/L)")
for (i in 1:length(immune_markers)){
linear_methods_df$ImmuneMarker[linear_methods_df$ImmuneMarker == immune_markers[i]] <- markers_labels_1[i]
}
ggplot(linear_methods_df, aes(x = factor(ExplanatoryVariable, levels = explanatory_vars),
y = factor(ImmuneMarker, levels = markers_labels_1))) +
geom_raster(aes(fill=Coefficient))+
scale_fill_gradient2(low = "deepskyblue3", mid = "white", high = "red2", na.value = "lightgrey", midpoint = 0,
limits = c(-105, 105))+
geom_text(aes(label=Significance), size = 6) +
theme_bw() +
xlab("Predictor") +
ylab("Immune Marker") +
theme(text = element_text(size = 18),
plot.title = element_text(hjust = 0.5),
axis.text.x = element_text(angle = 45, vjust = 1, hjust=1),
legend.key = element_rect(colour = "black", size = 1),
legend.title = element_text(size = 14),
legend.text=element_text(size=12)) +
guides(fill=guide_legend(title="Coefficient", reverse = T))
shannon_df <- qiime2R::read_qza("data/DiversityData/shannon_vector.qza")
shannon_df <- shannon_df$data
shannon_df <- tibble::rownames_to_column(shannon_df, "SampleID")
Metadata281 <- Metadata281 %>%
dplyr::left_join(shannon_df, by = "SampleID")
shannon_baseline <- Metadata281 %>% filter(Visit == 2)
v2 <- Metadata281 %>% filter(Visit == 2) %>% dplyr::select(PID, shannon_entropy, Cohort, Location)
v3 <- Metadata281 %>% filter(Visit == 3) %>% dplyr::select(PID, shannon_entropy)
delta_df <- dplyr::full_join(v2, v3, by = "PID")
delta_df<- delta_df %>%
mutate(delta = shannon_entropy.y - shannon_entropy.x)
# All statistical calculations were done separately, see Methods
ann <- data.frame(lx = c(2, 2, 2.5, 2.5, 2.5,2.5),
ly = c(8, 8, 7.5, 7.5, 7.5, 7.5),
lab = c("*", "____________", "*", "______", "*", "______"),
Location = c("Rural", "Rural", "Rural", "Rural", "Urban", "Urban"))
A <- ggplot(shannon_baseline, aes(x = Cohort, y = shannon_entropy)) +
geom_boxplot(show.legend = F,
coef = 6,
alpha = 0.5, aes(fill = Cohort)) +
scale_fill_manual(values = c(cohortColors[1],
cohortColors[3],
cohortColors[5])) +
geom_point(show.legend = F,
position = position_jitterdodge(0.2),
aes(color = Cohort)) +
scale_color_manual(values = c("black",
"black",
"black")) +
theme_bw() +
theme(text = element_text(size = 18)) +
xlab("Cohort") +
ylab("Shannon Entropy") +
facet_grid(~Location) +
geom_text(data = ann, aes(x = lx, y = ly, label = lab), size = 6)
ann <- data.frame(lx = c(1, 1),
ly = c(7.5, 7.5),
lab = c("*", "_____"),
Location = c("Rural", "Rural") )
B <- ggplot(Metadata281, aes(x = Cohort, y = shannon_entropy)) +
geom_boxplot(show.legend = T,coef = 6, alpha = 0.5, aes(fill = Week)) +
geom_point(show.legend = F, aes(color = Week), position = position_jitterdodge(0.2)) +
scale_fill_manual(values = c(visitColors[1], visitColors[5])) +
scale_color_manual(values = c(visitColors[1], visitColors[5])) +
theme_bw() +
theme(text = element_text(size = 18)) +
facet_grid(~Location)+
xlab("Cohort") +
ylab("") +
geom_text(data = ann, aes(x = lx, y = ly, label = lab), size = 6)
# Model M2
C <- ggplot(delta_df, aes(x = shannon_entropy.x, y = delta, color = Cohort)) +
geom_point() +
scale_color_manual(values = c(cohortColors[1],
"gray65",
cohortColors[5])) +
geom_smooth(method = "lm", fill = NA)+
theme_bw() +
theme(text = element_text(size = 18)) +
xlab("Baseline Shannon Entropy") +
ylab("Delta Shannon Entropy \n (Wk24 - Wk0)") +
stat_cor(label.x = 5)+
ylim(-2,3)
cowplot::plot_grid(A, B, C,
labels = c("A", "B", "C"),
ncol = 3, nrow = 1 ,
rel_widths = c(0.7, 1, 1))
# takes a row containing taxon names and returns last two known levels
# e.g. d__Eukaryota;p__Incertae_Sedis;c__Incertae_Sedis;o__Incertae_Sedis;f__Incertae_Sedis;g__Blastocystis
# to f__Prevotellaceae; g__Prevotella
getNiceTaxon <- function(rowVector){
newVector <- c()
for(rowValue in rowVector){
if(substr(rowValue, 1, 6) != "module"){
newName <- tail(strsplit(rowValue, split = ";")[[1]], 2)
newName <- paste0(newName[1], ";", newName[2])
newVector <- c(newVector, c(newName))
}else{
newVector <- c(newVector, c(rowValue))
}
}
return(newVector)
}
# biom table
biom <- qiime2R::read_qza("data/BiplotsData/tax_filtered_281.qza")
biom <- as_tibble(biom$data, rownames = 'ASV')
# subset of metadata
sub_meta <- readr::read_delim("data/subset_Metadata281.tsv",
delim = "\t",
escape_double = FALSE,
col_types = cols(Location = col_factor(levels = c("Rural","Urban")),
Week = col_factor(levels = c("0","24")),
HIV1_viral_outcome = col_factor(levels = c("Suppressed",
"Viremic"))),
trim_ws = TRUE)
# Weighted UniFrac PCoA
w_uni <- qiime2R::read_qza("data/DiversityData/weighted_unifrac_pcoa_results.qza")$data
w_uni_pcoa <- as_tibble(w_uni$Vectors)
# get proportions explained
w_uni_prop <- w_uni$ProportionExplained[1:3]
# get biplot from qiime
q_biplot <- qiime2R::read_qza("data/BiplotsData/biplot.qza")
q_biplotSpecies <- as_tibble(q_biplot$data$Species)
q_biplotVectors <- as_tibble(q_biplot$data$Vectors)
# get taxonomy and replace with better labels
tax <- qiime2R::read_qza("data/BiplotsData/classification.qza")
tax <- tax$data
colnames(tax)[1] <- "FeatureID"
q_biplotSpecies <- q_biplotSpecies %>%
dplyr::left_join(tax, by = "FeatureID") %>%
dplyr::select(Taxon, PC1, PC2, PC3)
colnames(q_biplotSpecies)[1] <- "FeatureID"
q_biplotSpecies %>%
dplyr::mutate(FeatureID = getNiceTaxon(FeatureID)) -> abbrBiplotSpecies
# prepare sample data
w_uni_pcoa <- w_uni_pcoa %>%
dplyr::select(SampleID, PC1, PC2, PC3) %>%
dplyr::left_join(sub_meta, by = "SampleID")
w_uni_pcoa$Cohort <- as.factor(w_uni_pcoa$Cohort)
w_uni_pcoa$Cohort <- ordered(w_uni_pcoa$Cohort, levels = c("Naïve", "Exp", "HC"))
# prep axis labels
pc1_lab <- paste0('PC1 (', round(w_uni_prop[1],3)*100, '%)')
pc2_lab <- paste0('PC2 (', round(w_uni_prop[2],3)*100, '%)')
pc3_lab <- paste0('PC3 (', round(w_uni_prop[3],3)*100, '%)')
# create base plot with arrows for first plot
abbrBiplotSpecies %>%
mutate(a=sqrt(PC1^2+PC2^2)) %>% # calculate the distance from the origin
top_n(4, a) %>% #keep 8 furthest away points
mutate(PC1=PC1*5, PC2=PC2*5) -> abbrBiplotSpeciesArrows # scale arrows linearly
new_lables <- c()
for (row in 1:nrow(abbrBiplotSpeciesArrows)) {
tax_name <- abbrBiplotSpeciesArrows$FeatureID[row]
genus <- tail(strsplit(tax_name, split = ";")[[1]], n=1)
new_lables <- c(new_lables, genus)
}
abbrBiplotSpeciesArrows <- cbind(abbrBiplotSpeciesArrows, new_lables)
A <- ggplot(w_uni_pcoa, aes(x = PC1, y = PC2, color = Cohort)) +
geom_point(show.legend = F, size = 5 ,alpha = 0.6) +
scale_color_manual(values = c(cohortColors[1],
"grey65",
cohortColors[5])) +
theme_bw(base_size = 18) +
xlab(pc1_lab) +
ylab(pc2_lab) +
geom_segment(data = abbrBiplotSpeciesArrows, aes(x=0, xend=PC1, y=0, yend=PC2),
arrow = arrow(length = unit(0.3,"cm")),color = 'black')+
geom_text_repel(data = abbrBiplotSpeciesArrows, aes(x=PC1, y=PC2, label=new_lables),size=7, color = 'black')
# second plot
abbrBiplotSpecies %>%
mutate(a=sqrt(PC1^2+PC3^2)) %>% # calculate the distance from the origin
top_n(4, a) %>% #keep 8 furthest away points
mutate(PC1=PC1*5, PC3=PC3*5) -> abbrBiplotSpeciesArrows # scale arrows linearly
new_lables <- c()
for (row in 1:nrow(abbrBiplotSpeciesArrows)) {
tax_name <- abbrBiplotSpeciesArrows$FeatureID[row]
genus <- tail(strsplit(tax_name, split = ";")[[1]], n=1)
new_lables <- c(new_lables, genus)
}
abbrBiplotSpeciesArrows <- cbind(abbrBiplotSpeciesArrows, new_lables)
B <- ggplot(w_uni_pcoa, aes(x = PC1, y = PC3, color = Cohort)) +
geom_point(size = 5, alpha = 0.6 ) +
scale_color_manual(values = c(cohortColors[1],
"grey65",
cohortColors[5])) +
theme_bw(base_size = 18) +
xlab(pc1_lab) +
ylab(pc3_lab) +
geom_segment(data = abbrBiplotSpeciesArrows, aes(x=0, xend=PC1, y=0, yend=PC3),
arrow = arrow(length = unit(0.3,"cm")),color = 'black')+
geom_text_repel(data = abbrBiplotSpeciesArrows, aes(x=PC1, y=PC3, label=new_lables),size=7, color = 'black')
cowplot::plot_grid(A, B,
labels = c("A", " "),
ncol = 2, nrow = 1)
beta_df <- data.frame(matrix(ncol = 2, nrow = 0))
colnames(beta_df) <- c("SampleID", "Dysbiosis")
distance_matrix<- qiime2R::read_qza("data/DiversityData/weighted_unifrac_distance_matrix.qza")
distance_matrix <- distance_matrix$data %>% as.matrix()
distance_matrix[upper.tri(distance_matrix)] <- NA
diag(distance_matrix) <- NA
distance_matrix <- distance_matrix %>% as.data.frame()
distance_matrix <- tibble::rownames_to_column(distance_matrix, "SampleID")
# calculate average distance from PLWH to healthy controls for samples at
# each time point
weeks <- c(0, 24)
for (current_week in weeks){
sampleIDs <- Metadata281 %>% filter(Cohort_Short != 'C' & Week == current_week)
AB_SIDs <- sampleIDs$SampleID
sampleIDs <- Metadata281 %>% filter(Cohort_Short == 'C'& Week == current_week)
C_SIDs <- sampleIDs$SampleID
dysbiosis_matrix <- distance_matrix[distance_matrix$SampleID %in% C_SIDs, ]
for (sample in AB_SIDs) {
avg <- mean(dysbiosis_matrix[[sample]], na.rm = T)
beta_df[nrow(beta_df)+1, ] <- c(sample, avg)
}
}
# remove NAs
beta_df <- beta_df[!is.na(beta_df$Dysbiosis), ]
beta_df <- beta_df %>% filter(Dysbiosis != "NaN")
# create subset and baseline
meta_sub <- Metadata281 %>% dplyr::select(SampleID, Cohort, Week, PID, Location)
beta_df <- dplyr::left_join(beta_df, meta_sub, by = "SampleID")
beta_df$Dysbiosis <- as.numeric(beta_df$Dysbiosis)
baseline_beta <- beta_df %>%
filter(Week == 0) %>%
dplyr::select(PID, Dysbiosis)
# calculate deltas
bv3 <- beta_df %>% filter(Week == 24) %>% dplyr::select(PID, Dysbiosis, Cohort, Location)
beta_delta <- dplyr::left_join(baseline_beta, bv3, by = "PID")
beta_delta <- beta_delta %>% mutate(Delta_Dysbiosis = Dysbiosis.y - Dysbiosis.x)
# Model M3
B <- ggplot(na.omit(beta_delta), aes(x = Dysbiosis.x, y = Delta_Dysbiosis, color = Cohort)) +
geom_point() +
geom_smooth(method = "lm", fill = NA)+
scale_color_manual(values = c(cohortColors[1],
"gray65")) +
theme_bw() +
theme(text = element_text(size = 18)) +
xlab("Baseline Dysbiosis") +
ylab("Delta Dysbiosis \n (Wk24 - Wk0)") +
stat_cor(label.x = 0.5)
C <- ggplot(na.omit(beta_delta), aes(x = Dysbiosis.x, y = Delta_Dysbiosis, color = Location)) +
geom_point() +
geom_smooth(method = "lm", fill = NA)+
theme_bw() +
theme(text = element_text(size = 18)) +
xlab("Baseline Dysbiosis") +
ylab(" ") +
stat_cor(label.x = 0.6)
cowplot::plot_grid(B, C,
labels = c("B", "C"),
ncol = 2, nrow = 1)
# reload metadata to retain as many samples as possible
Metadata281_full <- read_excel("data/Metadata281.xlsx")
Metadata281_full <- subset(Metadata281_full, Metadata281_full$SampleID != "ZIM026.2" &
Metadata281_full$SampleID != "ZIM026.3" &
Metadata281_full$SampleID != "ZIM008.3" &
Metadata281_full$SampleID != "ZIM062.3")
Metadata281_full$Location[Metadata281_full$Arm_Short == "MA"] <- "Urban"
Metadata281_full$Location[Metadata281_full$Arm_Short == "MT"] <- "Rural"
Metadata281_full$Cohort[Metadata281_full$Cohort_Short == "A"] <- "Naïve"
Metadata281_full$Cohort[Metadata281_full$Cohort_Short == "B"] <- "Exp"
Metadata281_full$Cohort[Metadata281_full$Cohort_Short == "C"] <- "HC"
Metadata281_full$Week[Metadata281_full$Visit == 2] <- 0
Metadata281_full$Week[Metadata281_full$Visit == 3] <- 24
Metadata281_full$Cohort <- factor(Metadata281_full$Cohort, levels = c("Naïve", "Exp", "HC"))
Metadata281_full$Treatment_stat[Metadata281_full$Cohort_Short == "A" & Metadata281_full$Visit == 2] <- "untreated"
Metadata281_full$Treatment_stat[Metadata281_full$Cohort_Short == "A" & Metadata281_full$Visit == 3] <- "treated"
Metadata281_full$Treatment_stat[Metadata281_full$Cohort_Short == "B"] <- "treated"
Metadata281_full$Treatment_stat[Metadata281_full$Cohort_Short == "C"] <- "untreated"
# read in weighted unifrac distance matrix
distance_matrix<- qiime2R::read_qza("data/DiversityData/weighted_unifrac_distance_matrix.qza")
distance_matrix <- distance_matrix$data %>% as.matrix()
distance_matrix <- distance_matrix %>% as.data.frame()
distance_matrix <- tibble::rownames_to_column(distance_matrix, "SampleID")
indices <- order(distance_matrix$SampleID)
distance_matrix <- distance_matrix[,2:ncol(distance_matrix)]
distance_matrix <- distance_matrix[indices, indices]
adonis_data <- Metadata281_full[Metadata281_full$SampleID %in% colnames(distance_matrix),]
# make sure they are in the same sample order
adonis_data$SampleID == colnames(distance_matrix)
## [1] TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE
## [16] TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE
## [31] TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE
## [46] TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE
## [61] TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE
## [76] TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE
## [91] TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE
## [106] TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE
## [121] TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE
## [136] TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE
## [151] TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE
## [166] TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE
## [181] TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE
## [196] TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE
## [211] TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE
## [226] TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE
## [241] TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE
## [256] TRUE TRUE TRUE TRUE
# Model M4
adonis2(as.dist(distance_matrix) ~ adonis_data$Location +
adonis_data$Location*adonis_data$Week,
strata = adonis_data$PID)
## Permutation test for adonis under reduced model
## Terms added sequentially (first to last)
## Blocks: strata
## Permutation: free
## Number of permutations: 999
##
## adonis2(formula = as.dist(distance_matrix) ~ adonis_data$Location + adonis_data$Location * adonis_data$Week, strata = adonis_data$PID)
## Df SumOfSqs R2 F Pr(>F)
## adonis_data$Location 1 2.870 0.02181 5.7291 0.029 *
## adonis_data$Week 1 0.853 0.00648 1.7034 0.010 **
## adonis_data$Location:adonis_data$Week 1 0.152 0.00115 0.3034 0.795
## Residual 255 127.734 0.97056
## Total 258 131.610 1.00000
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
# Model M5
adonis2(as.dist(distance_matrix) ~ adonis_data$Visit_Age +
adonis_data$HIV_Status +
adonis_data$Treatment_stat,
strata = adonis_data$PID)
## Permutation test for adonis under reduced model
## Terms added sequentially (first to last)
## Blocks: strata
## Permutation: free
## Number of permutations: 999
##
## adonis2(formula = as.dist(distance_matrix) ~ adonis_data$Visit_Age + adonis_data$HIV_Status + adonis_data$Treatment_stat, strata = adonis_data$PID)
## Df SumOfSqs R2 F Pr(>F)
## adonis_data$Visit_Age 1 2.721 0.02067 5.4688 0.086 .
## adonis_data$HIV_Status 1 1.045 0.00794 2.1004 0.114
## adonis_data$Treatment_stat 1 0.991 0.00753 1.9928 0.051 .
## Residual 255 126.853 0.96386
## Total 258 131.610 1.00000
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
# create phyloseq object
pseq <- qiime2R::qza_to_phyloseq(features = "data/ANCOMBCData/tax_filtered_281.qza",
taxonomy = "data/ANCOMBCData/classification.qza",
metadata = "data/subset_Metadata281.tsv",
tree = "data/ANCOMBCData/tree_Zim.qza")
# Change factored variable in metadata
meta <- pseq %>% sam_data() %>% as.data.frame()
meta$Cohort <- factor(meta$Cohort, levels = c("HC", "Naïve", "Exp"))
sample_data(pseq) <- sample_data(meta)
# get baseline
pseq.baseline <- pseq
baseline <- pseq %>% sample_data() %>% as.data.frame()
baseline <- baseline[baseline$Week == 0]
sample_data(pseq.baseline) <- sample_data(baseline)
# only look at cohort A
pseq.A <- pseq
cohortA <- pseq %>% sample_data() %>% as.data.frame()
cohortA <- cohortA[cohortA$Cohort == "Naïve"]
sample_data(pseq.A) <- sample_data(cohortA)
# Model M6
out <- ancombc2(
data = pseq.baseline,
tax_level = 'Genus',
fix_formula = 'Location + HIV1_viral_outcome + Cohort',
group = 'Cohort'
)
# Model M7
# out <- ancombc2(
# data = pseq,
# tax_level = 'Genus',
# fix_formula = 'Location + Week + HIV1_viral_outcome + Cohort',
# rand_formula = '(1 | PID)',
# group = 'Cohort'
# )
# Model M8
# out <- ancombc2(
# data = pseq.A,
# tax_level = 'Genus',
# fix_formula = 'Location + Week + HIV1_viral_outcome',
# rand_formula = '(1 | PID)'
# )
# Results
res <- out$res
## Location results
df_location <- res %>%
dplyr::select(taxon, contains("Location"))
## Week results
df_week <- res %>%
dplyr::select(taxon, contains("Week"))
## Viral Outcome results
df_vo <- res %>%
dplyr::select(taxon, contains("Viral"))
## Cohort results
df_cohort <- res %>%
dplyr::select(taxon, contains("Cohort"))
# put results data in correct format
df_cohort_fig <- df_cohort %>%
dplyr::filter(diff_CohortExp == 1 | diff_CohortNaïve == 1) %>%
dplyr::mutate(lfc_CohortNaïve = ifelse(diff_CohortNaïve == 1,
lfc_CohortNaïve, 0),
lfc_CohortExp = ifelse(diff_CohortExp == 1,
lfc_CohortExp, 0)
) %>%
dplyr::transmute(taxon,
`Exp vs HC` = round(lfc_CohortExp, 2),
`Naïve vs HC` = round(lfc_CohortNaïve, 2)) %>%
tidyr::pivot_longer(cols = `Exp vs HC`:`Naïve vs HC`,
names_to = "group", values_to = "value") %>%
dplyr::arrange(taxon)
# plot
lo <- floor(min(df_cohort_fig$value))
up <- ceiling(max(df_cohort_fig$value))
mid <- (lo + up)/2
df_cohort_fig %>%
mutate(taxon = paste(substr(taxon, 7, 100000L))) %>%
ggplot(aes(x = factor(group, level = c("Naïve vs HC","Exp vs HC")), y = taxon, fill = value)) +
geom_tile(color = "black") +
scale_fill_gradient2(low = "deepskyblue3", high = "red2", mid = "white",
na.value = "white", midpoint = mid, limit = c(lo, up),
name = NULL) +
geom_text(aes(group, taxon, label = value), color = "black", size = 6) +
labs(x = NULL, y = NULL) +
ggtitle("Baseline") +
theme_minimal() +
theme(plot.title = element_text(hjust = 0.5),
text = element_text(size = 18))
# get microbiome diversity data
shannon_df <- read_qza("data/DiversityData/shannon_vector.qza")
shannon_df <- shannon_df$data
shannon_df <- tibble::rownames_to_column(shannon_df, "SampleID")
w_uni <- read_qza("data/DiversityData/weighted_unifrac_pcoa_results.qza")$data
w_uni_pcoa <- as_tibble(w_uni$Vectors)
w_uni_pcoa <- w_uni_pcoa %>%
dplyr::select(SampleID, PC1, PC2, PC3, PC4)
# add to metadata
Metadata281 <- Metadata281 %>%
left_join(shannon_df, by = "SampleID") %>%
left_join(w_uni_pcoa, by = "SampleID")
# get immune marker table and feature table
IM_data <- Metadata281 %>% dplyr::select(SampleID, all_of(immune_markers))
features <- Metadata281 %>%
dplyr::select(SampleID, Cohort, Location, Gender, BMI, BMI_Cat, HIV_Status, Week, Visit_Age,
Bristol_Score, Education_Level, Water_Source_Num, Cotrimoxazole_HIV,
Cotri_Length_Month, Normal_work_transportation, Manual_work_job,
Manual_chores_home, Head_of_household, LenghtOnArt_yrs) %>%
left_join(shannon_df, by = "SampleID") %>%
left_join(w_uni_pcoa, by = "SampleID")
# some preprocessing to address NAs
# there were 20 samples without microbiome informations due to quality control
features$Cotrimoxazole_HIV[features$Cotrimoxazole_HIV == "NA"] <- NA
features$Cotrimoxazole_HIV[is.na(features$Cotrimoxazole_HIV)] <- "No"
features$Cotrimoxazole_HIV <- droplevels(features$Cotrimoxazole_HIV)
features$Normal_work_transportation[features$Normal_work_transportation == "NA"] <- NA
features$Normal_work_transportation <- droplevels(features$Normal_work_transportation)
features$BMI_Cat[is.na(features$BMI_Cat)] <- "Moderate_thinnes"
features$Head_of_household <- droplevels(features$Head_of_household)
# backwards stepwise regression feature selection
formulas <- vector(mode="list")
for(marker in immune_markers){
print(marker)
# merge immune marker with features
IM_one <- IM_data %>%
dplyr::select(SampleID, marker)
step_data <- features %>%
left_join(IM_one, by = "SampleID") %>%
dplyr::select(-SampleID) %>%
na.omit()
## we want the model to have location
f <- as.formula(paste0(marker," ~ Location"))
myM <- lm(f, data = step_data)
## model with everything
fB <- as.formula(paste0(marker, " ~."))
biggest <- lm(fB, data = step_data)
## stepwise regression, direction = "backward"
bwd <- step(biggest,
trace = 1,
direction = "backward",
scope = list(lower = myM, upper = biggest))
bwd_formula <- paste0(bwd$call)[2]
formulas[[marker]] <- bwd_formula
}
# for models involving PLWH we viral load
# for models involving ART exp PLWH we added length on ART
# made models dataset
lm_data <- as.data.frame(matrix(nrow = 0, ncol = 4))
colnames(lm_data) <- c("Marker", "Week", "Cohort", "Formula")
weeks <- c(0, 24)
Cohorts <- c("Naïve", "Exp", "HC")
for(marker in immune_markers){
for(week in weeks){
for(cohort in Cohorts){
if(cohort == "Naïve"){
form <- paste0(formulas[marker], "+ HIV1_viral_load")
}
if(cohort == "Exp"){
form <- paste0(formulas[marker], "+ HIV1_viral_load + LenghtOnArt_yrs")
}
if(cohort == "HC"){
form <- paste0(formulas[marker])
}
r <- c(marker, week, cohort, form)
lm_data[nrow(lm_data)+1, ] <- r
}
}
}
# run models and make results
sub_d <- Metadata281 %>%
filter(Cohort == "HC" & Week == 24)
f <- as.formula(CRP_mg_L ~ Location + BMI + Visit_Age + PC1 )
summary(lm(f, data = sub_d))
lm_table <- read_excel("data/PredictiveModeling/LinearModelInfo.xlsx")
lm_table$Tested[lm_table$Tested == "Yes"] <- "Included in Model"
lm_table$Cohort <- factor(lm_table$Cohort, levels = c("Naïve", "Exp", "HC"))
lm_table$Tested <- factor(lm_table$Tested, levels = c("Up", "Up (Model NS)", "Down",
"Down (Model NS)", "Included in Model"))
im_labels <- c("CD4+CD38+HLA-DR+ (%)",
"CD8+CD38+HLA-DR+ (%)",
"CD4+PD1+ (%)",
"CD8+PD1+ (%)",
"CD4+CD103+ (%)",
"CD8+CD103+ (%)",
"IL-6 (pg/mL)",
"CRP (mg/L)",
"CD4+ (%)")
lm_table$`Immune Marker` <- factor(lm_table$`Immune Marker`, levels = im_labels)
cols <- colnames(lm_table)
lm_table[cols] <- lapply(lm_table[cols], factor)
ggplot(lm_table, aes(x = Predictor, y = `Immune Marker`)) +
geom_point(aes(color = Tested), size = 6) +
scale_color_manual(values = c("Up" = "red2",
"Up (Model NS)"="peachpuff",
"Down"="deepskyblue3",
"Down (Model NS)" = "paleturquoise",
"Included in Model" = "lightgrey")) +
theme_bw() +
theme(text = element_text(size = 18),
axis.text.x = element_text(angle = 45, vjust = 1, hjust=1),
legend.title = element_blank()) +
ylab("Immune Marker") +
facet_grid(Cohort~Week)
library(MicrobiotaProcess)
# making a pretty taxa table
tax <- read_qza("data/NetworkAnalysis/dataFrom/classification.qza")
#head(tax$taxatab)
tax$taxatab$sequence = seq_along(rownames(tax$taxatab))
getNiceTaxon <- function(rowVector){
newVector <- c()
for(rowValue in 1:length(rowVector)){
#print(rowVector[rowValue])
if(substr(rowVector[rowValue], 1, 6) != "module"){
newName <- paste(tax$taxatab[rowVector[rowValue],], collapse=";")
#newName <- tax$taxatab[rowVector[rowValue],]
newName <- tail(strsplit(newName, split = ";")[[1]], 3)
newName <- paste0(newName[1], ".", newName[2],"_",newName[3])
newVector <- c(newVector, c(newName))
}else{
newVector <- c(newVector, c(rowVector[rowValue]))
}
}
return(newVector)
}
# Filter SCNIC output for ASVs that are not seen in less than 20% of samples
# before loading in convert SCNIC biom to TSV
# remove first line in the TSV
SCNIC_output <- read_delim("data/NetworkAnalysis/dataFrom/table_collapse.txt",
delim = "\t", escape_double = FALSE,
trim_ws = TRUE)
colnames(SCNIC_output)[1] <- "OTU_ID"
# convert to proper format
t_SO <- data.frame(t(SCNIC_output[,-1]))
colnames(t_SO) <- SCNIC_output$OTU_ID
#Compute relative abundance
reads = rowSums(t_SO) #not same, so not rarefied
scRA = t_SO/reads
rowSums(scRA) #sums to 1
# remove ASVs prevalent in less than 20% of samples
# goes from 8853 to 237 ASVs/modules
# goes from 338 to 106 taxa/modules
count0s<- function(col) {
zeros <- sum(col==0)
perc <- length(col)*0.2
length(col) - zeros > perc
}
scRAthin = scRA[,apply(scRA, 2, count0s)]
head(t_SO[,"ae988b9b4c6c1e2466fa26cd1e2ec5c7"])
newCols = getNiceTaxon(colnames(scRAthin))
colnames(scRAthin) = newCols
scRAthin$SampleID <- row.names(scRAthin)
scRAthin = cbind(reads, scRAthin)
write.csv(scRAthin, "data/NetworkAnalysis/dataOut/mb.csv", quote=FALSE, row.names=FALSE)
write.csv(colnames(scRAthin), "data/NetworkAnalysis/dataOut/mbH.csv", quote=FALSE, row.names=FALSE)
im = read.csv("data/NetworkAnalysis/dataOut/immuneMarkersImputed.csv")
mb = read.csv("data/NetworkAnalysis/dataOut/mb.csv")
h = read.csv("data/NetworkAnalysis/dataOut/mbH.csv")
colnames(mb) = h$x
#Add prefixes to data
colnames(im)[5:ncol(im)] = paste("im_", colnames(im)[5:ncol(im)], sep="")
colnames(mb)[2:(ncol(mb)-1)] = paste("mb_", colnames(mb)[2:(ncol(mb)-1)], sep="")
#Merge data
#Have less data with im than mb
d = merge(im, mb, by.x="SampleID", by.y="SampleID")
#separate into Week0 and Week24
d0 = subset(d, Week==0) #127 rows
d24 = subset(d, Week==24) #113. Delta would be pretty good
#Save files and column names
write.csv(d0, "data/NetworkAnalysis/dataOut/imMbW0.csv", quote=FALSE, row.names=FALSE)
write.csv(colnames(d0), "data/NetworkAnalysis/dataOut/imMbW0_h.csv", quote=FALSE, row.names=FALSE)
write.csv(d24, "data/NetworkAnalysis/dataOut/imMbW24.csv", quote=FALSE, row.names=FALSE)
write.csv(colnames(d24), "data/NetworkAnalysis/dataOut/imMbW24_h.csv", quote=FALSE, row.names=FALSE)
##########Parameters
tag = "3sW0"
dataOut = paste("data/NetworkAnalysis/dataOut/models_", tag, "June2023.csv", sep="")
##########Begin processing
dd = read.csv("data/NetworkAnalysis/dataOut/imMbW0.csv")
h = read.csv("data/NetworkAnalysis/dataOut/imMbW0_h.csv")
colnames(dd) = h$x
#dd = read.csv("data/NetworkAnalysis/dataOut/imMbW24.csv")
#h = read.csv("data/NetworkAnalysis/dataOut/imMbW24_h.csv")
#colnames(dd) = h$x
#Which arm is reference? HC
dd$Cohort = factor(dd$Cohort, levels = c("Naive", "Exp", "HC"))
coCol = which(colnames(dd)=="Cohort")
#Rename this to Grp
colnames(dd)[coCol] = "Grp"
#read count column
rc = which(colnames(dd)=="reads")
#must know where in file which "ome" starts and stops. These are column numbers
myOmes = c("im_", "mb_")
omeCols = lapply(myOmes, function(x) grep(paste("^", x, sep=""), colnames(dd)))
cc = combinations(n=length(myOmes),r=2)
#want to regress, e.g.: im ~ mb
#create function regress
regress = function(oA, oB, assays)
{
#oA = o1
#oB = o2
for (h in oA)
{
#print(h)
#h = 6
ana1 = colnames(dd)[h]
print(ana1)
for (i in oB)
{
#i=15
#print(paste(h,i))
ana2 = colnames(dd)[i]
print(paste(ana1,ana2, sep=","))
#subset for complete cases.
#not thin yet, so this works
dcc = dd[complete.cases(dd[, c(h,i,rc,coCol)]), c(h,i,rc,coCol) ]
#print(nrow(dcc))
n=nrow(dcc)
nNot0 = sum(dcc[,ana2] > 0)
if(min(table(dcc$Grp)) > 10)
{
#3 slope, readcount in model
m = lm(dcc[,ana1] ~ dcc$reads + dcc$Grp + dcc[,ana2]*dcc$Grp)
adjRsq = summary(m)$adj.r.squared
dffitsMax = max(abs(dffits(m)))
fParts = summary(m)$fstatistic
pF = pf(fParts[1], fParts[2], fParts[3], lower.tail=FALSE)
tt = summary(m)$coefficients
if (nrow(tt) == 7)
{
pRc = tt[2,4]
pNaive = tt[5,4]
pExp = tt[6,4]
pHC = tt[7,4]
ExpSlope = tt[6,1] + tt[5,1]
HCSlope = tt[7,1] + tt[5,1]
q13 = quantile(dcc[,ana2], c(.25,.75))
NaiveIqrE = tt[5,1] * (q13[2] - q13[1])
HCIqrE = HCSlope * (q13[2] - q13[1])
ExpIqrE = ExpSlope * (q13[2] - q13[1])
#append to output file
write(paste(assays, ana1, ana2, n, nNot0, adjRsq, pF, pNaive, pExp, pHC, NaiveIqrE, ExpIqrE, HCIqrE, dffitsMax, pRc, sep=","), dataOut, append=TRUE)
}
}
}
}
}
#Create header row for output file.
#Writing these results directly to file performs well, and sets stage for permutation testing.
#Should something go wrong in a long job, it is possible to restart without losing the data previously generated, since it's in the output file.
header = paste("assays", "ana1", "ana2", "n", "nNot0", "adjRsq", "pF", "pNaive", "pExp", "pHC", "IqrNaive", "IqrExp", "IqrHC", "maxInfluence", "pRc", sep=",")
write(header, dataOut, append=FALSE)
#Add timer for general information
tm0 = proc.time()
for (k in 1:nrow(cc))
{
#k = 2
print(paste("processing row ", k))
#omeA = myOmes[1]
oA = omeCols[[cc[k,1]]]
#shorten array for testing purposes
#oA = omeCols[[cc[k,1]]][1:10]
oB = omeCols[[cc[k,2]]]
#omeCols[[oA]]
#omeCols[[oB]]
regress(oA, oB, gsub("_", "", paste(myOmes[cc[k,1]], myOmes[cc[k,2]], sep=".")))
}
proc.time() - tm0
#Explore results
d = read.csv(dataOut)
d$pAdjFDR = p.adjust(d$pF, method="fdr")
dBest = subset(d, pAdjFDR < 0.2 & maxInfluence < 4 & pNaive < 0.05 & (pExp < 0.05 | pHC < 0.05) & adjRsq > 0.15)
dBestStrict = subset(d, pAdjFDR < 0.2 & maxInfluence < 2 & pNaive < 0.05 & (pExp < 0.05 | pHC < 0.05) & adjRsq > 0.25)
sort(table(c(dBest$ana1, dBest$ana2)))
#bos=best of show
bosOut = paste("data/NetworkAnalysis/dataOut/bos_", tag, "June2023.csv", sep="")
write.csv(dBest, bosOut, quote=FALSE, row.names=FALSE)
sort(table(c(dBestStrict$ana1, dBestStrict$ana2)))
bosOutStrict = paste("data/NetworkAnalysis/dataOut/bos_", tag, "Strict_June2023.csv", sep="")
write.csv(dBestStrict, bosOutStrict, quote=FALSE, row.names=FALSE)
d = read.csv("data/NetworkAnalysis/dataOut/bos_3sW0Strict_June2023.csv", stringsAsFactors=FALSE)
gg = graph.data.frame(d[,c("ana1","ana2")], directed=FALSE)
#retrieve niceNames
niceNames = read.csv("data/NetworkAnalysis/dataFrom/bosNamesWithNice.csv")
niceNames$nice = gsub("\\n", "\n", niceNames$nice, fixed=TRUE)
#perform lookup
featureToNice = list()
featureToNice[niceNames$orig] = as.character(niceNames$nice)
myNiceNames = unlist(sapply(V(gg)$name, function(x) featureToNice[[x]] ))
#1 solid
d$slope = "1"
#2 dotted
d$slope[d$IqrNaive < 0] = "2"
#red positive, blue negative
edgeColors = c("red2", "deepskyblue3")
myRgb = col2rgb(edgeColors)
myTint = 0.45
newColors = myRgb + (255 - myRgb) * myTint
newColorsRgb = rgb(t(newColors), maxColorValue=255)
d$edgeColor = newColorsRgb[1]
d$edgeColor[d$IqrNaive < 0] = newColorsRgb[2]
#Vertex shape: circle normally, square for module
myShapes = rep("circle",length(V(gg)))
myShapes[grep("module",V(gg)$name)] = "square"
assays = c("im", "mb")
myColorsRedPink = c('#feebe2','#fa9fb5')
myRgb = col2rgb(myColorsRedPink)
myTint = 0.45
newColors = myRgb + (255 - myRgb) * myTint
newColorsRgb = rgb(t(newColors), maxColorValue=255)
myColors = newColorsRgb
V(gg)$color=myColors[1]
for(i in 2:length(assays))
{
myExpr = paste("^", assays[i], sep="")
V(gg)$color[grep(myExpr, V(gg)$name, fixed=FALSE)] = myColors[i]
}
myColor = V(gg)$color
myEdgeWidth = -1*log10(d$pNaive)
myEdgeLty = as.numeric(d$slope)
#To adjust label position, use distance, degree pairs
#Set vertex.label.degree and vertex.label.dist in the plot command
myDist = rep(0,length(V(gg)))
myDeg = rep(0,length(V(gg)))
myDist[which(myNiceNames=="g_Clostridia_vadinBB60_group")] = .5
myDeg[which(myNiceNames=="g_Clostridia_vadinBB60_group")] = -pi/2
myDist[which(myNiceNames=="g_Blautia")] = .5
myDeg[which(myNiceNames=="g_Blautia")] = pi/2
myDist[which(myNiceNames=="g_Ruminococcus_torques_group")] = 3
myDeg[which(myNiceNames=="g_Ruminococcus_torques_group")] = pi
myDist[which(myNiceNames=="s_Massiliprevotella_massiliensis")] = 2.5
myDeg[which(myNiceNames=="s_Massiliprevotella_massiliensis")] = pi
myDist[which(myNiceNames=="g_Solobacterium")] = .3
myDeg[which(myNiceNames=="g_Solobacterium")] = pi/2
set.seed(20)
lay = layout_with_fr(gg)
par(mar = rep(2, 4))
plot(gg, vertex.label.family="sans", vertex.label= myNiceNames, vertex.shape = myShapes, vertex.label.cex=.7, vertex.label.font=1, vertex.label.color="black", edge.lty=1, edge.label=NA, edge.label.font=1, edge.label.cex=.8, edge.label.color=NA, edge.label.family="sans", edge.curved=.1, edge.width=myEdgeWidth, edge.color=d$edgeColor, vertex.label.dist=myDist, vertex.label.degree=myDeg, vertex.frame.color="darkgray", layout=lay, vertex.color=V(gg)$color, vertex.frame.lty=2, vertex.width=4)
legend(x="topleft", assays, pch=21,
col="#777777", pt.bg=myColors, pt.cex=1.3, cex=.8, bty="n", ncol=1)
dd = read.csv("data/NetworkAnalysis/dataOut/imMbW0.csv")
h = read.csv("data/NetworkAnalysis/dataOut/imMbW0_h.csv")
colnames(dd) = h$x
ds = subset(dd, Cohort != "Naive") #75
write.csv(ds, "data/NetworkAnalysis/dataOut/imMbW0_ExpHC.csv", quote=FALSE, row.names=FALSE)
dd = read.csv("data/NetworkAnalysis/dataOut/imMbW24.csv")
h = read.csv("data/NetworkAnalysis/dataOut/imMbW24_h.csv")
colnames(dd) = h$x
ds = subset(dd, Cohort != "Naive") #69
write.csv(ds, "data/NetworkAnalysis/dataOut/imMbW24_ExpHC.csv", quote=FALSE, row.names=FALSE)
#No need to write headers since those stay the same
##########Parameters
#tag = "3sW0_ExpHC"
tag = "3sW24_ExpHC"
dataOut = paste("data/NetworkAnalysis/dataOut/models_", tag, "June2023.csv", sep="")
##########Begin processing
if (grepl("W0", tag))
{
dd = read.csv("data/NetworkAnalysis/dataOut/imMbW0_ExpHC.csv")
h = read.csv("data/NetworkAnalysis/dataOut/imMbW0_h.csv")
colnames(dd) = h$x
} else
{
dd = read.csv("data/NetworkAnalysis/dataOut/imMbW24_ExpHC.csv")
h = read.csv("data/NetworkAnalysis/dataOut/imMbW24_h.csv")
colnames(dd) = h$x
}
coCol = which(colnames(dd)=="Cohort")
#Rename this to Grp
colnames(dd)[coCol] = "Grp"
#read count column
rc = which(colnames(dd)=="reads")
#must know where in file which "ome" starts and stops. These are column numbers
myOmes = c("im_", "mb_")
omeCols = lapply(myOmes, function(x) grep(paste("^", x, sep=""), colnames(dd)))
cc = combinations(n=length(myOmes),r=2)
#create function regress
regress = function(oA, oB, assays)
{
#oA = o1
#oB = o2
for (h in oA)
{
#print(h)
#h = 6
ana1 = colnames(dd)[h]
#print(ana1)
for (i in oB)
{
#i=15
#print(paste(h,i))
ana2 = colnames(dd)[i]
#print(paste(ana1,ana2, sep=","))
#subset for complete cases.
#not thin yet, so this works
dcc = dd[complete.cases(dd[, c(h,i,rc,coCol)]), c(h,i,rc,coCol) ]
#print(nrow(dcc))
n=nrow(dcc)
nNot0 = sum(dcc[,ana2] > 0)
if(min(table(dcc$Grp)) > 10)
{
#3 slope, readcount in model
m = lm(dcc[,ana1] ~ dcc$reads + dcc$Grp + dcc[,ana2]*dcc$Grp)
adjRsq = summary(m)$adj.r.squared
dffitsMax = max(abs(dffits(m)))
fParts = summary(m)$fstatistic
pF = pf(fParts[1], fParts[2], fParts[3], lower.tail=FALSE)
tt = summary(m)$coefficients
#cheap trick to catch bad model
if (nrow(tt) == 5)
{
pRc = tt[2,4]
pExp = tt[4,4]
pHC = tt[4,4]
ExpSlope = tt[4,1]
HCSlope = tt[5,1] + ExpSlope
q13 = quantile(dcc[,ana2], c(.25,.75))
HCIqrE = HCSlope * (q13[2] - q13[1])
ExpIqrE = ExpSlope * (q13[2] - q13[1])
#append to output file
write(paste(assays, ana1, ana2, n, nNot0, adjRsq, pF, pExp, pHC, ExpIqrE, HCIqrE, dffitsMax, pRc, sep=","), dataOut, append=TRUE)
}
}
}
}
}
#Create header row for output file.
#Writing these results directly to file performs well, and sets stage for permutation testing.
#Should something go wrong in a long job, it is possible to restart without losing the data previously generated, since it's in the output file.
header = paste("assays", "ana1", "ana2", "n", "nNot0", "adjRsq", "pF", "pExp", "pHC", "IqrExp", "IqrHC", "maxInfluence", "pRc", sep=",")
write(header, dataOut, append=FALSE)
#Add timer for general information
tm0 = proc.time()
for (k in 1:nrow(cc))
{
#k = 2
#print(paste("processing row ", k))
#omeA = myOmes[1]
oA = omeCols[[cc[k,1]]]
#shorten array for testing purposes
#oA = omeCols[[cc[k,1]]][1:10]
oB = omeCols[[cc[k,2]]]
#omeCols[[oA]]
#omeCols[[oB]]
assays = gsub("_", "", paste(myOmes[cc[k,1]], myOmes[cc[k,2]], sep="."))
regress(oA, oB, assays)
}
proc.time() - tm0
#tag = "3sW0_ExpHC"
tag = "3sW24_ExpHC"
inFile = paste("data/NetworkAnalysis/dataOut/models_", tag, "June2023.csv", sep="")
outFile = paste("data/NetworkAnalysis/dataOut//models_", tag, "Contrasts.csv", sep="")
models = read.csv(inFile, stringsAsFactors=TRUE)
if (grepl("W0", tag))
{
dd = read.csv("data/NetworkAnalysis/dataOut/imMbW0_ExpHC.csv")
h = read.csv("data/NetworkAnalysis/dataOut/imMbW0_h.csv")
colnames(dd) = h$x
} else
{
dd = read.csv("data/NetworkAnalysis/dataOut/imMbW24_ExpHC.csv")
h = read.csv("data/NetworkAnalysis/dataOut/imMbW24_h.csv")
colnames(dd) = h$x
}
dd$Cohort = factor(dd$Cohort)
#nts: doing contrasts this way corrects for the multiple comparisons by default
#below is contrast for slope of HC different than 0 (reference slope + HC offset)
#can specify method, including "none"
contr <- rbind("SlopeC2"=c(0,0,0,1,1))
models$pSHC=NA
for (i in 1:nrow(models))
{
myRow = i
#myRow = 5
ana1 = as.character(models$ana1[myRow])
ana2 = as.character(models$ana2[myRow])
myCols = c("SampleID","Cohort",ana1,ana2,"reads")
#Extract a subset of the data for building the model
#Identify read column based on the ana2
ss = as.data.frame(dd[,myCols])
ssc = ss[complete.cases(ss),]
#Rename the relevant columns to a generic "Ana1" and "Ana2"
#This makes the regression model a little clearer than the subscript approach.
colnames(ssc)[3] = "Ana1"
colnames(ssc)[4] = "Ana2"
m = lm(Ana1 ~ reads + Cohort + Ana2*Cohort , data=ssc)
test = glht(m, linfct=contr)
pvals = summary(test, adjusted(type="none"))$test$pvalues
models$pSHC[i] = pvals[1]
}
models$pFAdj = p.adjust(models$pF, method="fdr")
write.csv(models, outFile, quote=FALSE, row.names=FALSE)
models = read.csv(outFile)
#At least one of the slopes for Exp or HC is not 0, and the difference between slopes is not 0
dBest = droplevels(subset(models, pFAdj < 0.2 & maxInfluence < 2 & (pExp < 0.05 | pSHC < 0.05) & pHC < 0.05 & adjRsq > 0.2))
nrow(dBest)
sort(table(c(dBest$ana1, dBest$ana2)))
#save dBest
outFileBos = paste("../dataOut/bos_", tag, "Contrasts.csv", sep="")
write.csv(dBest, outFileBos, quote=FALSE, row.names=FALSE)