library(readxl) # version 1.4.2
library(dplyr) # version 1.1.2
library(lubridate) # version 1.9.2
library(RColorBrewer) # version 1.1-3
library(FSA) # version 0.9.4
library(ggh4x) # version 0.2.4
library(fixest) # version 0.11.1
library(ggplot2) # version 3.4.2
library(qiime2R) # version 0.99.6
library(cowplot) # version 1.1.1
library(readr) # version 2.1.4
library(ggpubr) # version 0.6.0
library(ggrepel) # version 0.9.3
library(vegan) # version 2.6-4
library(ANCOMBC) # version 2.0.2
library(phyloseq) # version 1.42.0
library(tidyr) # version 1.3.0
library(microViz) # version 0.10.8
library(jsonlite) # version 1.8.4
library(MicrobiotaProcess) # version 1.12.3
library(gtools) # version 3.9.4
library(igraph) # version 1.4.2
library(stringr) # version 1.5.0
library(multcomp) # version 1.4-25
set.seed(5)
Metadata281 <- read_excel("data/Metadata281.xlsx")
# Rename some columns
colnames(Metadata281)[colnames(Metadata281) == 'CD4+_count'] <- 'CD4_count'
colnames(Metadata281)[colnames(Metadata281) == 'Cohort'] <- 'Cohort_Description'
colnames(Metadata281)[colnames(Metadata281) == 'HIV-1_viral_load'] <- 'HIV1_viral_load'
colnames(Metadata281)[colnames(Metadata281) == 'IL-6_pg_mL'] <- 'IL6_pg_mL'
# Create viral outcome column
Metadata281$HIV1_viral_load[Metadata281$HIV1_viral_load == 'ND'] <- 0
Metadata281$HIV1_viral_load <- as.numeric(Metadata281$HIV1_viral_load)
Metadata281$HIV1_viral_outcome <- 'Suppressed'
Metadata281$HIV1_viral_outcome[Metadata281$Cohort_Short == 'A' &
Metadata281$Visit == 3 &
Metadata281$HIV1_viral_load > 200] <- 'Viremic'
Metadata281$HIV1_viral_outcome[Metadata281$Cohort_Short == 'B' &
Metadata281$Visit == 2 &
Metadata281$HIV1_viral_load > 200] <- 'Viremic'
Metadata281$HIV1_viral_outcome <- as.factor(Metadata281$HIV1_viral_outcome)
# Calculate length on ART column
Metadata281$ARTStart[Metadata281$Cohort_Short == "B"] <- Metadata281$ART_Start_Date[Metadata281$Cohort_Short == "B"]
Metadata281$ARTStart <- as.Date(Metadata281$ARTStart, "%Y")
lubridate::day(Metadata281$ARTStart) <- 1
lubridate::month(Metadata281$ARTStart) <- 6
Metadata281$Visit_date <- as.Date(Metadata281$Visit_date, "%m/%d/%y")
Metadata281$LenghtOnArt_yrs[Metadata281$Cohort_Short == "A"] <- 0.5
Metadata281$LenghtOnArt_yrs[Metadata281$Cohort_Short == "C"] <- 0.0
Metadata281$LenghtOnArt_yrs[Metadata281$Cohort_Short == "B"] <- time_length(Metadata281$Visit_date[Metadata281$Cohort_Short == "B"] - Metadata281$ARTStart[Metadata281$Cohort_Short == "B"], "years")
# Make cleaner columns for visualizations
Metadata281$Location[Metadata281$Arm_Short == "MA"] <- "Urban"
Metadata281$Location[Metadata281$Arm_Short == "MT"] <- "Rural"
Metadata281$Cohort[Metadata281$Cohort_Short == "A"] <- "Naïve"
Metadata281$Cohort[Metadata281$Cohort_Short == "B"] <- "Exp"
Metadata281$Cohort[Metadata281$Cohort_Short == "C"] <- "HC"
Metadata281$Week[Metadata281$Visit == 2] <- 0
Metadata281$Week[Metadata281$Visit == 3] <- 24
Metadata281$Location <- as.factor(Metadata281$Location)
Metadata281$Cohort <- as.factor(Metadata281$Cohort)
Metadata281$Cohort <- ordered(Metadata281$Cohort, levels = c("Naïve", "Exp", "HC"))
Metadata281$Week <- as.factor(Metadata281$Week)
# make correct factored variables
fact_cols <- c("Gender", "HIV_Status", "Bristol_Score", "Water_Source_Num",
"Cotrimoxazole_HIV", "Manual_work_job", "Manual_chores_home",
"Head_of_household", "Education_Level", "Normal_work_transportation")
Metadata281[fact_cols] <- lapply(Metadata281[fact_cols], factor)
Metadata281$BMI_Cat <- ordered(Metadata281$BMI_Cat, levels = c("Severe_thinness", "Moderate_thinnes", "Mild_thinness", "Normal", "Overweight", "Obese"))
# make sure immune markers are numeric
immune_markers <- c("CD4_Percent",
"CD4_HLADRpos_CD38pos_Percent",
"CD8_HLADRpos_CD38pos_Percent",
"CD4_PD1_Percent",
"CD8_PD1_Percent",
"CD4_CD103_Percent",
"CD8_CD103_Percent",
"IL6_pg_mL",
"CRP_mg_L")
for (marker in immune_markers){
Metadata281[[marker]] <- as.numeric(Metadata281[[marker]])
}
SupplementalFig3 <- Metadata281
# Impute means for immune markers where needed
for(marker in immune_markers){
Metadata281[[marker]][is.na(Metadata281[[marker]])] <- mean(Metadata281[[marker]], na.rm = TRUE)
}
# Make sure to remove sample with no immune data
Metadata281 <- Metadata281 %>%
filter(SampleID != "ZIM033.3")
# Keep only samples with two visits
Metadata281 <- Metadata281[Metadata281$PID %in% Metadata281$PID[duplicated(Metadata281$PID)],]
# Remove patients that were viremic
vir_PID <- Metadata281 %>%
filter(HIV1_viral_outcome == "Viremic") %>%
dplyr::select(SampleID, PID, Cohort_Short, HIV1_viral_outcome)
PIDs <- unique(vir_PID$PID)
Metadata281$HIV1_viral_outcome[Metadata281$PID %in% PIDs] <- 'Viremic'
# get figure 1 inter data
inter_fig1 <- Metadata281 %>%
filter(!(HIV1_viral_outcome == "Viremic" & Cohort_Short == "B")) %>%
filter(Visit == 2)
# remove all viremmic patients from metadata
Metadata281 <- subset(Metadata281, !(PID %in% PIDs))
# Get dataframe of just baseline measurements
baseline <- Metadata281 %>% filter(Week == 0)
# Creating a color palettes
visitColors <- palette(brewer.pal(n = 5, name = "BrBG"))
cohortColors <- palette(brewer.pal(n = 5, name = "PiYG"))
# Figure 1
sig_df <- data.frame(matrix(ncol = 5, nrow = 0))
colnames(sig_df) <- c("marker", "comparison", "pval", "direction", "patients")
categories <- c("All", "Urban", "Rural")
markers <- c("CD4_HLADRpos_CD38pos_Percent",
"CD8_HLADRpos_CD38pos_Percent",
"CD4_PD1_Percent",
"CD8_PD1_Percent",
"CD4_CD103_Percent",
"CD8_CD103_Percent",
"IL6_pg_mL",
"CRP_mg_L")
for (category in categories){
if(category != "All"){
heatmap_df <- Metadata281 %>% filter(Location == category)
heatmap_baseline <- inter_fig1 %>% filter(Location == category)
}else{
heatmap_df <- Metadata281
heatmap_baseline <- inter_fig1
}
for (marker in markers){
patients <- category
# inter cohort tests
dt <- dunnTest(heatmap_baseline[[marker]], heatmap_baseline$Cohort_Short, method = "bonferroni")
c1 <- "Naïve - Exp"
c2 <- "Naïve - HC"
c3 <- "Exp - HC"
p1 <- dt$res$P.adj[1]
p2 <- dt$res$P.adj[2]
p3 <- dt$res$P.adj[3]
mean <- heatmap_baseline %>%
group_by(Cohort_Short) %>%
summarise(avg = mean(eval(as.name(marker))))
avgA <- mean[mean$Cohort_Short == "A",2]$avg
avgB <- mean[mean$Cohort_Short == "B",2]$avg
avgC <- mean[mean$Cohort_Short == "C",2]$avg
diffAB <- avgA - avgB
diffAC <- avgA - avgC
diffBC <- avgB - avgC
r1 <- c(marker, c1, p1, diffAB, patients)
r2 <- c(marker, c2, p2, diffAC, patients)
r3 <- c(marker, c3, p3, diffBC, patients)
sig_df[nrow(sig_df)+1, ] <- r1
sig_df[nrow(sig_df)+1, ] <- r2
sig_df[nrow(sig_df)+1, ] <- r3
# intra cohort tests
sub <- heatmap_df %>% filter(Cohort_Short == "A")
comparison <- "Naïve: 0 - 24"
t <- wilcox.test(sub[[marker]] ~ Visit, data = sub, paired = T)
mean <- sub %>%
group_by(Visit) %>%
summarise(avg = mean(eval(as.name(marker))))
avg1 <- mean[mean$Visit == 2,2]$avg
avg2 <- mean[mean$Visit ==3,2]$avg
mean_diff <- avg1 - avg2
r <- c(marker, comparison, t$p.value, mean_diff, patients)
sig_df[nrow(sig_df)+1, ] <- r
sub <- heatmap_df %>% filter(Cohort_Short == "B")
comparison <- "Exp: 0 - 24"
t <- wilcox.test(sub[[marker]] ~ Visit, data = sub, paired = T)
mean <- sub %>%
group_by(Visit) %>%
summarise(avg = mean(eval(as.name(marker))))
avg1 <- mean[mean$Visit == 2,2]$avg
avg2 <- mean[mean$Visit ==3,2]$avg
mean_diff <- avg1 - avg2
r <- c(marker, comparison, t$p.value, mean_diff, patients)
sig_df[nrow(sig_df)+1, ] <- r
sub <- heatmap_df %>% filter(Cohort_Short == "C")
comparison <- "HC: 0 - 24"
t <- wilcox.test(sub[[marker]] ~ Visit, data = sub, paired = T)
mean <- sub %>%
group_by(Visit) %>%
summarise(avg = mean(eval(as.name(marker))))
avg1 <- mean[mean$Visit == 2,2]$avg
avg2 <- mean[mean$Visit ==3,2]$avg
mean_diff <- avg1 - avg2
r <- c(marker, comparison, t$p.value, mean_diff, patients)
sig_df[nrow(sig_df)+1, ] <- r
}
}
sig_df$direction <- as.numeric(sig_df$direction)
sig_df$pval <- as.numeric(sig_df$pval)
sig_df$patients <- as.factor(sig_df$patients)
sig_df$Significance <- 'NS'
sig_df$Significance[sig_df$pval <= 0.05] <- '*'
sig_df$Significance[sig_df$pval <= 0.01] <- '**'
sig_df$Significance[sig_df$pval <= 0.001] <- '***'
sig_df$Significance[sig_df$pval <= 0.0001] <- '****'
sig_df$Significance <- as.factor(sig_df$Significance)
sig_df$direction <- sig_df$direction * -1
sig_df$direction[sig_df$pval > 0.05] <- NA
markers_labels <- c("CD4+CD38+HLA-DR+ (%)",
"CD8+CD38+HLA-DR+ (%)",
"CD4+PD1+ (%)",
"CD8+PD1+ (%)",
"CD4+CD103+ (%)",
"CD8+CD103+ (%)",
"IL-6 (pg/mL)",
"CRP (mg/L)")
for (i in 1:length(markers)){
sig_df$marker[sig_df$marker == markers[i]] <- markers_labels[i]
}
sig_df$category[sig_df$comparison == "Naïve - Exp" |
sig_df$comparison == "Naïve - HC" |
sig_df$comparison == "Exp - HC"] <- "Inter"
sig_df$category[is.na(sig_df$category)] <- "Intra"
comp <- c("Naïve - Exp","Naïve - HC","Exp - HC", "Naïve: 0 - 24", "Exp: 0 - 24", "HC: 0 - 24")
ggplot(sig_df, aes(x = factor(comparison, levels = comp),
y = factor(marker, levels = markers_labels))) +
geom_raster(aes(fill=direction))+
scale_fill_gradient2(high = "red2", mid = "white", low = "deepskyblue3", na.value = "lightgrey", midpoint = 0,
limits = c(-25, 25))+
geom_text(aes(label=Significance), size = 6) +
theme_bw() +
xlab("") +
ylab("") +
theme(text = element_text(size = 18),
plot.title = element_text(hjust = 0.5),
axis.text.x = element_text(angle = 45, vjust = 1, hjust=1),
legend.key = element_rect(colour = "black", size = 1),
legend.title = element_text(size = 14),
legend.text=element_text(size=12)) +
guides(fill=guide_legend(title="Percent \nDelta", reverse = T)) +
facet_nested(~patients + category, scales = "free_x")
# Supplemental Figure S3 but not stratified by location
SupplementalFig3$Cohort_Short <- factor(SupplementalFig3$Cohort_Short, levels = c("C", "A", "B"))
SupplementalFig3$Visit <- as.factor(SupplementalFig3$Visit)
linear_methods_df <- as.data.frame(matrix(ncol = 4, nrow = 0))
colnames(linear_methods_df) <- c('ImmuneMarker', 'ExplanatoryVariable', 'Coefficient', 'pvalue')
explanatory_vars <- c('Naïve vs HC', 'Exp vs HC', 'COT (HC)',
'COT (Naïve, \nrelative to HC)', 'COT (Exp, \nrelative to HC)')
for (marker in immune_markers) {
# Model M1
fm <- as.formula(paste0('rank(', marker,') ~ Cohort_Short + Visit + Cohort_Short*Visit'))
results <- feols(fm, data = SupplementalFig3, cluster = ~PID)
r1 <- c(marker, explanatory_vars[1], results$coeftable[2,1], round(results$coeftable[2,4], 2))
r2 <- c(marker, explanatory_vars[2], results$coeftable[3,1], round(results$coeftable[3,4], 2))
r3 <- c(marker, explanatory_vars[3], results$coeftable[4,1], round(results$coeftable[4,4], 2))
r4 <- c(marker, explanatory_vars[4], results$coeftable[5,1], round(results$coeftable[5,4], 2))
r5 <- c(marker, explanatory_vars[5], results$coeftable[6,1], round(results$coeftable[6,4], 2))
linear_methods_df[nrow(linear_methods_df)+1, ] <- r1
linear_methods_df[nrow(linear_methods_df)+1, ] <- r2
linear_methods_df[nrow(linear_methods_df)+1, ] <- r3
linear_methods_df[nrow(linear_methods_df)+1, ] <- r4
linear_methods_df[nrow(linear_methods_df)+1, ] <- r5
}
linear_methods_df$Coefficient <- as.numeric(linear_methods_df$Coefficient)
linear_methods_df$pvalue <- as.numeric(linear_methods_df$pvalue)
linear_methods_df$Coefficient[linear_methods_df$pvalue > 0.05] <- NA
linear_methods_df$Significance <- 'NS'
linear_methods_df$Significance[linear_methods_df$pvalue <= 0.05] <- '*'
linear_methods_df$Significance[linear_methods_df$pvalue <= 0.01] <- '**'
linear_methods_df$Significance[linear_methods_df$pvalue <= 0.001] <- '***'
linear_methods_df$Significance[linear_methods_df$pvalue <= 0.0001] <- '****'
linear_methods_df$Significance <- as.factor(linear_methods_df$Significance)
markers_labels_1 <- c("CD4+ (%)",
"CD4+CD38+HLA-DR+ (%)",
"CD8+CD38+HLA-DR+ (%)",
"CD4+PD1+ (%)",
"CD8+PD1+ (%)",
"CD4+CD103+ (%)",
"CD8+CD103+ (%)",
"IL-6 (pg/mL)",
"CRP (mg/L)")
for (i in 1:length(immune_markers)){
linear_methods_df$ImmuneMarker[linear_methods_df$ImmuneMarker == immune_markers[i]] <- markers_labels_1[i]
}
ggplot(linear_methods_df, aes(x = factor(ExplanatoryVariable, levels = explanatory_vars),
y = factor(ImmuneMarker, levels = markers_labels_1))) +
geom_raster(aes(fill=Coefficient))+
scale_fill_gradient2(low = "deepskyblue3", mid = "white", high = "red2", na.value = "lightgrey", midpoint = 0,
limits = c(-105, 105))+
geom_text(aes(label=Significance), size = 6) +
theme_bw() +
xlab("Predictor") +
ylab("Immune Marker") +
theme(text = element_text(size = 18),
plot.title = element_text(hjust = 0.5),
axis.text.x = element_text(angle = 45, vjust = 1, hjust=1),
legend.key = element_rect(colour = "black", size = 1),
legend.title = element_text(size = 14),
legend.text=element_text(size=12)) +
guides(fill=guide_legend(title="Coefficient", reverse = T))
shannon_df <- qiime2R::read_qza("data/DiversityData/shannon_vector.qza")
shannon_df <- shannon_df$data
shannon_df <- tibble::rownames_to_column(shannon_df, "SampleID")
Metadata281 <- Metadata281 %>%
dplyr::left_join(shannon_df, by = "SampleID")
shannon_baseline <- Metadata281 %>% filter(Visit == 2)
v2 <- Metadata281 %>% filter(Visit == 2) %>% dplyr::select(PID, shannon_entropy, Cohort, Location)
v3 <- Metadata281 %>% filter(Visit == 3) %>% dplyr::select(PID, shannon_entropy)
delta_df <- dplyr::full_join(v2, v3, by = "PID")
delta_df<- delta_df %>%
mutate(delta = shannon_entropy.y - shannon_entropy.x)
# All statistical calculations were done separately, see Methods
ann <- data.frame(lx = c(2, 2, 2.5, 2.5, 2.5,2.5),
ly = c(8, 8, 7.5, 7.5, 7.5, 7.5),
lab = c("*", "____________", "*", "______", "*", "______"),
Location = c("Rural", "Rural", "Rural", "Rural", "Urban", "Urban"))
A <- ggplot(shannon_baseline, aes(x = Cohort, y = shannon_entropy)) +
geom_boxplot(show.legend = F,
coef = 6,
alpha = 0.5, aes(fill = Cohort)) +
scale_fill_manual(values = c(cohortColors[1],
cohortColors[3],
cohortColors[5])) +
geom_point(show.legend = F,
position = position_jitterdodge(0.2),
aes(color = Cohort)) +
scale_color_manual(values = c("black",
"black",
"black")) +
theme_bw() +
theme(text = element_text(size = 18)) +
xlab("Cohort") +
ylab("Shannon Entropy") +
facet_grid(~Location) +
geom_text(data = ann, aes(x = lx, y = ly, label = lab), size = 6)
ann <- data.frame(lx = c(1, 1),
ly = c(7.5, 7.5),
lab = c("*", "_____"),
Location = c("Rural", "Rural") )
B <- ggplot(Metadata281, aes(x = Cohort, y = shannon_entropy)) +
geom_boxplot(show.legend = T,coef = 6, alpha = 0.5, aes(fill = Week)) +
geom_point(show.legend = F, aes(color = Week), position = position_jitterdodge(0.2)) +
scale_fill_manual(values = c(visitColors[1], visitColors[5])) +
scale_color_manual(values = c(visitColors[1], visitColors[5])) +
theme_bw() +
theme(text = element_text(size = 18)) +
facet_grid(~Location)+
xlab("Cohort") +
ylab("") +
geom_text(data = ann, aes(x = lx, y = ly, label = lab), size = 6)
# Model M2
C <- ggplot(delta_df, aes(x = shannon_entropy.x, y = delta, color = Cohort)) +
geom_point() +
scale_color_manual(values = c(cohortColors[1],
"gray65",
cohortColors[5])) +
geom_smooth(method = "lm", fill = NA)+
theme_bw() +
theme(text = element_text(size = 18)) +
xlab("Baseline Shannon Entropy") +
ylab("Delta Shannon Entropy \n (Wk24 - Wk0)") +
stat_cor(label.x = 5)+
ylim(-2,3)
cowplot::plot_grid(A, B, C,
labels = c("A", "B", "C"),
ncol = 3, nrow = 1 ,
rel_widths = c(0.7, 1, 1))
# takes a row containing taxon names and returns last two known levels
# e.g. d__Eukaryota;p__Incertae_Sedis;c__Incertae_Sedis;o__Incertae_Sedis;f__Incertae_Sedis;g__Blastocystis
# to f__Prevotellaceae; g__Prevotella
getNiceTaxon <- function(rowVector){
newVector <- c()
for(rowValue in rowVector){
if(substr(rowValue, 1, 6) != "module"){
newName <- tail(strsplit(rowValue, split = ";")[[1]], 2)
newName <- paste0(newName[1], ";", newName[2])
newVector <- c(newVector, c(newName))
}else{
newVector <- c(newVector, c(rowValue))
}
}
return(newVector)
}
# biom table
biom <- qiime2R::read_qza("data/BiplotsData/tax_filtered_281.qza")
biom <- as_tibble(biom$data, rownames = 'ASV')
# subset of metadata
sub_meta <- readr::read_delim("data/subset_Metadata281.tsv",
delim = "\t",
escape_double = FALSE,
col_types = cols(Location = col_factor(levels = c("Rural","Urban")),
Week = col_factor(levels = c("0","24")),
HIV1_viral_outcome = col_factor(levels = c("Suppressed",
"Viremic"))),
trim_ws = TRUE)
# Weighted UniFrac PCoA
w_uni <- qiime2R::read_qza("data/DiversityData/weighted_unifrac_pcoa_results.qza")$data
w_uni_pcoa <- as_tibble(w_uni$Vectors)
# get proportions explained
w_uni_prop <- w_uni$ProportionExplained[1:3]
# get biplot from qiime
q_biplot <- qiime2R::read_qza("data/BiplotsData/biplot.qza")
q_biplotSpecies <- as_tibble(q_biplot$data$Species)
q_biplotVectors <- as_tibble(q_biplot$data$Vectors)
# get taxonomy and replace with better labels
tax <- qiime2R::read_qza("data/BiplotsData/classification.qza")
tax <- tax$data
colnames(tax)[1] <- "FeatureID"
q_biplotSpecies <- q_biplotSpecies %>%
dplyr::left_join(tax, by = "FeatureID") %>%
dplyr::select(Taxon, PC1, PC2, PC3)
colnames(q_biplotSpecies)[1] <- "FeatureID"
q_biplotSpecies %>%
dplyr::mutate(FeatureID = getNiceTaxon(FeatureID)) -> abbrBiplotSpecies
# prepare sample data
w_uni_pcoa <- w_uni_pcoa %>%
dplyr::select(SampleID, PC1, PC2, PC3) %>%
dplyr::left_join(sub_meta, by = "SampleID")
w_uni_pcoa$Cohort <- as.factor(w_uni_pcoa$Cohort)
w_uni_pcoa$Cohort <- ordered(w_uni_pcoa$Cohort, levels = c("Naïve", "Exp", "HC"))
# prep axis labels
pc1_lab <- paste0('PC1 (', round(w_uni_prop[1],3)*100, '%)')
pc2_lab <- paste0('PC2 (', round(w_uni_prop[2],3)*100, '%)')
pc3_lab <- paste0('PC3 (', round(w_uni_prop[3],3)*100, '%)')
# create base plot with arrows for first plot
abbrBiplotSpecies %>%
mutate(a=sqrt(PC1^2+PC2^2)) %>% # calculate the distance from the origin
top_n(4, a) %>% #keep 8 furthest away points
mutate(PC1=PC1*5, PC2=PC2*5) -> abbrBiplotSpeciesArrows # scale arrows linearly
new_lables <- c()
for (row in 1:nrow(abbrBiplotSpeciesArrows)) {
tax_name <- abbrBiplotSpeciesArrows$FeatureID[row]
genus <- tail(strsplit(tax_name, split = ";")[[1]], n=1)
new_lables <- c(new_lables, genus)
}
abbrBiplotSpeciesArrows <- cbind(abbrBiplotSpeciesArrows, new_lables)
A <- ggplot(w_uni_pcoa, aes(x = PC1, y = PC2, color = Cohort)) +
geom_point(show.legend = F, size = 5 ,alpha = 0.6) +
scale_color_manual(values = c(cohortColors[1],
"grey65",
cohortColors[5])) +
theme_bw(base_size = 18) +
xlab(pc1_lab) +
ylab(pc2_lab) +
geom_segment(data = abbrBiplotSpeciesArrows, aes(x=0, xend=PC1, y=0, yend=PC2),
arrow = arrow(length = unit(0.3,"cm")),color = 'black')+
geom_text_repel(data = abbrBiplotSpeciesArrows, aes(x=PC1, y=PC2, label=new_lables),size=7, color = 'black')
# second plot
abbrBiplotSpecies %>%
mutate(a=sqrt(PC1^2+PC3^2)) %>% # calculate the distance from the origin
top_n(4, a) %>% #keep 8 furthest away points
mutate(PC1=PC1*5, PC3=PC3*5) -> abbrBiplotSpeciesArrows # scale arrows linearly
new_lables <- c()
for (row in 1:nrow(abbrBiplotSpeciesArrows)) {
tax_name <- abbrBiplotSpeciesArrows$FeatureID[row]
genus <- tail(strsplit(tax_name, split = ";")[[1]], n=1)
new_lables <- c(new_lables, genus)
}
abbrBiplotSpeciesArrows <- cbind(abbrBiplotSpeciesArrows, new_lables)
B <- ggplot(w_uni_pcoa, aes(x = PC1, y = PC3, color = Cohort)) +
geom_point(size = 5, alpha = 0.6 ) +
scale_color_manual(values = c(cohortColors[1],
"grey65",
cohortColors[5])) +
theme_bw(base_size = 18) +
xlab(pc1_lab) +
ylab(pc3_lab) +
geom_segment(data = abbrBiplotSpeciesArrows, aes(x=0, xend=PC1, y=0, yend=PC3),
arrow = arrow(length = unit(0.3,"cm")),color = 'black')+
geom_text_repel(data = abbrBiplotSpeciesArrows, aes(x=PC1, y=PC3, label=new_lables),size=7, color = 'black')
cowplot::plot_grid(A, B,
labels = c("A", " "),
ncol = 2, nrow = 1)
beta_df <- data.frame(matrix(ncol = 2, nrow = 0))
colnames(beta_df) <- c("SampleID", "Dysbiosis")
distance_matrix<- qiime2R::read_qza("data/DiversityData/weighted_unifrac_distance_matrix.qza")
distance_matrix <- distance_matrix$data %>% as.matrix()
distance_matrix[upper.tri(distance_matrix)] <- NA
diag(distance_matrix) <- NA
distance_matrix <- distance_matrix %>% as.data.frame()
distance_matrix <- tibble::rownames_to_column(distance_matrix, "SampleID")
# calculate average distance from PLWH to healthy controls for samples at
# each time point
weeks <- c(0, 24)
for (current_week in weeks){
sampleIDs <- Metadata281 %>% filter(Cohort_Short != 'C' & Week == current_week)
AB_SIDs <- sampleIDs$SampleID
sampleIDs <- Metadata281 %>% filter(Cohort_Short == 'C'& Week == current_week)
C_SIDs <- sampleIDs$SampleID
dysbiosis_matrix <- distance_matrix[distance_matrix$SampleID %in% C_SIDs, ]
for (sample in AB_SIDs) {
avg <- mean(dysbiosis_matrix[[sample]], na.rm = T)
beta_df[nrow(beta_df)+1, ] <- c(sample, avg)
}
}
# remove NAs
beta_df <- beta_df[!is.na(beta_df$Dysbiosis), ]
beta_df <- beta_df %>% filter(Dysbiosis != "NaN")
# create subset and baseline
meta_sub <- Metadata281 %>% dplyr::select(SampleID, Cohort, Week, PID, Location)
beta_df <- dplyr::left_join(beta_df, meta_sub, by = "SampleID")
beta_df$Dysbiosis <- as.numeric(beta_df$Dysbiosis)
baseline_beta <- beta_df %>%
filter(Week == 0) %>%
dplyr::select(PID, Dysbiosis)
# calculate deltas
bv3 <- beta_df %>% filter(Week == 24) %>% dplyr::select(PID, Dysbiosis, Cohort, Location)
beta_delta <- dplyr::left_join(baseline_beta, bv3, by = "PID")
beta_delta <- beta_delta %>% mutate(Delta_Dysbiosis = Dysbiosis.y - Dysbiosis.x)
# Model M3
B <- ggplot(na.omit(beta_delta), aes(x = Dysbiosis.x, y = Delta_Dysbiosis, color = Cohort)) +
geom_point() +
geom_smooth(method = "lm", fill = NA)+
scale_color_manual(values = c(cohortColors[1],
"gray65")) +
theme_bw() +
theme(text = element_text(size = 18)) +
xlab("Baseline Dysbiosis") +
ylab("Delta Dysbiosis \n (Wk24 - Wk0)") +
stat_cor(label.x = 0.5)
C <- ggplot(na.omit(beta_delta), aes(x = Dysbiosis.x, y = Delta_Dysbiosis, color = Location)) +
geom_point() +
geom_smooth(method = "lm", fill = NA)+
theme_bw() +
theme(text = element_text(size = 18)) +
xlab("Baseline Dysbiosis") +
ylab(" ") +
stat_cor(label.x = 0.6)
cowplot::plot_grid(B, C,
labels = c("B", "C"),
ncol = 2, nrow = 1)
# reload metadata to retain as many samples as possible
Metadata281_full <- read_excel("data/Metadata281.xlsx")
Metadata281_full <- subset(Metadata281_full, Metadata281_full$SampleID != "ZIM026.2" &
Metadata281_full$SampleID != "ZIM026.3" &
Metadata281_full$SampleID != "ZIM008.3" &
Metadata281_full$SampleID != "ZIM062.3")
Metadata281_full$Location[Metadata281_full$Arm_Short == "MA"] <- "Urban"
Metadata281_full$Location[Metadata281_full$Arm_Short == "MT"] <- "Rural"
Metadata281_full$Cohort[Metadata281_full$Cohort_Short == "A"] <- "Naïve"
Metadata281_full$Cohort[Metadata281_full$Cohort_Short == "B"] <- "Exp"
Metadata281_full$Cohort[Metadata281_full$Cohort_Short == "C"] <- "HC"
Metadata281_full$Week[Metadata281_full$Visit == 2] <- 0
Metadata281_full$Week[Metadata281_full$Visit == 3] <- 24
Metadata281_full$Cohort <- factor(Metadata281_full$Cohort, levels = c("Naïve", "Exp", "HC"))
Metadata281_full$Treatment_stat[Metadata281_full$Cohort_Short == "A" & Metadata281_full$Visit == 2] <- "untreated"
Metadata281_full$Treatment_stat[Metadata281_full$Cohort_Short == "A" & Metadata281_full$Visit == 3] <- "treated"
Metadata281_full$Treatment_stat[Metadata281_full$Cohort_Short == "B"] <- "treated"
Metadata281_full$Treatment_stat[Metadata281_full$Cohort_Short == "C"] <- "untreated"
# read in weighted unifrac distance matrix
distance_matrix<- qiime2R::read_qza("data/DiversityData/weighted_unifrac_distance_matrix.qza")
distance_matrix <- distance_matrix$data %>% as.matrix()
distance_matrix <- distance_matrix %>% as.data.frame()
distance_matrix <- tibble::rownames_to_column(distance_matrix, "SampleID")
indices <- order(distance_matrix$SampleID)
distance_matrix <- distance_matrix[,2:ncol(distance_matrix)]
distance_matrix <- distance_matrix[indices, indices]
adonis_data <- Metadata281_full[Metadata281_full$SampleID %in% colnames(distance_matrix),]
# make sure they are in the same sample order
adonis_data$SampleID == colnames(distance_matrix)
## [1] TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE
## [16] TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE
## [31] TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE
## [46] TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE
## [61] TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE
## [76] TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE
## [91] TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE
## [106] TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE
## [121] TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE
## [136] TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE
## [151] TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE
## [166] TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE
## [181] TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE
## [196] TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE
## [211] TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE
## [226] TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE
## [241] TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE
## [256] TRUE TRUE TRUE TRUE
# Model M4
adonis2(as.dist(distance_matrix) ~ adonis_data$Location +
adonis_data$Location*adonis_data$Week,
strata = adonis_data$PID)
## Permutation test for adonis under reduced model
## Terms added sequentially (first to last)
## Blocks: strata
## Permutation: free
## Number of permutations: 999
##
## adonis2(formula = as.dist(distance_matrix) ~ adonis_data$Location + adonis_data$Location * adonis_data$Week, strata = adonis_data$PID)
## Df SumOfSqs R2 F Pr(>F)
## adonis_data$Location 1 2.870 0.02181 5.7291 0.029 *
## adonis_data$Week 1 0.853 0.00648 1.7034 0.010 **
## adonis_data$Location:adonis_data$Week 1 0.152 0.00115 0.3034 0.795
## Residual 255 127.734 0.97056
## Total 258 131.610 1.00000
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
# Model M5
adonis2(as.dist(distance_matrix) ~ adonis_data$Visit_Age +
adonis_data$HIV_Status +
adonis_data$Treatment_stat,
strata = adonis_data$PID)
## Permutation test for adonis under reduced model
## Terms added sequentially (first to last)
## Blocks: strata
## Permutation: free
## Number of permutations: 999
##
## adonis2(formula = as.dist(distance_matrix) ~ adonis_data$Visit_Age + adonis_data$HIV_Status + adonis_data$Treatment_stat, strata = adonis_data$PID)
## Df SumOfSqs R2 F Pr(>F)
## adonis_data$Visit_Age 1 2.721 0.02067 5.4688 0.086 .
## adonis_data$HIV_Status 1 1.045 0.00794 2.1004 0.114
## adonis_data$Treatment_stat 1 0.991 0.00753 1.9928 0.051 .
## Residual 255 126.853 0.96386
## Total 258 131.610 1.00000
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
# create phyloseq object
pseq <- qiime2R::qza_to_phyloseq(features = "data/ANCOMBCData/tax_filtered_281.qza",
taxonomy = "data/ANCOMBCData/classification.qza",
metadata = "data/subset_Metadata281.tsv",
tree = "data/ANCOMBCData/tree_Zim.qza")
# Change factored variable in metadata
meta <- pseq %>% sam_data() %>% as.data.frame()
meta$Cohort <- factor(meta$Cohort, levels = c("HC", "Naïve", "Exp"))
sample_data(pseq) <- sample_data(meta)
# get baseline
pseq.baseline <- pseq
baseline <- pseq %>% sample_data() %>% as.data.frame()
baseline <- baseline[baseline$Week == 0]
sample_data(pseq.baseline) <- sample_data(baseline)
# only look at cohort A
pseq.A <- pseq
cohortA <- pseq %>% sample_data() %>% as.data.frame()
cohortA <- cohortA[cohortA$Cohort == "Naïve"]
sample_data(pseq.A) <- sample_data(cohortA)
# Model M6
out <- ancombc2(
data = pseq.baseline,
tax_level = 'Genus',
fix_formula = 'Location + HIV1_viral_outcome + Cohort',
group = 'Cohort'
)
# Model M7
# out <- ancombc2(
# data = pseq,
# tax_level = 'Genus',
# fix_formula = 'Location + Week + HIV1_viral_outcome + Cohort',
# rand_formula = '(1 | PID)',
# group = 'Cohort'
# )
# Model M8
# out <- ancombc2(
# data = pseq.A,
# tax_level = 'Genus',
# fix_formula = 'Location + Week + HIV1_viral_outcome',
# rand_formula = '(1 | PID)'
# )
# Results
res <- out$res
## Location results
df_location <- res %>%
dplyr::select(taxon, contains("Location"))
## Week results
df_week <- res %>%
dplyr::select(taxon, contains("Week"))
## Viral Outcome results
df_vo <- res %>%
dplyr::select(taxon, contains("Viral"))
## Cohort results
df_cohort <- res %>%
dplyr::select(taxon, contains("Cohort"))
# put results data in correct format
df_cohort_fig <- df_cohort %>%
dplyr::filter(diff_CohortExp == 1 | diff_CohortNaïve == 1) %>%
dplyr::mutate(lfc_CohortNaïve = ifelse(diff_CohortNaïve == 1,
lfc_CohortNaïve, 0),
lfc_CohortExp = ifelse(diff_CohortExp == 1,
lfc_CohortExp, 0)
) %>%
dplyr::transmute(taxon,
`Exp vs HC` = round(lfc_CohortExp, 2),
`Naïve vs HC` = round(lfc_CohortNaïve, 2)) %>%
tidyr::pivot_longer(cols = `Exp vs HC`:`Naïve vs HC`,
names_to = "group", values_to = "value") %>%
dplyr::arrange(taxon)
# plot
lo <- floor(min(df_cohort_fig$value))
up <- ceiling(max(df_cohort_fig$value))
mid <- (lo + up)/2
df_cohort_fig %>%
mutate(taxon = paste(substr(taxon, 7, 100000L))) %>%
ggplot(aes(x = factor(group, level = c("Naïve vs HC","Exp vs HC")), y = taxon, fill = value)) +
geom_tile(color = "black") +
scale_fill_gradient2(low = "deepskyblue3", high = "red2", mid = "white",
na.value = "white", midpoint = mid, limit = c(lo, up),
name = NULL) +
geom_text(aes(group, taxon, label = value), color = "black", size = 6) +
labs(x = NULL, y = NULL) +
ggtitle("Baseline") +
theme_minimal() +
theme(plot.title = element_text(hjust = 0.5),
text = element_text(size = 18))
# get microbiome diversity data
shannon_df <- read_qza("data/DiversityData/shannon_vector.qza")
shannon_df <- shannon_df$data
shannon_df <- tibble::rownames_to_column(shannon_df, "SampleID")
w_uni <- read_qza("data/DiversityData/weighted_unifrac_pcoa_results.qza")$data
w_uni_pcoa <- as_tibble(w_uni$Vectors)
w_uni_pcoa <- w_uni_pcoa %>%
dplyr::select(SampleID, PC1, PC2, PC3, PC4)
# add to metadata
Metadata281 <- Metadata281 %>%
left_join(shannon_df, by = "SampleID") %>%
left_join(w_uni_pcoa, by = "SampleID")
# get immune marker table and feature table
IM_data <- Metadata281 %>% dplyr::select(SampleID, all_of(immune_markers))
features <- Metadata281 %>%
dplyr::select(SampleID, Cohort, Location, Gender, BMI, BMI_Cat, HIV_Status, Week, Visit_Age,
Bristol_Score, Education_Level, Water_Source_Num, Cotrimoxazole_HIV,
Cotri_Length_Month, Normal_work_transportation, Manual_work_job,
Manual_chores_home, Head_of_household, LenghtOnArt_yrs) %>%
left_join(shannon_df, by = "SampleID") %>%
left_join(w_uni_pcoa, by = "SampleID")
# some preprocessing to address NAs
# there were 20 samples without microbiome informations due to quality control
features$Cotrimoxazole_HIV[features$Cotrimoxazole_HIV == "NA"] <- NA
features$Cotrimoxazole_HIV[is.na(features$Cotrimoxazole_HIV)] <- "No"
features$Cotrimoxazole_HIV <- droplevels(features$Cotrimoxazole_HIV)
features$Normal_work_transportation[features$Normal_work_transportation == "NA"] <- NA
features$Normal_work_transportation <- droplevels(features$Normal_work_transportation)
features$BMI_Cat[is.na(features$BMI_Cat)] <- "Moderate_thinnes"
features$Head_of_household <- droplevels(features$Head_of_household)
# backwards stepwise regression feature selection
formulas <- vector(mode="list")
for(marker in immune_markers){
print(marker)
# merge immune marker with features
IM_one <- IM_data %>%
dplyr::select(SampleID, marker)
step_data <- features %>%
left_join(IM_one, by = "SampleID") %>%
dplyr::select(-SampleID) %>%
na.omit()
## we want the model to have location
f <- as.formula(paste0(marker," ~ Location"))
myM <- lm(f, data = step_data)
## model with everything
fB <- as.formula(paste0(marker, " ~."))
biggest <- lm(fB, data = step_data)
## stepwise regression, direction = "backward"
bwd <- step(biggest,
trace = 1,
direction = "backward",
scope = list(lower = myM, upper = biggest))
bwd_formula <- paste0(bwd$call)[2]
formulas[[marker]] <- bwd_formula
}
# for models involving PLWH we viral load
# for models involving ART exp PLWH we added length on ART
# made models dataset
lm_data <- as.data.frame(matrix(nrow = 0, ncol = 4))
colnames(lm_data) <- c("Marker", "Week", "Cohort", "Formula")
weeks <- c(0, 24)
Cohorts <- c("Naïve", "Exp", "HC")
for(marker in immune_markers){
for(week in weeks){
for(cohort in Cohorts){
if(cohort == "Naïve"){
form <- paste0(formulas[marker], "+ HIV1_viral_load")
}
if(cohort == "Exp"){
form <- paste0(formulas[marker], "+ HIV1_viral_load + LenghtOnArt_yrs")
}
if(cohort == "HC"){
form <- paste0(formulas[marker])
}
r <- c(marker, week, cohort, form)
lm_data[nrow(lm_data)+1, ] <- r
}
}
}
# run models and make results
sub_d <- Metadata281 %>%
filter(Cohort == "HC" & Week == 24)
f <- as.formula(CRP_mg_L ~ Location + BMI + Visit_Age + PC1 )
summary(lm(f, data = sub_d))
lm_table <- read_excel("data/PredictiveModeling/LinearModelInfo.xlsx")
lm_table$Tested[lm_table$Tested == "Yes"] <- "Included in Model"
lm_table$Cohort <- factor(lm_table$Cohort, levels = c("Naïve", "Exp", "HC"))
lm_table$Tested <- factor(lm_table$Tested, levels = c("Up", "Up (Model NS)", "Down",
"Down (Model NS)", "Included in Model"))
im_labels <- c("CD4+CD38+HLA-DR+ (%)",
"CD8+CD38+HLA-DR+ (%)",
"CD4+PD1+ (%)",
"CD8+PD1+ (%)",
"CD4+CD103+ (%)",
"CD8+CD103+ (%)",
"IL-6 (pg/mL)",
"CRP (mg/L)",
"CD4+ (%)")
lm_table$`Immune Marker` <- factor(lm_table$`Immune Marker`, levels = im_labels)
cols <- colnames(lm_table)
lm_table[cols] <- lapply(lm_table[cols], factor)
ggplot(lm_table, aes(x = Predictor, y = `Immune Marker`)) +
geom_point(aes(color = Tested), size = 6) +
scale_color_manual(values = c("Up" = "red2",
"Up (Model NS)"="peachpuff",
"Down"="deepskyblue3",
"Down (Model NS)" = "paleturquoise",
"Included in Model" = "lightgrey")) +
theme_bw() +
theme(text = element_text(size = 18),
axis.text.x = element_text(angle = 45, vjust = 1, hjust=1),
legend.title = element_blank()) +
ylab("Immune Marker") +
facet_grid(Cohort~Week)