Import Needed Packages

library(readxl)             # version 1.4.2
library(dplyr)              # version 1.1.2
library(lubridate)          # version 1.9.2
library(RColorBrewer)       # version 1.1-3
library(FSA)                # version 0.9.4
library(ggh4x)              # version 0.2.4
library(fixest)             # version 0.11.1  
library(ggplot2)            # version 3.4.2
library(qiime2R)            # version 0.99.6
library(cowplot)            # version 1.1.1
library(readr)              # version 2.1.4
library(ggpubr)             # version 0.6.0
library(ggrepel)            # version 0.9.3
library(vegan)              # version 2.6-4
library(ANCOMBC)            # version 2.0.2 
library(phyloseq)           # version 1.42.0
library(tidyr)              # version 1.3.0
library(microViz)           # version 0.10.8
library(jsonlite)           # version 1.8.4
library(MicrobiotaProcess)  # version 1.12.3
library(gtools)             # version 3.9.4
library(igraph)             # version 1.4.2
library(stringr)            # version 1.5.0
library(multcomp)           # version 1.4-25

set.seed(5)

Data Read In and Wrangle

Metadata281 <- read_excel("data/Metadata281.xlsx")

# Rename some columns 
colnames(Metadata281)[colnames(Metadata281) == 'CD4+_count'] <- 'CD4_count'
colnames(Metadata281)[colnames(Metadata281) == 'Cohort'] <- 'Cohort_Description'
colnames(Metadata281)[colnames(Metadata281) == 'HIV-1_viral_load'] <- 'HIV1_viral_load'
colnames(Metadata281)[colnames(Metadata281) == 'IL-6_pg_mL'] <- 'IL6_pg_mL'

# Create viral outcome column
Metadata281$HIV1_viral_load[Metadata281$HIV1_viral_load == 'ND'] <- 0
Metadata281$HIV1_viral_load <- as.numeric(Metadata281$HIV1_viral_load)
Metadata281$HIV1_viral_outcome <- 'Suppressed'

Metadata281$HIV1_viral_outcome[Metadata281$Cohort_Short == 'A' & 
                                Metadata281$Visit == 3 &
                                Metadata281$HIV1_viral_load > 200] <- 'Viremic'
Metadata281$HIV1_viral_outcome[Metadata281$Cohort_Short == 'B' & 
                                Metadata281$Visit == 2 &
                                Metadata281$HIV1_viral_load > 200] <- 'Viremic'
Metadata281$HIV1_viral_outcome <- as.factor(Metadata281$HIV1_viral_outcome)

# Calculate length on ART column 
Metadata281$ARTStart[Metadata281$Cohort_Short == "B"] <- Metadata281$ART_Start_Date[Metadata281$Cohort_Short == "B"]
Metadata281$ARTStart <- as.Date(Metadata281$ARTStart, "%Y") 
lubridate::day(Metadata281$ARTStart) <- 1
lubridate::month(Metadata281$ARTStart) <- 6
Metadata281$Visit_date <- as.Date(Metadata281$Visit_date, "%m/%d/%y")

Metadata281$LenghtOnArt_yrs[Metadata281$Cohort_Short == "A"] <- 0.5
Metadata281$LenghtOnArt_yrs[Metadata281$Cohort_Short == "C"] <- 0.0
Metadata281$LenghtOnArt_yrs[Metadata281$Cohort_Short == "B"] <- time_length(Metadata281$Visit_date[Metadata281$Cohort_Short == "B"] - Metadata281$ARTStart[Metadata281$Cohort_Short == "B"], "years")

# Make cleaner columns for visualizations
Metadata281$Location[Metadata281$Arm_Short == "MA"] <- "Urban"
Metadata281$Location[Metadata281$Arm_Short == "MT"] <- "Rural"
Metadata281$Cohort[Metadata281$Cohort_Short == "A"] <- "Naïve"
Metadata281$Cohort[Metadata281$Cohort_Short == "B"] <- "Exp"
Metadata281$Cohort[Metadata281$Cohort_Short == "C"] <- "HC"
Metadata281$Week[Metadata281$Visit == 2] <- 0
Metadata281$Week[Metadata281$Visit == 3] <- 24
Metadata281$Location <- as.factor(Metadata281$Location)
Metadata281$Cohort <- as.factor(Metadata281$Cohort)
Metadata281$Cohort <- ordered(Metadata281$Cohort, levels = c("Naïve", "Exp", "HC"))
Metadata281$Week <- as.factor(Metadata281$Week)

# make correct factored variables 
fact_cols <- c("Gender", "HIV_Status", "Bristol_Score", "Water_Source_Num", 
               "Cotrimoxazole_HIV", "Manual_work_job", "Manual_chores_home",
               "Head_of_household", "Education_Level", "Normal_work_transportation")
Metadata281[fact_cols] <- lapply(Metadata281[fact_cols], factor) 
Metadata281$BMI_Cat <- ordered(Metadata281$BMI_Cat, levels = c("Severe_thinness", "Moderate_thinnes", "Mild_thinness", "Normal", "Overweight", "Obese"))

# make sure immune markers are numeric 
immune_markers <- c("CD4_Percent",
                    "CD4_HLADRpos_CD38pos_Percent", 
                    "CD8_HLADRpos_CD38pos_Percent", 
                    "CD4_PD1_Percent", 
                    "CD8_PD1_Percent",
                    "CD4_CD103_Percent",
                    "CD8_CD103_Percent",
                    "IL6_pg_mL", 
                    "CRP_mg_L")
for (marker in immune_markers){
  Metadata281[[marker]] <- as.numeric(Metadata281[[marker]])
}

SupplementalFig3 <- Metadata281

# Impute means for immune markers where needed 

for(marker in immune_markers){ 
  Metadata281[[marker]][is.na(Metadata281[[marker]])] <- mean(Metadata281[[marker]], na.rm = TRUE)
}

# Make sure to remove sample with no immune data 
Metadata281 <- Metadata281 %>% 
  filter(SampleID != "ZIM033.3")

# Keep only samples with two visits  
Metadata281 <- Metadata281[Metadata281$PID %in% Metadata281$PID[duplicated(Metadata281$PID)],]

# Remove patients that were viremic 
vir_PID <- Metadata281 %>%  
  filter(HIV1_viral_outcome == "Viremic") %>% 
  dplyr::select(SampleID, PID, Cohort_Short, HIV1_viral_outcome)
PIDs <- unique(vir_PID$PID)
Metadata281$HIV1_viral_outcome[Metadata281$PID %in% PIDs] <- 'Viremic'

# get figure 1 inter data 
inter_fig1 <- Metadata281 %>% 
  filter(!(HIV1_viral_outcome == "Viremic" & Cohort_Short == "B")) %>% 
  filter(Visit == 2)

# remove all viremmic patients from metadata
Metadata281 <- subset(Metadata281, !(PID %in% PIDs))

# Get dataframe of just baseline measurements 
baseline <- Metadata281 %>% filter(Week == 0)

# Creating a color palettes
visitColors <- palette(brewer.pal(n = 5, name = "BrBG")) 
cohortColors <- palette(brewer.pal(n = 5, name = "PiYG"))

Cross-sectional and Longitudinal Analysis of Immune Phenotype

Inter and Intra Cohort Immune Analyses

# Figure 1 

sig_df <- data.frame(matrix(ncol = 5, nrow = 0))
colnames(sig_df) <- c("marker", "comparison", "pval", "direction", "patients")

categories <- c("All", "Urban", "Rural")
markers <- c("CD4_HLADRpos_CD38pos_Percent", 
             "CD8_HLADRpos_CD38pos_Percent", 
              "CD4_PD1_Percent", 
              "CD8_PD1_Percent",
              "CD4_CD103_Percent",
              "CD8_CD103_Percent",
              "IL6_pg_mL", 
              "CRP_mg_L")

for (category in categories){
  if(category != "All"){
    heatmap_df <- Metadata281 %>% filter(Location == category)
    heatmap_baseline <- inter_fig1 %>% filter(Location == category)
  }else{
    heatmap_df <- Metadata281
    heatmap_baseline <- inter_fig1
  }

  for (marker in markers){
  patients <- category
  
  # inter cohort tests 
  dt <- dunnTest(heatmap_baseline[[marker]], heatmap_baseline$Cohort_Short, method = "bonferroni")
  c1 <- "Naïve - Exp"
  c2 <- "Naïve - HC"
  c3 <- "Exp - HC"
  p1 <- dt$res$P.adj[1]
  p2 <- dt$res$P.adj[2]
  p3 <- dt$res$P.adj[3]
  
  mean <- heatmap_baseline %>% 
    group_by(Cohort_Short) %>%  
    summarise(avg = mean(eval(as.name(marker))))
  
  avgA <- mean[mean$Cohort_Short == "A",2]$avg
  avgB <- mean[mean$Cohort_Short == "B",2]$avg
  avgC <- mean[mean$Cohort_Short == "C",2]$avg
  
  diffAB <- avgA - avgB
  diffAC <- avgA - avgC
  diffBC <- avgB - avgC 
  
  r1 <- c(marker, c1, p1, diffAB, patients)
  r2 <- c(marker, c2, p2, diffAC, patients)
  r3 <- c(marker, c3, p3, diffBC, patients)
  
  sig_df[nrow(sig_df)+1, ] <- r1
  sig_df[nrow(sig_df)+1, ] <- r2
  sig_df[nrow(sig_df)+1, ] <- r3  
  
  # intra cohort tests 
  sub <- heatmap_df %>% filter(Cohort_Short == "A")
  comparison <- "Naïve: 0 - 24"
  t <- wilcox.test(sub[[marker]] ~ Visit, data = sub, paired = T)
  mean <- sub %>% 
    group_by(Visit) %>%  
    summarise(avg = mean(eval(as.name(marker))))
  avg1 <- mean[mean$Visit == 2,2]$avg
  avg2 <- mean[mean$Visit ==3,2]$avg
  mean_diff <- avg1 - avg2
  r <- c(marker, comparison, t$p.value, mean_diff, patients)
  sig_df[nrow(sig_df)+1, ] <- r
  
  sub <- heatmap_df %>% filter(Cohort_Short == "B")
  comparison <- "Exp: 0 - 24"
  t <- wilcox.test(sub[[marker]] ~ Visit, data = sub, paired = T)
  mean <- sub %>% 
    group_by(Visit) %>%  
    summarise(avg = mean(eval(as.name(marker))))
  avg1 <- mean[mean$Visit == 2,2]$avg
  avg2 <- mean[mean$Visit ==3,2]$avg
  mean_diff <- avg1 - avg2
  r <- c(marker, comparison, t$p.value, mean_diff, patients)
  sig_df[nrow(sig_df)+1, ] <- r
  
  sub <- heatmap_df %>% filter(Cohort_Short == "C")
  comparison <- "HC: 0 - 24"
  t <- wilcox.test(sub[[marker]] ~ Visit, data = sub, paired = T)
  mean <- sub %>% 
    group_by(Visit) %>%  
    summarise(avg = mean(eval(as.name(marker))))
  avg1 <- mean[mean$Visit == 2,2]$avg
  avg2 <- mean[mean$Visit ==3,2]$avg
  mean_diff <- avg1 - avg2
  r <- c(marker, comparison, t$p.value, mean_diff, patients)
  sig_df[nrow(sig_df)+1, ] <- r
  }
}

sig_df$direction <- as.numeric(sig_df$direction)
sig_df$pval <- as.numeric(sig_df$pval)
sig_df$patients <- as.factor(sig_df$patients)

sig_df$Significance <- 'NS'
sig_df$Significance[sig_df$pval <= 0.05] <- '*'
sig_df$Significance[sig_df$pval <= 0.01] <- '**'
sig_df$Significance[sig_df$pval <= 0.001] <- '***'
sig_df$Significance[sig_df$pval <= 0.0001] <- '****'
sig_df$Significance <- as.factor(sig_df$Significance)

sig_df$direction <- sig_df$direction * -1
sig_df$direction[sig_df$pval > 0.05] <- NA

markers_labels <- c("CD4+CD38+HLA-DR+ (%)", 
                    "CD8+CD38+HLA-DR+ (%)", 
                    "CD4+PD1+ (%)", 
                    "CD8+PD1+ (%)",
                    "CD4+CD103+ (%)",
                    "CD8+CD103+ (%)",
                    "IL-6 (pg/mL)", 
                    "CRP (mg/L)")
for (i in 1:length(markers)){
  sig_df$marker[sig_df$marker == markers[i]] <- markers_labels[i]
}
sig_df$category[sig_df$comparison == "Naïve - Exp" |
                sig_df$comparison == "Naïve - HC"  |
                sig_df$comparison == "Exp - HC"] <- "Inter"
sig_df$category[is.na(sig_df$category)] <- "Intra"


comp <- c("Naïve - Exp","Naïve - HC","Exp - HC", "Naïve: 0 - 24", "Exp: 0 - 24", "HC: 0 - 24")
ggplot(sig_df, aes(x = factor(comparison, levels = comp), 
                   y = factor(marker, levels = markers_labels))) + 
  geom_raster(aes(fill=direction))+  
  scale_fill_gradient2(high = "red2", mid = "white", low = "deepskyblue3", na.value = "lightgrey", midpoint = 0,
                       limits = c(-25, 25))+
  geom_text(aes(label=Significance), size = 6) + 
  theme_bw() + 
  xlab("") + 
  ylab("") +
  theme(text = element_text(size = 18), 
        plot.title = element_text(hjust = 0.5), 
        axis.text.x = element_text(angle = 45, vjust = 1, hjust=1), 
        legend.key = element_rect(colour = "black", size = 1), 
        legend.title = element_text(size = 14),
        legend.text=element_text(size=12)) + 
  guides(fill=guide_legend(title="Percent \nDelta", reverse = T)) + 
  facet_nested(~patients + category, scales = "free_x")

Fixed-effects Ordinary Least Squares (OLS) Linear Modeling

# Supplemental Figure S3 but not stratified by location 
SupplementalFig3$Cohort_Short <- factor(SupplementalFig3$Cohort_Short, levels = c("C", "A", "B")) 
SupplementalFig3$Visit <- as.factor(SupplementalFig3$Visit)

linear_methods_df <- as.data.frame(matrix(ncol = 4, nrow = 0))
colnames(linear_methods_df) <- c('ImmuneMarker', 'ExplanatoryVariable', 'Coefficient', 'pvalue')

explanatory_vars <- c('Naïve vs HC', 'Exp vs HC', 'COT (HC)', 
                      'COT (Naïve, \nrelative to HC)', 'COT (Exp, \nrelative to HC)')

for (marker in immune_markers) {
  # Model M1
  fm <- as.formula(paste0('rank(', marker,') ~ Cohort_Short + Visit + Cohort_Short*Visit'))
  results <- feols(fm, data = SupplementalFig3, cluster = ~PID)
  r1 <- c(marker, explanatory_vars[1], results$coeftable[2,1], round(results$coeftable[2,4], 2))
  r2 <- c(marker, explanatory_vars[2], results$coeftable[3,1], round(results$coeftable[3,4], 2))
  r3 <- c(marker, explanatory_vars[3], results$coeftable[4,1], round(results$coeftable[4,4], 2))
  r4 <- c(marker, explanatory_vars[4], results$coeftable[5,1], round(results$coeftable[5,4], 2))
  r5 <- c(marker, explanatory_vars[5], results$coeftable[6,1], round(results$coeftable[6,4], 2))
  
  linear_methods_df[nrow(linear_methods_df)+1, ] <- r1
  linear_methods_df[nrow(linear_methods_df)+1, ] <- r2
  linear_methods_df[nrow(linear_methods_df)+1, ] <- r3
  linear_methods_df[nrow(linear_methods_df)+1, ] <- r4
  linear_methods_df[nrow(linear_methods_df)+1, ] <- r5
}

linear_methods_df$Coefficient <- as.numeric(linear_methods_df$Coefficient)
linear_methods_df$pvalue <- as.numeric(linear_methods_df$pvalue)
linear_methods_df$Coefficient[linear_methods_df$pvalue > 0.05] <- NA
linear_methods_df$Significance <- 'NS'
linear_methods_df$Significance[linear_methods_df$pvalue <= 0.05] <- '*'
linear_methods_df$Significance[linear_methods_df$pvalue <= 0.01] <- '**'
linear_methods_df$Significance[linear_methods_df$pvalue <= 0.001] <- '***'
linear_methods_df$Significance[linear_methods_df$pvalue <= 0.0001] <- '****'
linear_methods_df$Significance <- as.factor(linear_methods_df$Significance)

markers_labels_1 <- c("CD4+ (%)",
                      "CD4+CD38+HLA-DR+ (%)", 
                      "CD8+CD38+HLA-DR+ (%)", 
                      "CD4+PD1+ (%)", 
                      "CD8+PD1+ (%)",
                      "CD4+CD103+ (%)",
                      "CD8+CD103+ (%)",
                      "IL-6 (pg/mL)", 
                      "CRP (mg/L)")
for (i in 1:length(immune_markers)){
  linear_methods_df$ImmuneMarker[linear_methods_df$ImmuneMarker == immune_markers[i]] <- markers_labels_1[i]
}

ggplot(linear_methods_df, aes(x = factor(ExplanatoryVariable, levels = explanatory_vars), 
                   y = factor(ImmuneMarker, levels = markers_labels_1))) + 
  geom_raster(aes(fill=Coefficient))+  
  scale_fill_gradient2(low = "deepskyblue3", mid = "white", high = "red2", na.value = "lightgrey", midpoint = 0, 
                       limits = c(-105, 105))+
  geom_text(aes(label=Significance), size = 6) + 
  theme_bw() + 
  xlab("Predictor") + 
  ylab("Immune Marker") +
  theme(text = element_text(size = 18), 
        plot.title = element_text(hjust = 0.5), 
        axis.text.x = element_text(angle = 45, vjust = 1, hjust=1), 
        legend.key = element_rect(colour = "black", size = 1), 
        legend.title = element_text(size = 14),
        legend.text=element_text(size=12)) + 
  guides(fill=guide_legend(title="Coefficient", reverse = T))

Cross-sectional and Longitudinal Analysis of Microbiome Diversity

Alpha Diversity

Get Alpha Diveristy Data

shannon_df <- qiime2R::read_qza("data/DiversityData/shannon_vector.qza")
shannon_df <- shannon_df$data
shannon_df <- tibble::rownames_to_column(shannon_df, "SampleID")
Metadata281 <- Metadata281 %>% 
  dplyr::left_join(shannon_df, by = "SampleID")
shannon_baseline <- Metadata281 %>% filter(Visit == 2)

Get deltas

v2 <- Metadata281 %>% filter(Visit == 2) %>% dplyr::select(PID, shannon_entropy, Cohort, Location)
v3 <- Metadata281 %>% filter(Visit == 3) %>% dplyr::select(PID, shannon_entropy)
delta_df <- dplyr::full_join(v2, v3, by = "PID")

delta_df<- delta_df %>%  
  mutate(delta = shannon_entropy.y - shannon_entropy.x)

Plot

# All statistical calculations were done separately, see Methods 

ann <- data.frame(lx = c(2, 2, 2.5, 2.5, 2.5,2.5),
                  ly = c(8, 8, 7.5, 7.5, 7.5, 7.5),
                  lab = c("*", "____________", "*", "______", "*", "______"),
                  Location = c("Rural", "Rural", "Rural", "Rural", "Urban", "Urban"))
A <- ggplot(shannon_baseline, aes(x = Cohort, y = shannon_entropy)) + 
  geom_boxplot(show.legend = F, 
               coef = 6, 
               alpha = 0.5, aes(fill = Cohort)) + 
  scale_fill_manual(values = c(cohortColors[1],
                               cohortColors[3],
                               cohortColors[5])) + 
  geom_point(show.legend = F,  
             position = position_jitterdodge(0.2), 
             aes(color = Cohort)) + 
  scale_color_manual(values = c("black",
                                 "black",
                                 "black")) +  
  theme_bw() +
  theme(text = element_text(size = 18)) +
  xlab("Cohort") + 
  ylab("Shannon Entropy") + 
  facet_grid(~Location) + 
  geom_text(data = ann, aes(x = lx, y = ly, label = lab), size = 6)

ann <- data.frame(lx = c(1, 1),
                  ly = c(7.5, 7.5),
                  lab = c("*", "_____"),
                  Location = c("Rural", "Rural") )
B <- ggplot(Metadata281, aes(x = Cohort, y = shannon_entropy)) + 
  geom_boxplot(show.legend = T,coef = 6, alpha = 0.5, aes(fill = Week)) + 
  geom_point(show.legend = F, aes(color = Week), position = position_jitterdodge(0.2)) +
  scale_fill_manual(values = c(visitColors[1], visitColors[5])) + 
  scale_color_manual(values = c(visitColors[1], visitColors[5])) + 
  theme_bw() +
  theme(text = element_text(size = 18)) +
  facet_grid(~Location)+
  xlab("Cohort") + 
  ylab("") + 
  geom_text(data = ann, aes(x = lx, y = ly, label = lab), size = 6)

# Model M2
C <- ggplot(delta_df, aes(x = shannon_entropy.x, y = delta, color = Cohort)) +
  geom_point() +
  scale_color_manual(values = c(cohortColors[1],
                               "gray65",
                               cohortColors[5])) +
  geom_smooth(method = "lm", fill = NA)+ 
  theme_bw() + 
  theme(text = element_text(size = 18)) +
  xlab("Baseline Shannon Entropy") + 
  ylab("Delta Shannon Entropy \n (Wk24 - Wk0)") + 
  stat_cor(label.x = 5)+
  ylim(-2,3)

cowplot::plot_grid(A, B, C,
          labels = c("A", "B", "C"),
          ncol = 3, nrow = 1 ,
          rel_widths = c(0.7, 1, 1))

Beta Diversity

Function for replacing taxon labels

# takes a row containing taxon names and returns last two known levels
# e.g. d__Eukaryota;p__Incertae_Sedis;c__Incertae_Sedis;o__Incertae_Sedis;f__Incertae_Sedis;g__Blastocystis
# to f__Prevotellaceae; g__Prevotella
getNiceTaxon <- function(rowVector){
  newVector <- c()
  for(rowValue in rowVector){
    if(substr(rowValue, 1, 6) != "module"){
      newName <- tail(strsplit(rowValue, split = ";")[[1]], 2)
      newName <- paste0(newName[1], ";", newName[2])
      newVector <- c(newVector, c(newName))
    }else{
      newVector <- c(newVector, c(rowValue))
    }
  }
  return(newVector)
}

Biplots

Preparing the Data

# biom table 
biom <- qiime2R::read_qza("data/BiplotsData/tax_filtered_281.qza")
biom <- as_tibble(biom$data, rownames = 'ASV')

# subset of metadata
sub_meta <- readr::read_delim("data/subset_Metadata281.tsv", 
                           delim = "\t", 
                           escape_double = FALSE, 
                           col_types = cols(Location = col_factor(levels = c("Rural","Urban")), 
                                            Week = col_factor(levels = c("0","24")), 
                                            HIV1_viral_outcome = col_factor(levels = c("Suppressed", 
                                                                                       "Viremic"))), 
                           trim_ws = TRUE)

# Weighted UniFrac PCoA
w_uni <- qiime2R::read_qza("data/DiversityData/weighted_unifrac_pcoa_results.qza")$data
w_uni_pcoa <- as_tibble(w_uni$Vectors)

# get proportions explained
w_uni_prop <- w_uni$ProportionExplained[1:3]

# get biplot from qiime 
q_biplot <- qiime2R::read_qza("data/BiplotsData/biplot.qza")
q_biplotSpecies <- as_tibble(q_biplot$data$Species)
q_biplotVectors <- as_tibble(q_biplot$data$Vectors)

# get taxonomy and replace with better labels 
tax <- qiime2R::read_qza("data/BiplotsData/classification.qza")
tax <- tax$data 
colnames(tax)[1] <- "FeatureID"

q_biplotSpecies <- q_biplotSpecies %>% 
  dplyr::left_join(tax, by = "FeatureID") %>% 
  dplyr::select(Taxon, PC1, PC2, PC3)
colnames(q_biplotSpecies)[1] <- "FeatureID"

q_biplotSpecies %>% 
  dplyr::mutate(FeatureID = getNiceTaxon(FeatureID)) -> abbrBiplotSpecies

# prepare sample data 
w_uni_pcoa <- w_uni_pcoa %>% 
  dplyr::select(SampleID, PC1, PC2, PC3) %>% 
  dplyr::left_join(sub_meta, by = "SampleID")

w_uni_pcoa$Cohort <- as.factor(w_uni_pcoa$Cohort)
w_uni_pcoa$Cohort <- ordered(w_uni_pcoa$Cohort, levels = c("Naïve", "Exp", "HC"))

Figure Creation

# prep axis labels 
pc1_lab <- paste0('PC1 (', round(w_uni_prop[1],3)*100, '%)')
pc2_lab <- paste0('PC2 (', round(w_uni_prop[2],3)*100, '%)')
pc3_lab <- paste0('PC3 (', round(w_uni_prop[3],3)*100, '%)')

# create base plot with arrows for first plot 
abbrBiplotSpecies %>% 
  mutate(a=sqrt(PC1^2+PC2^2)) %>% # calculate the distance from the origin
  top_n(4, a) %>% #keep 8 furthest away points
  mutate(PC1=PC1*5, PC2=PC2*5) -> abbrBiplotSpeciesArrows # scale arrows linearly

new_lables <- c()
for (row in 1:nrow(abbrBiplotSpeciesArrows)) {
  tax_name <- abbrBiplotSpeciesArrows$FeatureID[row]
  genus <- tail(strsplit(tax_name, split = ";")[[1]], n=1)
  new_lables <- c(new_lables, genus)
}

abbrBiplotSpeciesArrows <- cbind(abbrBiplotSpeciesArrows, new_lables)

A <- ggplot(w_uni_pcoa, aes(x = PC1, y = PC2, color = Cohort)) + 
  geom_point(show.legend = F, size = 5 ,alpha = 0.6) + 
  scale_color_manual(values = c(cohortColors[1],
                               "grey65",
                               cohortColors[5])) + 
  theme_bw(base_size = 18) + 
  xlab(pc1_lab) +
  ylab(pc2_lab) + 
  geom_segment(data = abbrBiplotSpeciesArrows, aes(x=0, xend=PC1, y=0, yend=PC2),
               arrow = arrow(length = unit(0.3,"cm")),color = 'black')+
  geom_text_repel(data = abbrBiplotSpeciesArrows, aes(x=PC1, y=PC2, label=new_lables),size=7, color = 'black')




# second plot 
abbrBiplotSpecies %>% 
  mutate(a=sqrt(PC1^2+PC3^2)) %>% # calculate the distance from the origin
  top_n(4, a) %>% #keep 8 furthest away points
  mutate(PC1=PC1*5, PC3=PC3*5) -> abbrBiplotSpeciesArrows # scale arrows linearly

new_lables <- c()
for (row in 1:nrow(abbrBiplotSpeciesArrows)) {
  tax_name <- abbrBiplotSpeciesArrows$FeatureID[row]
  genus <- tail(strsplit(tax_name, split = ";")[[1]], n=1)
  new_lables <- c(new_lables, genus)
}

abbrBiplotSpeciesArrows <- cbind(abbrBiplotSpeciesArrows, new_lables)

B <- ggplot(w_uni_pcoa, aes(x = PC1, y = PC3, color = Cohort)) + 
  geom_point(size = 5, alpha = 0.6 ) + 
  scale_color_manual(values = c(cohortColors[1],
                                "grey65",
                                cohortColors[5])) + 
  theme_bw(base_size = 18) + 
  xlab(pc1_lab) +
  ylab(pc3_lab) + 
  geom_segment(data = abbrBiplotSpeciesArrows, aes(x=0, xend=PC1, y=0, yend=PC3),
               arrow = arrow(length = unit(0.3,"cm")),color = 'black')+
  geom_text_repel(data = abbrBiplotSpeciesArrows, aes(x=PC1, y=PC3, label=new_lables),size=7, color = 'black')

cowplot::plot_grid(A, B, 
                   labels = c("A", " "), 
                   ncol = 2, nrow = 1)

Dysbiosis and Delta Dysbiosis

Beta Data Read In and Wrangling

beta_df <- data.frame(matrix(ncol = 2, nrow = 0))
colnames(beta_df) <- c("SampleID", "Dysbiosis") 

distance_matrix<- qiime2R::read_qza("data/DiversityData/weighted_unifrac_distance_matrix.qza")
distance_matrix <- distance_matrix$data %>% as.matrix()
distance_matrix[upper.tri(distance_matrix)] <- NA
diag(distance_matrix) <- NA
distance_matrix <- distance_matrix %>% as.data.frame()
distance_matrix <- tibble::rownames_to_column(distance_matrix, "SampleID")

# calculate average distance from PLWH to healthy controls for samples at 
# each time point 
weeks <- c(0, 24)
for (current_week in weeks){
  
  sampleIDs <- Metadata281 %>% filter(Cohort_Short != 'C' & Week == current_week)
  AB_SIDs <- sampleIDs$SampleID
  sampleIDs <- Metadata281 %>% filter(Cohort_Short == 'C'& Week == current_week)
  C_SIDs <- sampleIDs$SampleID
  
  dysbiosis_matrix <- distance_matrix[distance_matrix$SampleID %in% C_SIDs, ]
  
  for (sample in AB_SIDs) {
    avg <- mean(dysbiosis_matrix[[sample]], na.rm = T)
    beta_df[nrow(beta_df)+1, ] <- c(sample, avg)
  }
}

# remove NAs 
beta_df <- beta_df[!is.na(beta_df$Dysbiosis), ]
beta_df <- beta_df %>% filter(Dysbiosis != "NaN")

# create subset and baseline 
meta_sub <- Metadata281 %>% dplyr::select(SampleID, Cohort, Week, PID, Location)
beta_df <- dplyr::left_join(beta_df, meta_sub, by = "SampleID")
beta_df$Dysbiosis <- as.numeric(beta_df$Dysbiosis)
baseline_beta <- beta_df %>% 
  filter(Week == 0) %>% 
  dplyr::select(PID, Dysbiosis)

# calculate deltas
bv3 <- beta_df %>% filter(Week == 24) %>% dplyr::select(PID, Dysbiosis, Cohort, Location)
beta_delta <- dplyr::left_join(baseline_beta, bv3, by = "PID")
beta_delta <- beta_delta %>% mutate(Delta_Dysbiosis = Dysbiosis.y - Dysbiosis.x)

Plotting

# Model M3
B <- ggplot(na.omit(beta_delta), aes(x = Dysbiosis.x, y = Delta_Dysbiosis, color = Cohort)) +
  geom_point() +
  geom_smooth(method = "lm", fill = NA)+ 
  scale_color_manual(values = c(cohortColors[1],
                                "gray65")) +
  theme_bw() + 
  theme(text = element_text(size = 18)) +
  xlab("Baseline Dysbiosis") + 
  ylab("Delta Dysbiosis \n (Wk24 - Wk0)") + 
  stat_cor(label.x = 0.5) 


C <- ggplot(na.omit(beta_delta), aes(x = Dysbiosis.x, y = Delta_Dysbiosis, color = Location)) +
  geom_point() +
  geom_smooth(method = "lm", fill = NA)+ 
  theme_bw() + 
  theme(text = element_text(size = 18)) +
  xlab("Baseline Dysbiosis") + 
  ylab(" ") + 
  stat_cor(label.x = 0.6)

cowplot::plot_grid(B, C, 
                   labels = c("B", "C"), 
                   ncol = 2, nrow = 1)

Performing Adonis Tests

# reload metadata to retain as many samples as possible 
Metadata281_full <- read_excel("data/Metadata281.xlsx")

Metadata281_full <- subset(Metadata281_full, Metadata281_full$SampleID != "ZIM026.2" &
                         Metadata281_full$SampleID != "ZIM026.3" &
                         Metadata281_full$SampleID != "ZIM008.3" & 
                         Metadata281_full$SampleID != "ZIM062.3")


Metadata281_full$Location[Metadata281_full$Arm_Short == "MA"] <- "Urban"
Metadata281_full$Location[Metadata281_full$Arm_Short == "MT"] <- "Rural"
Metadata281_full$Cohort[Metadata281_full$Cohort_Short == "A"] <- "Naïve"
Metadata281_full$Cohort[Metadata281_full$Cohort_Short == "B"] <- "Exp"
Metadata281_full$Cohort[Metadata281_full$Cohort_Short == "C"] <- "HC"
Metadata281_full$Week[Metadata281_full$Visit == 2] <- 0
Metadata281_full$Week[Metadata281_full$Visit == 3] <- 24
Metadata281_full$Cohort <- factor(Metadata281_full$Cohort, levels = c("Naïve", "Exp", "HC"))

Metadata281_full$Treatment_stat[Metadata281_full$Cohort_Short == "A" & Metadata281_full$Visit == 2] <- "untreated"
Metadata281_full$Treatment_stat[Metadata281_full$Cohort_Short == "A" & Metadata281_full$Visit == 3] <- "treated"
Metadata281_full$Treatment_stat[Metadata281_full$Cohort_Short == "B"] <- "treated"
Metadata281_full$Treatment_stat[Metadata281_full$Cohort_Short == "C"] <- "untreated"

# read in weighted unifrac distance matrix 
distance_matrix<- qiime2R::read_qza("data/DiversityData/weighted_unifrac_distance_matrix.qza")
distance_matrix <- distance_matrix$data %>% as.matrix()
distance_matrix <- distance_matrix %>% as.data.frame()
distance_matrix <- tibble::rownames_to_column(distance_matrix, "SampleID")
indices <- order(distance_matrix$SampleID)
distance_matrix <- distance_matrix[,2:ncol(distance_matrix)]
distance_matrix <- distance_matrix[indices, indices]

adonis_data <- Metadata281_full[Metadata281_full$SampleID %in% colnames(distance_matrix),]

# make sure they are in the same sample order
adonis_data$SampleID == colnames(distance_matrix)

##   [1] TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE
##  [16] TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE
##  [31] TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE
##  [46] TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE
##  [61] TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE
##  [76] TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE
##  [91] TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE
## [106] TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE
## [121] TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE
## [136] TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE
## [151] TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE
## [166] TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE
## [181] TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE
## [196] TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE
## [211] TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE
## [226] TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE
## [241] TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE
## [256] TRUE TRUE TRUE TRUE

# Model M4
adonis2(as.dist(distance_matrix) ~ adonis_data$Location + 
          adonis_data$Location*adonis_data$Week, 
        strata = adonis_data$PID)

## Permutation test for adonis under reduced model
## Terms added sequentially (first to last)
## Blocks:  strata 
## Permutation: free
## Number of permutations: 999
## 
## adonis2(formula = as.dist(distance_matrix) ~ adonis_data$Location + adonis_data$Location * adonis_data$Week, strata = adonis_data$PID)
##                                        Df SumOfSqs      R2      F Pr(>F)   
## adonis_data$Location                    1    2.870 0.02181 5.7291  0.029 * 
## adonis_data$Week                        1    0.853 0.00648 1.7034  0.010 **
## adonis_data$Location:adonis_data$Week   1    0.152 0.00115 0.3034  0.795   
## Residual                              255  127.734 0.97056                 
## Total                                 258  131.610 1.00000                 
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1

# Model M5
adonis2(as.dist(distance_matrix) ~ adonis_data$Visit_Age + 
          adonis_data$HIV_Status + 
          adonis_data$Treatment_stat, 
        strata = adonis_data$PID)

## Permutation test for adonis under reduced model
## Terms added sequentially (first to last)
## Blocks:  strata 
## Permutation: free
## Number of permutations: 999
## 
## adonis2(formula = as.dist(distance_matrix) ~ adonis_data$Visit_Age + adonis_data$HIV_Status + adonis_data$Treatment_stat, strata = adonis_data$PID)
##                             Df SumOfSqs      R2      F Pr(>F)  
## adonis_data$Visit_Age        1    2.721 0.02067 5.4688  0.086 .
## adonis_data$HIV_Status       1    1.045 0.00794 2.1004  0.114  
## adonis_data$Treatment_stat   1    0.991 0.00753 1.9928  0.051 .
## Residual                   255  126.853 0.96386                
## Total                      258  131.610 1.00000                
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1

Differential Abundance Analysis

Get Data

# create phyloseq object 
pseq <- qiime2R::qza_to_phyloseq(features = "data/ANCOMBCData/tax_filtered_281.qza", 
                                 taxonomy = "data/ANCOMBCData/classification.qza", 
                                 metadata = "data/subset_Metadata281.tsv", 
                                 tree = "data/ANCOMBCData/tree_Zim.qza")

# Change factored variable in metadata 
meta <- pseq %>% sam_data() %>% as.data.frame()
meta$Cohort <- factor(meta$Cohort, levels = c("HC", "Naïve", "Exp"))
sample_data(pseq) <- sample_data(meta)

# get baseline 
pseq.baseline <- pseq
baseline <- pseq %>% sample_data() %>% as.data.frame()
baseline <- baseline[baseline$Week == 0]
sample_data(pseq.baseline) <- sample_data(baseline)

# only look at cohort A
pseq.A <- pseq
cohortA <- pseq %>% sample_data() %>%  as.data.frame()
cohortA <- cohortA[cohortA$Cohort == "Naïve"]
sample_data(pseq.A) <- sample_data(cohortA)

Run ANCOMBC

# Model M6
out <- ancombc2(
  data = pseq.baseline, 
  tax_level = 'Genus',
  fix_formula = 'Location + HIV1_viral_outcome + Cohort',
  group = 'Cohort'
)

# Model M7
# out <- ancombc2(
#   data = pseq, 
#   tax_level = 'Genus',
#   fix_formula = 'Location + Week + HIV1_viral_outcome + Cohort',
#   rand_formula = '(1 | PID)', 
#   group = 'Cohort'
# )

# Model M8
# out <- ancombc2(
#   data = pseq.A, 
#   tax_level = 'Genus',
#   fix_formula = 'Location + Week + HIV1_viral_outcome',
#   rand_formula = '(1 | PID)'
# )

View ANCOMBC results

# Results 
res <- out$res

## Location results 
df_location <- res %>% 
  dplyr::select(taxon, contains("Location")) 

## Week results 
df_week <- res %>% 
  dplyr::select(taxon, contains("Week"))

## Viral Outcome results 
df_vo <- res %>% 
  dplyr::select(taxon, contains("Viral"))

## Cohort results
df_cohort <- res %>% 
  dplyr::select(taxon, contains("Cohort"))

Plot

# put results data in correct format 
df_cohort_fig <- df_cohort %>% 
  dplyr::filter(diff_CohortExp == 1 | diff_CohortNaïve == 1) %>%  
  dplyr::mutate(lfc_CohortNaïve = ifelse(diff_CohortNaïve == 1, 
                                  lfc_CohortNaïve, 0), 
         lfc_CohortExp = ifelse(diff_CohortExp == 1, 
                               lfc_CohortExp, 0) 
  ) %>% 
  dplyr::transmute(taxon, 
            `Exp vs HC` = round(lfc_CohortExp, 2),
            `Naïve vs HC` = round(lfc_CohortNaïve, 2)) %>% 
  tidyr::pivot_longer(cols = `Exp vs HC`:`Naïve vs HC`, 
               names_to = "group", values_to = "value") %>% 
  dplyr::arrange(taxon)

# plot 
lo <- floor(min(df_cohort_fig$value))
up <- ceiling(max(df_cohort_fig$value))
mid <- (lo + up)/2
df_cohort_fig %>%
  mutate(taxon = paste(substr(taxon, 7, 100000L))) %>% 
  ggplot(aes(x = factor(group, level = c("Naïve vs HC","Exp vs HC")), y = taxon, fill = value)) + 
  geom_tile(color = "black") +
  scale_fill_gradient2(low = "deepskyblue3", high = "red2", mid = "white", 
                       na.value = "white", midpoint = mid, limit = c(lo, up),
                       name = NULL) +
  geom_text(aes(group, taxon, label = value), color = "black", size = 6) +
  labs(x = NULL, y = NULL) +
  ggtitle("Baseline") + 
  theme_minimal() +
  theme(plot.title = element_text(hjust = 0.5), 
        text = element_text(size = 18))

Integrative Analysis of Immune Markers and Microbiome

Building Predictive Models for Immune Markers

Data Preprocessing

# get microbiome diversity data 
shannon_df <- read_qza("data/DiversityData/shannon_vector.qza")
shannon_df <- shannon_df$data
shannon_df <- tibble::rownames_to_column(shannon_df, "SampleID")

w_uni <- read_qza("data/DiversityData/weighted_unifrac_pcoa_results.qza")$data
w_uni_pcoa <- as_tibble(w_uni$Vectors)
w_uni_pcoa <- w_uni_pcoa %>% 
  dplyr::select(SampleID, PC1, PC2, PC3, PC4)

# add to metadata
Metadata281 <- Metadata281 %>% 
  left_join(shannon_df, by = "SampleID") %>% 
  left_join(w_uni_pcoa, by = "SampleID") 

# get immune marker table and feature table 
IM_data <- Metadata281 %>% dplyr::select(SampleID, all_of(immune_markers)) 
features  <- Metadata281 %>% 
  dplyr::select(SampleID, Cohort, Location, Gender, BMI, BMI_Cat, HIV_Status, Week, Visit_Age,
         Bristol_Score, Education_Level, Water_Source_Num, Cotrimoxazole_HIV, 
         Cotri_Length_Month, Normal_work_transportation, Manual_work_job,
         Manual_chores_home, Head_of_household, LenghtOnArt_yrs) %>% 
  left_join(shannon_df, by = "SampleID") %>% 
  left_join(w_uni_pcoa, by = "SampleID") 

# some preprocessing to address NAs 
# there were 20 samples without microbiome informations due to quality control
features$Cotrimoxazole_HIV[features$Cotrimoxazole_HIV == "NA"] <- NA
features$Cotrimoxazole_HIV[is.na(features$Cotrimoxazole_HIV)] <- "No"
features$Cotrimoxazole_HIV <- droplevels(features$Cotrimoxazole_HIV)
features$Normal_work_transportation[features$Normal_work_transportation == "NA"] <- NA
features$Normal_work_transportation <- droplevels(features$Normal_work_transportation)
features$BMI_Cat[is.na(features$BMI_Cat)] <- "Moderate_thinnes"
features$Head_of_household <- droplevels(features$Head_of_household)

Feature Selection

# backwards stepwise regression feature selection
formulas <- vector(mode="list")

for(marker in immune_markers){
  print(marker)
  # merge immune marker with features 
  IM_one <- IM_data %>% 
    dplyr::select(SampleID, marker)
  step_data <- features %>% 
    left_join(IM_one, by = "SampleID") %>% 
    dplyr::select(-SampleID) %>% 
    na.omit()
  
  ## we want the model to have location 
  f <- as.formula(paste0(marker," ~ Location"))
  myM <- lm(f, data = step_data)
  
  ## model with everything 
  fB <- as.formula(paste0(marker, " ~."))
  biggest <- lm(fB, data = step_data)
  
  ## stepwise regression, direction = "backward"
  bwd <- step(biggest, 
              trace = 1, 
              direction = "backward", 
              scope = list(lower = myM, upper = biggest))
  
  bwd_formula <- paste0(bwd$call)[2]
  formulas[[marker]] <- bwd_formula
}

Model Building & Testing

# for models involving PLWH we viral load 
# for models involving ART exp PLWH we added length on ART 

# made models dataset 
lm_data <- as.data.frame(matrix(nrow = 0, ncol = 4))
colnames(lm_data) <- c("Marker", "Week", "Cohort", "Formula")

weeks <- c(0, 24)
Cohorts <- c("Naïve", "Exp", "HC")

for(marker in immune_markers){
  for(week in weeks){
    for(cohort in Cohorts){
      if(cohort == "Naïve"){
        form <- paste0(formulas[marker], "+ HIV1_viral_load")
      }
      if(cohort == "Exp"){
        form <- paste0(formulas[marker], "+ HIV1_viral_load + LenghtOnArt_yrs")
      }
      if(cohort == "HC"){
        form <- paste0(formulas[marker])
      }
      r <- c(marker, week, cohort, form)
      lm_data[nrow(lm_data)+1, ] <- r
    }
  }
}

# run models and make results 
sub_d <- Metadata281 %>% 
  filter(Cohort == "HC" & Week == 24)
f <- as.formula(CRP_mg_L ~ Location + BMI + Visit_Age + PC1 )
summary(lm(f, data = sub_d))

Plotting

lm_table <- read_excel("data/PredictiveModeling/LinearModelInfo.xlsx")

lm_table$Tested[lm_table$Tested == "Yes"] <- "Included in Model"
lm_table$Cohort <- factor(lm_table$Cohort, levels = c("Naïve", "Exp", "HC"))
lm_table$Tested <- factor(lm_table$Tested, levels = c("Up", "Up (Model NS)", "Down", 
                                                      "Down (Model NS)", "Included in Model"))

im_labels <- c("CD4+CD38+HLA-DR+ (%)", 
                "CD8+CD38+HLA-DR+ (%)", 
                "CD4+PD1+ (%)", 
                "CD8+PD1+ (%)",
                "CD4+CD103+ (%)",
                "CD8+CD103+ (%)",
                "IL-6 (pg/mL)", 
                "CRP (mg/L)", 
                "CD4+ (%)")
lm_table$`Immune Marker` <- factor(lm_table$`Immune Marker`, levels = im_labels)

cols <- colnames(lm_table)
lm_table[cols] <- lapply(lm_table[cols], factor)



ggplot(lm_table, aes(x = Predictor, y = `Immune Marker`)) + 
  geom_point(aes(color = Tested), size = 6) + 
  scale_color_manual(values = c("Up" = "red2",
                                "Up (Model NS)"="peachpuff",
                                "Down"="deepskyblue3", 
                                "Down (Model NS)" = "paleturquoise", 
                                "Included in Model" = "lightgrey")) + 
  theme_bw() + 
  theme(text = element_text(size = 18), 
        axis.text.x = element_text(angle = 45, vjust = 1, hjust=1), 
        legend.title = element_blank()) +
  ylab("Immune Marker") +
  facet_grid(Cohort~Week)

Network Analysis of Immune and Microbial Associations

Format Mb Data

library(MicrobiotaProcess)

# making a pretty taxa table 
tax <- read_qza("data/NetworkAnalysis/dataFrom/classification.qza")
  
#head(tax$taxatab)
tax$taxatab$sequence = seq_along(rownames(tax$taxatab))

getNiceTaxon <- function(rowVector){
  newVector <- c()
  for(rowValue in 1:length(rowVector)){
    #print(rowVector[rowValue])
    if(substr(rowVector[rowValue], 1, 6) != "module"){
     newName <- paste(tax$taxatab[rowVector[rowValue],], collapse=";")

      #newName <- tax$taxatab[rowVector[rowValue],]
      newName <- tail(strsplit(newName, split = ";")[[1]], 3)
      newName <- paste0(newName[1], ".", newName[2],"_",newName[3])
      newVector <- c(newVector, c(newName))
    }else{
      newVector <- c(newVector, c(rowVector[rowValue]))
    }
  }
  return(newVector)
}  
 

# Filter SCNIC output for ASVs that are not seen in less than 20% of samples
# before loading in convert SCNIC biom to TSV
# remove first line in the TSV 

SCNIC_output <- read_delim("data/NetworkAnalysis/dataFrom/table_collapse.txt", 
                           delim = "\t", escape_double = FALSE, 
                           trim_ws = TRUE)
colnames(SCNIC_output)[1] <- "OTU_ID"

# convert to proper format 
t_SO <- data.frame(t(SCNIC_output[,-1]))
colnames(t_SO) <- SCNIC_output$OTU_ID

#Compute relative abundance

reads = rowSums(t_SO) #not same, so not rarefied
scRA = t_SO/reads
rowSums(scRA) #sums to 1


# remove ASVs prevalent in less than 20% of samples 
# goes from 8853 to 237 ASVs/modules
# goes from 338 to 106 taxa/modules 

count0s<- function(col) {
  zeros <- sum(col==0)
  perc <- length(col)*0.2
  length(col) - zeros > perc 
}


scRAthin = scRA[,apply(scRA, 2, count0s)]

head(t_SO[,"ae988b9b4c6c1e2466fa26cd1e2ec5c7"])


newCols = getNiceTaxon(colnames(scRAthin))
colnames(scRAthin) = newCols

scRAthin$SampleID <- row.names(scRAthin)
scRAthin = cbind(reads, scRAthin)

write.csv(scRAthin, "data/NetworkAnalysis/dataOut/mb.csv", quote=FALSE, row.names=FALSE)
write.csv(colnames(scRAthin), "data/NetworkAnalysis/dataOut/mbH.csv", quote=FALSE, row.names=FALSE)

Combine Immune and Microbiome data

im = read.csv("data/NetworkAnalysis/dataOut/immuneMarkersImputed.csv")
mb = read.csv("data/NetworkAnalysis/dataOut/mb.csv")
h = read.csv("data/NetworkAnalysis/dataOut/mbH.csv")
colnames(mb) = h$x

#Add prefixes to data

colnames(im)[5:ncol(im)] = paste("im_", colnames(im)[5:ncol(im)], sep="")
colnames(mb)[2:(ncol(mb)-1)] = paste("mb_", colnames(mb)[2:(ncol(mb)-1)], sep="")

#Merge data
#Have less data with im than mb
d = merge(im, mb, by.x="SampleID", by.y="SampleID")


#separate into Week0 and Week24
d0 = subset(d, Week==0) #127 rows
d24 = subset(d, Week==24) #113.  Delta would be pretty good

#Save files and column names
write.csv(d0, "data/NetworkAnalysis/dataOut/imMbW0.csv", quote=FALSE, row.names=FALSE)
write.csv(colnames(d0), "data/NetworkAnalysis/dataOut/imMbW0_h.csv", quote=FALSE, row.names=FALSE)


write.csv(d24, "data/NetworkAnalysis/dataOut/imMbW24.csv", quote=FALSE, row.names=FALSE)
write.csv(colnames(d24), "data/NetworkAnalysis/dataOut/imMbW24_h.csv", quote=FALSE, row.names=FALSE)

Derive and Filter Immune:Microbe regressions

##########Parameters
tag = "3sW0"

dataOut = paste("data/NetworkAnalysis/dataOut/models_", tag, "June2023.csv", sep="")

##########Begin processing
dd = read.csv("data/NetworkAnalysis/dataOut/imMbW0.csv")
h = read.csv("data/NetworkAnalysis/dataOut/imMbW0_h.csv")
colnames(dd) = h$x

#dd = read.csv("data/NetworkAnalysis/dataOut/imMbW24.csv")
#h = read.csv("data/NetworkAnalysis/dataOut/imMbW24_h.csv")
#colnames(dd) = h$x

#Which arm is reference?  HC
dd$Cohort = factor(dd$Cohort, levels = c("Naive", "Exp", "HC"))

coCol = which(colnames(dd)=="Cohort")

#Rename this to Grp
colnames(dd)[coCol] = "Grp"

#read count column
rc = which(colnames(dd)=="reads")

#must know where in file which "ome" starts and stops.  These are column numbers

myOmes = c("im_", "mb_")

omeCols = lapply(myOmes, function(x) grep(paste("^", x, sep=""), colnames(dd)))

cc = combinations(n=length(myOmes),r=2)

#want to regress, e.g.: im ~ mb

#create function regress
regress = function(oA, oB, assays)
{
    #oA = o1
    #oB = o2
    
    for (h in oA)
    {   
        #print(h)
        #h = 6
        ana1 = colnames(dd)[h]
        print(ana1)
        
        for (i in oB)
        {    
            #i=15
            #print(paste(h,i))
            ana2 = colnames(dd)[i]
            print(paste(ana1,ana2, sep=","))
            #subset for complete cases.
            #not thin yet, so this works    
            dcc = dd[complete.cases(dd[, c(h,i,rc,coCol)]), c(h,i,rc,coCol) ]
                        
            #print(nrow(dcc))
            n=nrow(dcc)
            nNot0 = sum(dcc[,ana2] > 0)
            if(min(table(dcc$Grp)) > 10)
            {
                                        
                    #3 slope, readcount in model
                    m = lm(dcc[,ana1] ~ dcc$reads + dcc$Grp + dcc[,ana2]*dcc$Grp)
                    adjRsq = summary(m)$adj.r.squared
                    dffitsMax = max(abs(dffits(m)))
                    fParts = summary(m)$fstatistic
                     pF = pf(fParts[1], fParts[2], fParts[3], lower.tail=FALSE)
                    tt = summary(m)$coefficients
                    if (nrow(tt) == 7)
                    {
                        pRc = tt[2,4]
                        pNaive = tt[5,4]
                          pExp = tt[6,4]
                          pHC = tt[7,4]
                          ExpSlope = tt[6,1] + tt[5,1]
                          HCSlope = tt[7,1] + tt[5,1]
                          q13 = quantile(dcc[,ana2], c(.25,.75))
                          NaiveIqrE = tt[5,1] * (q13[2] - q13[1])
                          HCIqrE = HCSlope * (q13[2] - q13[1])
                          ExpIqrE = ExpSlope * (q13[2] - q13[1])                                                
                        #append to output file
                        
                        write(paste(assays, ana1, ana2, n, nNot0, adjRsq, pF, pNaive, pExp, pHC, NaiveIqrE,  ExpIqrE, HCIqrE, dffitsMax, pRc, sep=","), dataOut, append=TRUE)

                        
                    }
            } 
        }   
    }
}

#Create header row for output file.
#Writing these results directly to file performs well, and sets stage for permutation testing.
#Should something go wrong in a long job, it is possible to restart without losing the data previously generated, since it's in the output file.
header = paste("assays", "ana1", "ana2", "n", "nNot0", "adjRsq", "pF", "pNaive", "pExp", "pHC", "IqrNaive",  "IqrExp", "IqrHC", "maxInfluence", "pRc", sep=",")

write(header, dataOut, append=FALSE)
#Add timer for general information
tm0 = proc.time()

for (k in 1:nrow(cc))
{

  #k = 2
  print(paste("processing row ", k))
  #omeA = myOmes[1]
  oA = omeCols[[cc[k,1]]]
  #shorten array for testing purposes
  #oA = omeCols[[cc[k,1]]][1:10]

  oB = omeCols[[cc[k,2]]]
  #omeCols[[oA]]
  #omeCols[[oB]]
  regress(oA, oB, gsub("_", "", paste(myOmes[cc[k,1]], myOmes[cc[k,2]], sep=".")))
}
proc.time() - tm0





#Explore results
d = read.csv(dataOut)
d$pAdjFDR = p.adjust(d$pF, method="fdr")


dBest = subset(d, pAdjFDR < 0.2 & maxInfluence < 4 & pNaive < 0.05 & (pExp < 0.05 | pHC < 0.05) & adjRsq > 0.15)

dBestStrict = subset(d, pAdjFDR < 0.2 & maxInfluence < 2 & pNaive < 0.05 & (pExp < 0.05 | pHC < 0.05) & adjRsq > 0.25)

sort(table(c(dBest$ana1, dBest$ana2)))

#bos=best of show
bosOut = paste("data/NetworkAnalysis/dataOut/bos_", tag, "June2023.csv", sep="")
write.csv(dBest, bosOut, quote=FALSE, row.names=FALSE)


sort(table(c(dBestStrict$ana1, dBestStrict$ana2)))
bosOutStrict = paste("data/NetworkAnalysis/dataOut/bos_", tag, "Strict_June2023.csv", sep="")
write.csv(dBestStrict, bosOutStrict, quote=FALSE, row.names=FALSE)

Create Network Diagram

d = read.csv("data/NetworkAnalysis/dataOut/bos_3sW0Strict_June2023.csv", stringsAsFactors=FALSE)
gg = graph.data.frame(d[,c("ana1","ana2")], directed=FALSE)

#retrieve niceNames
niceNames = read.csv("data/NetworkAnalysis/dataFrom/bosNamesWithNice.csv")
niceNames$nice = gsub("\\n", "\n", niceNames$nice, fixed=TRUE)

#perform lookup
featureToNice = list()
featureToNice[niceNames$orig] = as.character(niceNames$nice)
myNiceNames = unlist(sapply(V(gg)$name, function(x) featureToNice[[x]] ))



#1 solid
d$slope = "1"
#2 dotted
d$slope[d$IqrNaive < 0] = "2"

#red positive, blue negative
edgeColors = c("red2", "deepskyblue3")

myRgb = col2rgb(edgeColors)

myTint = 0.45
newColors = myRgb + (255 - myRgb) * myTint

newColorsRgb = rgb(t(newColors), maxColorValue=255)

d$edgeColor = newColorsRgb[1]
d$edgeColor[d$IqrNaive < 0] = newColorsRgb[2]


#Vertex shape:  circle normally, square for module
myShapes = rep("circle",length(V(gg)))
myShapes[grep("module",V(gg)$name)] = "square"

assays = c("im", "mb")

myColorsRedPink = c('#feebe2','#fa9fb5')


myRgb = col2rgb(myColorsRedPink)

myTint = 0.45
newColors = myRgb + (255 - myRgb) * myTint

newColorsRgb = rgb(t(newColors), maxColorValue=255)

myColors = newColorsRgb


V(gg)$color=myColors[1]
for(i in 2:length(assays))
{
  myExpr = paste("^", assays[i], sep="")
  V(gg)$color[grep(myExpr, V(gg)$name, fixed=FALSE)] = myColors[i]
}
myColor = V(gg)$color



myEdgeWidth = -1*log10(d$pNaive)
myEdgeLty = as.numeric(d$slope)

#To adjust label position, use distance, degree pairs
#Set vertex.label.degree and vertex.label.dist in the plot command
myDist = rep(0,length(V(gg)))
myDeg = rep(0,length(V(gg)))

myDist[which(myNiceNames=="g_Clostridia_vadinBB60_group")] = .5
myDeg[which(myNiceNames=="g_Clostridia_vadinBB60_group")] = -pi/2

myDist[which(myNiceNames=="g_Blautia")] = .5
myDeg[which(myNiceNames=="g_Blautia")] = pi/2

myDist[which(myNiceNames=="g_Ruminococcus_torques_group")] = 3
myDeg[which(myNiceNames=="g_Ruminococcus_torques_group")] = pi

myDist[which(myNiceNames=="s_Massiliprevotella_massiliensis")] = 2.5
myDeg[which(myNiceNames=="s_Massiliprevotella_massiliensis")] = pi

myDist[which(myNiceNames=="g_Solobacterium")] = .3
myDeg[which(myNiceNames=="g_Solobacterium")] = pi/2


set.seed(20)
lay = layout_with_fr(gg)

par(mar = rep(2, 4))

plot(gg, vertex.label.family="sans", vertex.label= myNiceNames,  vertex.shape = myShapes, vertex.label.cex=.7, vertex.label.font=1, vertex.label.color="black", edge.lty=1, edge.label=NA, edge.label.font=1, edge.label.cex=.8, edge.label.color=NA, edge.label.family="sans", edge.curved=.1, edge.width=myEdgeWidth, edge.color=d$edgeColor, vertex.label.dist=myDist, vertex.label.degree=myDeg, vertex.frame.color="darkgray", layout=lay, vertex.color=V(gg)$color, vertex.frame.lty=2, vertex.width=4)

legend(x="topleft", assays, pch=21,
       col="#777777", pt.bg=myColors, pt.cex=1.3, cex=.8, bty="n", ncol=1)

Derive and Filter Immune:Microbe Regressions, Exp v HC

Create Subsets of Data for Exp and HC only

dd = read.csv("data/NetworkAnalysis/dataOut/imMbW0.csv")
h = read.csv("data/NetworkAnalysis/dataOut/imMbW0_h.csv")
colnames(dd) = h$x

ds = subset(dd, Cohort != "Naive") #75

write.csv(ds, "data/NetworkAnalysis/dataOut/imMbW0_ExpHC.csv", quote=FALSE, row.names=FALSE)

dd = read.csv("data/NetworkAnalysis/dataOut/imMbW24.csv")
h = read.csv("data/NetworkAnalysis/dataOut/imMbW24_h.csv")
colnames(dd) = h$x

ds = subset(dd, Cohort != "Naive") #69

write.csv(ds, "data/NetworkAnalysis/dataOut/imMbW24_ExpHC.csv", quote=FALSE, row.names=FALSE)

#No need to write headers since those stay the same

Perform Regressions

##########Parameters
#tag = "3sW0_ExpHC"
tag = "3sW24_ExpHC"

dataOut = paste("data/NetworkAnalysis/dataOut/models_", tag, "June2023.csv", sep="")



##########Begin processing

if (grepl("W0", tag))
{
  dd = read.csv("data/NetworkAnalysis/dataOut/imMbW0_ExpHC.csv")
  h = read.csv("data/NetworkAnalysis/dataOut/imMbW0_h.csv")
  colnames(dd) = h$x
} else
{
  dd = read.csv("data/NetworkAnalysis/dataOut/imMbW24_ExpHC.csv")
  h = read.csv("data/NetworkAnalysis/dataOut/imMbW24_h.csv")
  colnames(dd) = h$x
}

coCol = which(colnames(dd)=="Cohort")

#Rename this to Grp
colnames(dd)[coCol] = "Grp"


#read count column
rc = which(colnames(dd)=="reads")

#must know where in file which "ome" starts and stops.  These are column numbers

myOmes = c("im_", "mb_")

omeCols = lapply(myOmes, function(x) grep(paste("^", x, sep=""), colnames(dd)))

cc = combinations(n=length(myOmes),r=2)

#create function regress
regress = function(oA, oB, assays)
{
    #oA = o1
    #oB = o2
    for (h in oA)
    {   
        #print(h)
        #h = 6
        ana1 = colnames(dd)[h]
        #print(ana1)
        
        for (i in oB)
        {    
            #i=15
            #print(paste(h,i))
            ana2 = colnames(dd)[i]
            #print(paste(ana1,ana2, sep=","))
            #subset for complete cases.
            #not thin yet, so this works    
            dcc = dd[complete.cases(dd[, c(h,i,rc,coCol)]), c(h,i,rc,coCol) ]
                        
            #print(nrow(dcc))
            n=nrow(dcc)
            nNot0 = sum(dcc[,ana2] > 0)
            if(min(table(dcc$Grp)) > 10)
            {
                                        
                    #3 slope, readcount in model
                    m = lm(dcc[,ana1] ~ dcc$reads + dcc$Grp + dcc[,ana2]*dcc$Grp)
                    adjRsq = summary(m)$adj.r.squared
                    dffitsMax = max(abs(dffits(m)))
                    fParts = summary(m)$fstatistic
                     pF = pf(fParts[1], fParts[2], fParts[3], lower.tail=FALSE)
                    tt = summary(m)$coefficients
                    #cheap trick to catch bad model
                    if (nrow(tt) == 5)
                    {
                    
                          pRc = tt[2,4]
                      pExp = tt[4,4]
                      pHC = tt[4,4]
                      ExpSlope = tt[4,1]
                      HCSlope = tt[5,1] + ExpSlope
                      q13 = quantile(dcc[,ana2], c(.25,.75))
                      HCIqrE = HCSlope * (q13[2] - q13[1])
                      ExpIqrE = ExpSlope * (q13[2] - q13[1])                                                
                    
                      #append to output file
                          write(paste(assays, ana1, ana2, n, nNot0, adjRsq, pF,  pExp, pHC, ExpIqrE, HCIqrE, dffitsMax, pRc, sep=","), dataOut, append=TRUE)
                    }
                    
            }
        }   
    }
}


#Create header row for output file.
#Writing these results directly to file performs well, and sets stage for permutation testing.
#Should something go wrong in a long job, it is possible to restart without losing the data previously generated, since it's in the output file.
header = paste("assays", "ana1", "ana2", "n", "nNot0", "adjRsq", "pF",  "pExp", "pHC", "IqrExp", "IqrHC", "maxInfluence", "pRc", sep=",")

write(header, dataOut, append=FALSE)
#Add timer for general information
tm0 = proc.time()

for (k in 1:nrow(cc))
{

  #k = 2
  #print(paste("processing row ", k))
  #omeA = myOmes[1]
  oA = omeCols[[cc[k,1]]]
  #shorten array for testing purposes
  #oA = omeCols[[cc[k,1]]][1:10]

  oB = omeCols[[cc[k,2]]]
  #omeCols[[oA]]
  #omeCols[[oB]]
  assays = gsub("_", "", paste(myOmes[cc[k,1]], myOmes[cc[k,2]], sep="."))
  regress(oA, oB, assays)
}
proc.time() - tm0

Compute p-Value of HC Slope Different than 0 and Filter Results

#tag = "3sW0_ExpHC"
tag = "3sW24_ExpHC"


inFile = paste("data/NetworkAnalysis/dataOut/models_", tag, "June2023.csv", sep="")
outFile = paste("data/NetworkAnalysis/dataOut//models_", tag, "Contrasts.csv", sep="")


models = read.csv(inFile, stringsAsFactors=TRUE)


if (grepl("W0", tag))
{
    dd = read.csv("data/NetworkAnalysis/dataOut/imMbW0_ExpHC.csv")
    h = read.csv("data/NetworkAnalysis/dataOut/imMbW0_h.csv")
    colnames(dd) = h$x
} else
{
    dd = read.csv("data/NetworkAnalysis/dataOut/imMbW24_ExpHC.csv")
    h = read.csv("data/NetworkAnalysis/dataOut/imMbW24_h.csv")
    colnames(dd) = h$x
}

dd$Cohort = factor(dd$Cohort)

#nts:  doing contrasts this way corrects for the multiple comparisons by default
#below is contrast for slope of HC different than 0 (reference slope + HC offset)
#can specify method, including "none"
contr <- rbind("SlopeC2"=c(0,0,0,1,1))
               

models$pSHC=NA

for (i in 1:nrow(models))
{
    myRow = i
    #myRow = 5
    
    ana1 = as.character(models$ana1[myRow])
    ana2 = as.character(models$ana2[myRow])
    
    myCols = c("SampleID","Cohort",ana1,ana2,"reads")
    
    #Extract a subset of the data for building the model
    #Identify read column based on the ana2
    ss = as.data.frame(dd[,myCols])
    ssc = ss[complete.cases(ss),]
        
    #Rename the relevant columns to a generic "Ana1" and "Ana2"
    #This makes the regression model a little clearer than the subscript approach.  
    colnames(ssc)[3] = "Ana1"
    colnames(ssc)[4] = "Ana2"
        
    m = lm(Ana1 ~ reads + Cohort + Ana2*Cohort , data=ssc)
    test = glht(m, linfct=contr)
        
    pvals = summary(test, adjusted(type="none"))$test$pvalues
            
    models$pSHC[i] = pvals[1]
                
}


models$pFAdj = p.adjust(models$pF, method="fdr")
write.csv(models, outFile, quote=FALSE, row.names=FALSE)

models = read.csv(outFile)


#At least one of the slopes for Exp or HC is not 0, and the difference between slopes is not 0

dBest = droplevels(subset(models, pFAdj < 0.2 & maxInfluence < 2 & (pExp < 0.05 | pSHC < 0.05) & pHC < 0.05 & adjRsq > 0.2))
nrow(dBest)

sort(table(c(dBest$ana1, dBest$ana2)))

#save dBest
outFileBos = paste("../dataOut/bos_", tag, "Contrasts.csv", sep="")
write.csv(dBest, outFileBos, quote=FALSE, row.names=FALSE)

Differential effects of antiretroviral treatment on immunity and gut microbiome composition in people living with HIV in rural versus urban Zimbabwe

Import Needed Packages

Data Read In and Wrangle

Cross-sectional and Longitudinal Analysis of Immune Phenotype

Inter and Intra Cohort Immune Analyses

Fixed-effects Ordinary Least Squares (OLS) Linear Modeling

Cross-sectional and Longitudinal Analysis of Microbiome Diversity

Alpha Diversity

Get Alpha Diveristy Data

Get deltas

Plot

Beta Diversity

Function for replacing taxon labels

Biplots

Preparing the Data

Figure Creation

Dysbiosis and Delta Dysbiosis

Beta Data Read In and Wrangling

Plotting

Performing Adonis Tests

Differential Abundance Analysis

Get Data

Run ANCOMBC

View ANCOMBC results

Plot

Integrative Analysis of Immune Markers and Microbiome

Building Predictive Models for Immune Markers

Data Preprocessing

Feature Selection

Model Building & Testing

Plotting

Network Analysis of Immune and Microbial Associations

Format Mb Data

Combine Immune and Microbiome data

Derive and Filter Immune:Microbe regressions

Create Network Diagram

Derive and Filter Immune:Microbe Regressions, Exp v HC

Create Subsets of Data for Exp and HC only

Perform Regressions

Compute p-Value of HC Slope Different than 0 and Filter Results