Import Needed Packages

library(readxl)             # version 1.4.2
library(dplyr)              # version 1.1.2
library(lubridate)          # version 1.9.2
library(RColorBrewer)       # version 1.1-3
library(FSA)                # version 0.9.4
library(ggh4x)              # version 0.2.4
library(fixest)             # version 0.11.1  
library(ggplot2)            # version 3.4.2
library(qiime2R)            # version 0.99.6
library(cowplot)            # version 1.1.1
library(readr)              # version 2.1.4
library(ggpubr)             # version 0.6.0
library(ggrepel)            # version 0.9.3
library(vegan)              # version 2.6-4
library(ANCOMBC)            # version 2.0.2 
library(phyloseq)           # version 1.42.0
library(tidyr)              # version 1.3.0
library(microViz)           # version 0.10.8
library(jsonlite)           # version 1.8.4
library(MicrobiotaProcess)  # version 1.12.3
library(gtools)             # version 3.9.4
library(igraph)             # version 1.4.2
library(stringr)            # version 1.5.0
library(multcomp)           # version 1.4-25

set.seed(5)

Data Read In and Wrangle

Metadata281 <- read_excel("data/Metadata281.xlsx")

# Rename some columns 
colnames(Metadata281)[colnames(Metadata281) == 'CD4+_count'] <- 'CD4_count'
colnames(Metadata281)[colnames(Metadata281) == 'Cohort'] <- 'Cohort_Description'
colnames(Metadata281)[colnames(Metadata281) == 'HIV-1_viral_load'] <- 'HIV1_viral_load'
colnames(Metadata281)[colnames(Metadata281) == 'IL-6_pg_mL'] <- 'IL6_pg_mL'

# Create viral outcome column
Metadata281$HIV1_viral_load[Metadata281$HIV1_viral_load == 'ND'] <- 0
Metadata281$HIV1_viral_load <- as.numeric(Metadata281$HIV1_viral_load)
Metadata281$HIV1_viral_outcome <- 'Suppressed'

Metadata281$HIV1_viral_outcome[Metadata281$Cohort_Short == 'A' & 
                                Metadata281$Visit == 3 &
                                Metadata281$HIV1_viral_load > 200] <- 'Viremic'
Metadata281$HIV1_viral_outcome[Metadata281$Cohort_Short == 'B' & 
                                Metadata281$Visit == 2 &
                                Metadata281$HIV1_viral_load > 200] <- 'Viremic'
Metadata281$HIV1_viral_outcome <- as.factor(Metadata281$HIV1_viral_outcome)

# Calculate length on ART column 
Metadata281$ARTStart[Metadata281$Cohort_Short == "B"] <- Metadata281$ART_Start_Date[Metadata281$Cohort_Short == "B"]
Metadata281$ARTStart <- as.Date(Metadata281$ARTStart, "%Y") 
lubridate::day(Metadata281$ARTStart) <- 1
lubridate::month(Metadata281$ARTStart) <- 6
Metadata281$Visit_date <- as.Date(Metadata281$Visit_date, "%m/%d/%y")

Metadata281$LenghtOnArt_yrs[Metadata281$Cohort_Short == "A"] <- 0.5
Metadata281$LenghtOnArt_yrs[Metadata281$Cohort_Short == "C"] <- 0.0
Metadata281$LenghtOnArt_yrs[Metadata281$Cohort_Short == "B"] <- time_length(Metadata281$Visit_date[Metadata281$Cohort_Short == "B"] - Metadata281$ARTStart[Metadata281$Cohort_Short == "B"], "years")

# Make cleaner columns for visualizations
Metadata281$Location[Metadata281$Arm_Short == "MA"] <- "Urban"
Metadata281$Location[Metadata281$Arm_Short == "MT"] <- "Rural"
Metadata281$Cohort[Metadata281$Cohort_Short == "A"] <- "Naïve"
Metadata281$Cohort[Metadata281$Cohort_Short == "B"] <- "Exp"
Metadata281$Cohort[Metadata281$Cohort_Short == "C"] <- "HC"
Metadata281$Week[Metadata281$Visit == 2] <- 0
Metadata281$Week[Metadata281$Visit == 3] <- 24
Metadata281$Location <- as.factor(Metadata281$Location)
Metadata281$Cohort <- as.factor(Metadata281$Cohort)
Metadata281$Cohort <- ordered(Metadata281$Cohort, levels = c("Naïve", "Exp", "HC"))
Metadata281$Week <- as.factor(Metadata281$Week)

# make correct factored variables 
fact_cols <- c("Gender", "HIV_Status", "Bristol_Score", "Water_Source_Num", 
               "Cotrimoxazole_HIV", "Manual_work_job", "Manual_chores_home",
               "Head_of_household", "Education_Level", "Normal_work_transportation")
Metadata281[fact_cols] <- lapply(Metadata281[fact_cols], factor) 
Metadata281$BMI_Cat <- ordered(Metadata281$BMI_Cat, levels = c("Severe_thinness", "Moderate_thinnes", "Mild_thinness", "Normal", "Overweight", "Obese"))

# make sure immune markers are numeric 
immune_markers <- c("CD4_Percent",
                    "CD4_HLADRpos_CD38pos_Percent", 
                    "CD8_HLADRpos_CD38pos_Percent", 
                    "CD4_PD1_Percent", 
                    "CD8_PD1_Percent",
                    "CD4_CD103_Percent",
                    "CD8_CD103_Percent",
                    "IL6_pg_mL", 
                    "CRP_mg_L")
for (marker in immune_markers){
  Metadata281[[marker]] <- as.numeric(Metadata281[[marker]])
}

SupplementalFig3 <- Metadata281

# Impute means for immune markers where needed 

for(marker in immune_markers){ 
  Metadata281[[marker]][is.na(Metadata281[[marker]])] <- mean(Metadata281[[marker]], na.rm = TRUE)
}

# Make sure to remove sample with no immune data 
Metadata281 <- Metadata281 %>% 
  filter(SampleID != "ZIM033.3")

# Keep only samples with two visits  
Metadata281 <- Metadata281[Metadata281$PID %in% Metadata281$PID[duplicated(Metadata281$PID)],]

# Remove patients that were viremic 
vir_PID <- Metadata281 %>%  
  filter(HIV1_viral_outcome == "Viremic") %>% 
  dplyr::select(SampleID, PID, Cohort_Short, HIV1_viral_outcome)
PIDs <- unique(vir_PID$PID)
Metadata281$HIV1_viral_outcome[Metadata281$PID %in% PIDs] <- 'Viremic'

# get figure 1 inter data 
inter_fig1 <- Metadata281 %>% 
  filter(!(HIV1_viral_outcome == "Viremic" & Cohort_Short == "B")) %>% 
  filter(Visit == 2)

# remove all viremmic patients from metadata
Metadata281 <- subset(Metadata281, !(PID %in% PIDs))

# Get dataframe of just baseline measurements 
baseline <- Metadata281 %>% filter(Week == 0)

# Creating a color palettes
visitColors <- palette(brewer.pal(n = 5, name = "BrBG")) 
cohortColors <- palette(brewer.pal(n = 5, name = "PiYG"))

Cross-sectional and Longitudinal Analysis of Immune Phenotype

Inter and Intra Cohort Immune Analyses

# Figure 1 

sig_df <- data.frame(matrix(ncol = 5, nrow = 0))
colnames(sig_df) <- c("marker", "comparison", "pval", "direction", "patients")

categories <- c("All", "Urban", "Rural")
markers <- c("CD4_HLADRpos_CD38pos_Percent", 
             "CD8_HLADRpos_CD38pos_Percent", 
              "CD4_PD1_Percent", 
              "CD8_PD1_Percent",
              "CD4_CD103_Percent",
              "CD8_CD103_Percent",
              "IL6_pg_mL", 
              "CRP_mg_L")

for (category in categories){
  if(category != "All"){
    heatmap_df <- Metadata281 %>% filter(Location == category)
    heatmap_baseline <- inter_fig1 %>% filter(Location == category)
  }else{
    heatmap_df <- Metadata281
    heatmap_baseline <- inter_fig1
  }

  for (marker in markers){
  patients <- category
  
  # inter cohort tests 
  dt <- dunnTest(heatmap_baseline[[marker]], heatmap_baseline$Cohort_Short, method = "bonferroni")
  c1 <- "Naïve - Exp"
  c2 <- "Naïve - HC"
  c3 <- "Exp - HC"
  p1 <- dt$res$P.adj[1]
  p2 <- dt$res$P.adj[2]
  p3 <- dt$res$P.adj[3]
  
  mean <- heatmap_baseline %>% 
    group_by(Cohort_Short) %>%  
    summarise(avg = mean(eval(as.name(marker))))
  
  avgA <- mean[mean$Cohort_Short == "A",2]$avg
  avgB <- mean[mean$Cohort_Short == "B",2]$avg
  avgC <- mean[mean$Cohort_Short == "C",2]$avg
  
  diffAB <- avgA - avgB
  diffAC <- avgA - avgC
  diffBC <- avgB - avgC 
  
  r1 <- c(marker, c1, p1, diffAB, patients)
  r2 <- c(marker, c2, p2, diffAC, patients)
  r3 <- c(marker, c3, p3, diffBC, patients)
  
  sig_df[nrow(sig_df)+1, ] <- r1
  sig_df[nrow(sig_df)+1, ] <- r2
  sig_df[nrow(sig_df)+1, ] <- r3  
  
  # intra cohort tests 
  sub <- heatmap_df %>% filter(Cohort_Short == "A")
  comparison <- "Naïve: 0 - 24"
  t <- wilcox.test(sub[[marker]] ~ Visit, data = sub, paired = T)
  mean <- sub %>% 
    group_by(Visit) %>%  
    summarise(avg = mean(eval(as.name(marker))))
  avg1 <- mean[mean$Visit == 2,2]$avg
  avg2 <- mean[mean$Visit ==3,2]$avg
  mean_diff <- avg1 - avg2
  r <- c(marker, comparison, t$p.value, mean_diff, patients)
  sig_df[nrow(sig_df)+1, ] <- r
  
  sub <- heatmap_df %>% filter(Cohort_Short == "B")
  comparison <- "Exp: 0 - 24"
  t <- wilcox.test(sub[[marker]] ~ Visit, data = sub, paired = T)
  mean <- sub %>% 
    group_by(Visit) %>%  
    summarise(avg = mean(eval(as.name(marker))))
  avg1 <- mean[mean$Visit == 2,2]$avg
  avg2 <- mean[mean$Visit ==3,2]$avg
  mean_diff <- avg1 - avg2
  r <- c(marker, comparison, t$p.value, mean_diff, patients)
  sig_df[nrow(sig_df)+1, ] <- r
  
  sub <- heatmap_df %>% filter(Cohort_Short == "C")
  comparison <- "HC: 0 - 24"
  t <- wilcox.test(sub[[marker]] ~ Visit, data = sub, paired = T)
  mean <- sub %>% 
    group_by(Visit) %>%  
    summarise(avg = mean(eval(as.name(marker))))
  avg1 <- mean[mean$Visit == 2,2]$avg
  avg2 <- mean[mean$Visit ==3,2]$avg
  mean_diff <- avg1 - avg2
  r <- c(marker, comparison, t$p.value, mean_diff, patients)
  sig_df[nrow(sig_df)+1, ] <- r
  }
}

sig_df$direction <- as.numeric(sig_df$direction)
sig_df$pval <- as.numeric(sig_df$pval)
sig_df$patients <- as.factor(sig_df$patients)

sig_df$Significance <- 'NS'
sig_df$Significance[sig_df$pval <= 0.05] <- '*'
sig_df$Significance[sig_df$pval <= 0.01] <- '**'
sig_df$Significance[sig_df$pval <= 0.001] <- '***'
sig_df$Significance[sig_df$pval <= 0.0001] <- '****'
sig_df$Significance <- as.factor(sig_df$Significance)

sig_df$direction <- sig_df$direction * -1
sig_df$direction[sig_df$pval > 0.05] <- NA

markers_labels <- c("CD4+CD38+HLA-DR+ (%)", 
                    "CD8+CD38+HLA-DR+ (%)", 
                    "CD4+PD1+ (%)", 
                    "CD8+PD1+ (%)",
                    "CD4+CD103+ (%)",
                    "CD8+CD103+ (%)",
                    "IL-6 (pg/mL)", 
                    "CRP (mg/L)")
for (i in 1:length(markers)){
  sig_df$marker[sig_df$marker == markers[i]] <- markers_labels[i]
}
sig_df$category[sig_df$comparison == "Naïve - Exp" |
                sig_df$comparison == "Naïve - HC"  |
                sig_df$comparison == "Exp - HC"] <- "Inter"
sig_df$category[is.na(sig_df$category)] <- "Intra"


comp <- c("Naïve - Exp","Naïve - HC","Exp - HC", "Naïve: 0 - 24", "Exp: 0 - 24", "HC: 0 - 24")
ggplot(sig_df, aes(x = factor(comparison, levels = comp), 
                   y = factor(marker, levels = markers_labels))) + 
  geom_raster(aes(fill=direction))+  
  scale_fill_gradient2(high = "red2", mid = "white", low = "deepskyblue3", na.value = "lightgrey", midpoint = 0,
                       limits = c(-25, 25))+
  geom_text(aes(label=Significance), size = 6) + 
  theme_bw() + 
  xlab("") + 
  ylab("") +
  theme(text = element_text(size = 18), 
        plot.title = element_text(hjust = 0.5), 
        axis.text.x = element_text(angle = 45, vjust = 1, hjust=1), 
        legend.key = element_rect(colour = "black", size = 1), 
        legend.title = element_text(size = 14),
        legend.text=element_text(size=12)) + 
  guides(fill=guide_legend(title="Percent \nDelta", reverse = T)) + 
  facet_nested(~patients + category, scales = "free_x")

Fixed-effects Ordinary Least Squares (OLS) Linear Modeling

# Supplemental Figure S3 but not stratified by location 
SupplementalFig3$Cohort_Short <- factor(SupplementalFig3$Cohort_Short, levels = c("C", "A", "B")) 
SupplementalFig3$Visit <- as.factor(SupplementalFig3$Visit)

linear_methods_df <- as.data.frame(matrix(ncol = 4, nrow = 0))
colnames(linear_methods_df) <- c('ImmuneMarker', 'ExplanatoryVariable', 'Coefficient', 'pvalue')

explanatory_vars <- c('Naïve vs HC', 'Exp vs HC', 'COT (HC)', 
                      'COT (Naïve, \nrelative to HC)', 'COT (Exp, \nrelative to HC)')

for (marker in immune_markers) {
  # Model M1
  fm <- as.formula(paste0('rank(', marker,') ~ Cohort_Short + Visit + Cohort_Short*Visit'))
  results <- feols(fm, data = SupplementalFig3, cluster = ~PID)
  r1 <- c(marker, explanatory_vars[1], results$coeftable[2,1], round(results$coeftable[2,4], 2))
  r2 <- c(marker, explanatory_vars[2], results$coeftable[3,1], round(results$coeftable[3,4], 2))
  r3 <- c(marker, explanatory_vars[3], results$coeftable[4,1], round(results$coeftable[4,4], 2))
  r4 <- c(marker, explanatory_vars[4], results$coeftable[5,1], round(results$coeftable[5,4], 2))
  r5 <- c(marker, explanatory_vars[5], results$coeftable[6,1], round(results$coeftable[6,4], 2))
  
  linear_methods_df[nrow(linear_methods_df)+1, ] <- r1
  linear_methods_df[nrow(linear_methods_df)+1, ] <- r2
  linear_methods_df[nrow(linear_methods_df)+1, ] <- r3
  linear_methods_df[nrow(linear_methods_df)+1, ] <- r4
  linear_methods_df[nrow(linear_methods_df)+1, ] <- r5
}

linear_methods_df$Coefficient <- as.numeric(linear_methods_df$Coefficient)
linear_methods_df$pvalue <- as.numeric(linear_methods_df$pvalue)
linear_methods_df$Coefficient[linear_methods_df$pvalue > 0.05] <- NA
linear_methods_df$Significance <- 'NS'
linear_methods_df$Significance[linear_methods_df$pvalue <= 0.05] <- '*'
linear_methods_df$Significance[linear_methods_df$pvalue <= 0.01] <- '**'
linear_methods_df$Significance[linear_methods_df$pvalue <= 0.001] <- '***'
linear_methods_df$Significance[linear_methods_df$pvalue <= 0.0001] <- '****'
linear_methods_df$Significance <- as.factor(linear_methods_df$Significance)

markers_labels_1 <- c("CD4+ (%)",
                      "CD4+CD38+HLA-DR+ (%)", 
                      "CD8+CD38+HLA-DR+ (%)", 
                      "CD4+PD1+ (%)", 
                      "CD8+PD1+ (%)",
                      "CD4+CD103+ (%)",
                      "CD8+CD103+ (%)",
                      "IL-6 (pg/mL)", 
                      "CRP (mg/L)")
for (i in 1:length(immune_markers)){
  linear_methods_df$ImmuneMarker[linear_methods_df$ImmuneMarker == immune_markers[i]] <- markers_labels_1[i]
}

ggplot(linear_methods_df, aes(x = factor(ExplanatoryVariable, levels = explanatory_vars), 
                   y = factor(ImmuneMarker, levels = markers_labels_1))) + 
  geom_raster(aes(fill=Coefficient))+  
  scale_fill_gradient2(low = "deepskyblue3", mid = "white", high = "red2", na.value = "lightgrey", midpoint = 0, 
                       limits = c(-105, 105))+
  geom_text(aes(label=Significance), size = 6) + 
  theme_bw() + 
  xlab("Predictor") + 
  ylab("Immune Marker") +
  theme(text = element_text(size = 18), 
        plot.title = element_text(hjust = 0.5), 
        axis.text.x = element_text(angle = 45, vjust = 1, hjust=1), 
        legend.key = element_rect(colour = "black", size = 1), 
        legend.title = element_text(size = 14),
        legend.text=element_text(size=12)) + 
  guides(fill=guide_legend(title="Coefficient", reverse = T)) 

Cross-sectional and Longitudinal Analysis of Microbiome Diversity

Alpha Diversity

Get Alpha Diveristy Data

shannon_df <- qiime2R::read_qza("data/DiversityData/shannon_vector.qza")
shannon_df <- shannon_df$data
shannon_df <- tibble::rownames_to_column(shannon_df, "SampleID")
Metadata281 <- Metadata281 %>% 
  dplyr::left_join(shannon_df, by = "SampleID")
shannon_baseline <- Metadata281 %>% filter(Visit == 2)

Get deltas

v2 <- Metadata281 %>% filter(Visit == 2) %>% dplyr::select(PID, shannon_entropy, Cohort, Location)
v3 <- Metadata281 %>% filter(Visit == 3) %>% dplyr::select(PID, shannon_entropy)
delta_df <- dplyr::full_join(v2, v3, by = "PID")

delta_df<- delta_df %>%  
  mutate(delta = shannon_entropy.y - shannon_entropy.x)

Plot

# All statistical calculations were done separately, see Methods 

ann <- data.frame(lx = c(2, 2, 2.5, 2.5, 2.5,2.5),
                  ly = c(8, 8, 7.5, 7.5, 7.5, 7.5),
                  lab = c("*", "____________", "*", "______", "*", "______"),
                  Location = c("Rural", "Rural", "Rural", "Rural", "Urban", "Urban"))
A <- ggplot(shannon_baseline, aes(x = Cohort, y = shannon_entropy)) + 
  geom_boxplot(show.legend = F, 
               coef = 6, 
               alpha = 0.5, aes(fill = Cohort)) + 
  scale_fill_manual(values = c(cohortColors[1],
                               cohortColors[3],
                               cohortColors[5])) + 
  geom_point(show.legend = F,  
             position = position_jitterdodge(0.2), 
             aes(color = Cohort)) + 
  scale_color_manual(values = c("black",
                                 "black",
                                 "black")) +  
  theme_bw() +
  theme(text = element_text(size = 18)) +
  xlab("Cohort") + 
  ylab("Shannon Entropy") + 
  facet_grid(~Location) + 
  geom_text(data = ann, aes(x = lx, y = ly, label = lab), size = 6)

ann <- data.frame(lx = c(1, 1),
                  ly = c(7.5, 7.5),
                  lab = c("*", "_____"),
                  Location = c("Rural", "Rural") )
B <- ggplot(Metadata281, aes(x = Cohort, y = shannon_entropy)) + 
  geom_boxplot(show.legend = T,coef = 6, alpha = 0.5, aes(fill = Week)) + 
  geom_point(show.legend = F, aes(color = Week), position = position_jitterdodge(0.2)) +
  scale_fill_manual(values = c(visitColors[1], visitColors[5])) + 
  scale_color_manual(values = c(visitColors[1], visitColors[5])) + 
  theme_bw() +
  theme(text = element_text(size = 18)) +
  facet_grid(~Location)+
  xlab("Cohort") + 
  ylab("") + 
  geom_text(data = ann, aes(x = lx, y = ly, label = lab), size = 6)

# Model M2
C <- ggplot(delta_df, aes(x = shannon_entropy.x, y = delta, color = Cohort)) +
  geom_point() +
  scale_color_manual(values = c(cohortColors[1],
                               "gray65",
                               cohortColors[5])) +
  geom_smooth(method = "lm", fill = NA)+ 
  theme_bw() + 
  theme(text = element_text(size = 18)) +
  xlab("Baseline Shannon Entropy") + 
  ylab("Delta Shannon Entropy \n (Wk24 - Wk0)") + 
  stat_cor(label.x = 5)+
  ylim(-2,3)

cowplot::plot_grid(A, B, C,
          labels = c("A", "B", "C"),
          ncol = 3, nrow = 1 ,
          rel_widths = c(0.7, 1, 1))

Beta Diversity

Function for replacing taxon labels

# takes a row containing taxon names and returns last two known levels
# e.g. d__Eukaryota;p__Incertae_Sedis;c__Incertae_Sedis;o__Incertae_Sedis;f__Incertae_Sedis;g__Blastocystis
# to f__Prevotellaceae; g__Prevotella
getNiceTaxon <- function(rowVector){
  newVector <- c()
  for(rowValue in rowVector){
    if(substr(rowValue, 1, 6) != "module"){
      newName <- tail(strsplit(rowValue, split = ";")[[1]], 2)
      newName <- paste0(newName[1], ";", newName[2])
      newVector <- c(newVector, c(newName))
    }else{
      newVector <- c(newVector, c(rowValue))
    }
  }
  return(newVector)
}

Biplots

Preparing the Data

# biom table 
biom <- qiime2R::read_qza("data/BiplotsData/tax_filtered_281.qza")
biom <- as_tibble(biom$data, rownames = 'ASV')

# subset of metadata
sub_meta <- readr::read_delim("data/subset_Metadata281.tsv", 
                           delim = "\t", 
                           escape_double = FALSE, 
                           col_types = cols(Location = col_factor(levels = c("Rural","Urban")), 
                                            Week = col_factor(levels = c("0","24")), 
                                            HIV1_viral_outcome = col_factor(levels = c("Suppressed", 
                                                                                       "Viremic"))), 
                           trim_ws = TRUE)

# Weighted UniFrac PCoA
w_uni <- qiime2R::read_qza("data/DiversityData/weighted_unifrac_pcoa_results.qza")$data
w_uni_pcoa <- as_tibble(w_uni$Vectors)

# get proportions explained
w_uni_prop <- w_uni$ProportionExplained[1:3]

# get biplot from qiime 
q_biplot <- qiime2R::read_qza("data/BiplotsData/biplot.qza")
q_biplotSpecies <- as_tibble(q_biplot$data$Species)
q_biplotVectors <- as_tibble(q_biplot$data$Vectors)

# get taxonomy and replace with better labels 
tax <- qiime2R::read_qza("data/BiplotsData/classification.qza")
tax <- tax$data 
colnames(tax)[1] <- "FeatureID"

q_biplotSpecies <- q_biplotSpecies %>% 
  dplyr::left_join(tax, by = "FeatureID") %>% 
  dplyr::select(Taxon, PC1, PC2, PC3)
colnames(q_biplotSpecies)[1] <- "FeatureID"

q_biplotSpecies %>% 
  dplyr::mutate(FeatureID = getNiceTaxon(FeatureID)) -> abbrBiplotSpecies

# prepare sample data 
w_uni_pcoa <- w_uni_pcoa %>% 
  dplyr::select(SampleID, PC1, PC2, PC3) %>% 
  dplyr::left_join(sub_meta, by = "SampleID")

w_uni_pcoa$Cohort <- as.factor(w_uni_pcoa$Cohort)
w_uni_pcoa$Cohort <- ordered(w_uni_pcoa$Cohort, levels = c("Naïve", "Exp", "HC"))

Figure Creation

# prep axis labels 
pc1_lab <- paste0('PC1 (', round(w_uni_prop[1],3)*100, '%)')
pc2_lab <- paste0('PC2 (', round(w_uni_prop[2],3)*100, '%)')
pc3_lab <- paste0('PC3 (', round(w_uni_prop[3],3)*100, '%)')

# create base plot with arrows for first plot 
abbrBiplotSpecies %>% 
  mutate(a=sqrt(PC1^2+PC2^2)) %>% # calculate the distance from the origin
  top_n(4, a) %>% #keep 8 furthest away points
  mutate(PC1=PC1*5, PC2=PC2*5) -> abbrBiplotSpeciesArrows # scale arrows linearly

new_lables <- c()
for (row in 1:nrow(abbrBiplotSpeciesArrows)) {
  tax_name <- abbrBiplotSpeciesArrows$FeatureID[row]
  genus <- tail(strsplit(tax_name, split = ";")[[1]], n=1)
  new_lables <- c(new_lables, genus)
}

abbrBiplotSpeciesArrows <- cbind(abbrBiplotSpeciesArrows, new_lables)

A <- ggplot(w_uni_pcoa, aes(x = PC1, y = PC2, color = Cohort)) + 
  geom_point(show.legend = F, size = 5 ,alpha = 0.6) + 
  scale_color_manual(values = c(cohortColors[1],
                               "grey65",
                               cohortColors[5])) + 
  theme_bw(base_size = 18) + 
  xlab(pc1_lab) +
  ylab(pc2_lab) + 
  geom_segment(data = abbrBiplotSpeciesArrows, aes(x=0, xend=PC1, y=0, yend=PC2),
               arrow = arrow(length = unit(0.3,"cm")),color = 'black')+
  geom_text_repel(data = abbrBiplotSpeciesArrows, aes(x=PC1, y=PC2, label=new_lables),size=7, color = 'black')




# second plot 
abbrBiplotSpecies %>% 
  mutate(a=sqrt(PC1^2+PC3^2)) %>% # calculate the distance from the origin
  top_n(4, a) %>% #keep 8 furthest away points
  mutate(PC1=PC1*5, PC3=PC3*5) -> abbrBiplotSpeciesArrows # scale arrows linearly

new_lables <- c()
for (row in 1:nrow(abbrBiplotSpeciesArrows)) {
  tax_name <- abbrBiplotSpeciesArrows$FeatureID[row]
  genus <- tail(strsplit(tax_name, split = ";")[[1]], n=1)
  new_lables <- c(new_lables, genus)
}

abbrBiplotSpeciesArrows <- cbind(abbrBiplotSpeciesArrows, new_lables)

B <- ggplot(w_uni_pcoa, aes(x = PC1, y = PC3, color = Cohort)) + 
  geom_point(size = 5, alpha = 0.6 ) + 
  scale_color_manual(values = c(cohortColors[1],
                                "grey65",
                                cohortColors[5])) + 
  theme_bw(base_size = 18) + 
  xlab(pc1_lab) +
  ylab(pc3_lab) + 
  geom_segment(data = abbrBiplotSpeciesArrows, aes(x=0, xend=PC1, y=0, yend=PC3),
               arrow = arrow(length = unit(0.3,"cm")),color = 'black')+
  geom_text_repel(data = abbrBiplotSpeciesArrows, aes(x=PC1, y=PC3, label=new_lables),size=7, color = 'black')

cowplot::plot_grid(A, B, 
                   labels = c("A", " "), 
                   ncol = 2, nrow = 1)

Dysbiosis and Delta Dysbiosis

Beta Data Read In and Wrangling

beta_df <- data.frame(matrix(ncol = 2, nrow = 0))
colnames(beta_df) <- c("SampleID", "Dysbiosis") 

distance_matrix<- qiime2R::read_qza("data/DiversityData/weighted_unifrac_distance_matrix.qza")
distance_matrix <- distance_matrix$data %>% as.matrix()
distance_matrix[upper.tri(distance_matrix)] <- NA
diag(distance_matrix) <- NA
distance_matrix <- distance_matrix %>% as.data.frame()
distance_matrix <- tibble::rownames_to_column(distance_matrix, "SampleID")

# calculate average distance from PLWH to healthy controls for samples at 
# each time point 
weeks <- c(0, 24)
for (current_week in weeks){
  
  sampleIDs <- Metadata281 %>% filter(Cohort_Short != 'C' & Week == current_week)
  AB_SIDs <- sampleIDs$SampleID
  sampleIDs <- Metadata281 %>% filter(Cohort_Short == 'C'& Week == current_week)
  C_SIDs <- sampleIDs$SampleID
  
  dysbiosis_matrix <- distance_matrix[distance_matrix$SampleID %in% C_SIDs, ]
  
  for (sample in AB_SIDs) {
    avg <- mean(dysbiosis_matrix[[sample]], na.rm = T)
    beta_df[nrow(beta_df)+1, ] <- c(sample, avg)
  }
}

# remove NAs 
beta_df <- beta_df[!is.na(beta_df$Dysbiosis), ]
beta_df <- beta_df %>% filter(Dysbiosis != "NaN")

# create subset and baseline 
meta_sub <- Metadata281 %>% dplyr::select(SampleID, Cohort, Week, PID, Location)
beta_df <- dplyr::left_join(beta_df, meta_sub, by = "SampleID")
beta_df$Dysbiosis <- as.numeric(beta_df$Dysbiosis)
baseline_beta <- beta_df %>% 
  filter(Week == 0) %>% 
  dplyr::select(PID, Dysbiosis)

# calculate deltas
bv3 <- beta_df %>% filter(Week == 24) %>% dplyr::select(PID, Dysbiosis, Cohort, Location)
beta_delta <- dplyr::left_join(baseline_beta, bv3, by = "PID")
beta_delta <- beta_delta %>% mutate(Delta_Dysbiosis = Dysbiosis.y - Dysbiosis.x)

Plotting

# Model M3
B <- ggplot(na.omit(beta_delta), aes(x = Dysbiosis.x, y = Delta_Dysbiosis, color = Cohort)) +
  geom_point() +
  geom_smooth(method = "lm", fill = NA)+ 
  scale_color_manual(values = c(cohortColors[1],
                                "gray65")) +
  theme_bw() + 
  theme(text = element_text(size = 18)) +
  xlab("Baseline Dysbiosis") + 
  ylab("Delta Dysbiosis \n (Wk24 - Wk0)") + 
  stat_cor(label.x = 0.5) 


C <- ggplot(na.omit(beta_delta), aes(x = Dysbiosis.x, y = Delta_Dysbiosis, color = Location)) +
  geom_point() +
  geom_smooth(method = "lm", fill = NA)+ 
  theme_bw() + 
  theme(text = element_text(size = 18)) +
  xlab("Baseline Dysbiosis") + 
  ylab(" ") + 
  stat_cor(label.x = 0.6)

cowplot::plot_grid(B, C, 
                   labels = c("B", "C"), 
                   ncol = 2, nrow = 1)

Performing Adonis Tests

# reload metadata to retain as many samples as possible 
Metadata281_full <- read_excel("data/Metadata281.xlsx")

Metadata281_full <- subset(Metadata281_full, Metadata281_full$SampleID != "ZIM026.2" &
                         Metadata281_full$SampleID != "ZIM026.3" &
                         Metadata281_full$SampleID != "ZIM008.3" & 
                         Metadata281_full$SampleID != "ZIM062.3")


Metadata281_full$Location[Metadata281_full$Arm_Short == "MA"] <- "Urban"
Metadata281_full$Location[Metadata281_full$Arm_Short == "MT"] <- "Rural"
Metadata281_full$Cohort[Metadata281_full$Cohort_Short == "A"] <- "Naïve"
Metadata281_full$Cohort[Metadata281_full$Cohort_Short == "B"] <- "Exp"
Metadata281_full$Cohort[Metadata281_full$Cohort_Short == "C"] <- "HC"
Metadata281_full$Week[Metadata281_full$Visit == 2] <- 0
Metadata281_full$Week[Metadata281_full$Visit == 3] <- 24
Metadata281_full$Cohort <- factor(Metadata281_full$Cohort, levels = c("Naïve", "Exp", "HC"))

Metadata281_full$Treatment_stat[Metadata281_full$Cohort_Short == "A" & Metadata281_full$Visit == 2] <- "untreated"
Metadata281_full$Treatment_stat[Metadata281_full$Cohort_Short == "A" & Metadata281_full$Visit == 3] <- "treated"
Metadata281_full$Treatment_stat[Metadata281_full$Cohort_Short == "B"] <- "treated"
Metadata281_full$Treatment_stat[Metadata281_full$Cohort_Short == "C"] <- "untreated"

# read in weighted unifrac distance matrix 
distance_matrix<- qiime2R::read_qza("data/DiversityData/weighted_unifrac_distance_matrix.qza")
distance_matrix <- distance_matrix$data %>% as.matrix()
distance_matrix <- distance_matrix %>% as.data.frame()
distance_matrix <- tibble::rownames_to_column(distance_matrix, "SampleID")
indices <- order(distance_matrix$SampleID)
distance_matrix <- distance_matrix[,2:ncol(distance_matrix)]
distance_matrix <- distance_matrix[indices, indices]

adonis_data <- Metadata281_full[Metadata281_full$SampleID %in% colnames(distance_matrix),]

# make sure they are in the same sample order
adonis_data$SampleID == colnames(distance_matrix)
##   [1] TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE
##  [16] TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE
##  [31] TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE
##  [46] TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE
##  [61] TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE
##  [76] TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE
##  [91] TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE
## [106] TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE
## [121] TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE
## [136] TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE
## [151] TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE
## [166] TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE
## [181] TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE
## [196] TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE
## [211] TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE
## [226] TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE
## [241] TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE
## [256] TRUE TRUE TRUE TRUE
# Model M4
adonis2(as.dist(distance_matrix) ~ adonis_data$Location + 
          adonis_data$Location*adonis_data$Week, 
        strata = adonis_data$PID)
## Permutation test for adonis under reduced model
## Terms added sequentially (first to last)
## Blocks:  strata 
## Permutation: free
## Number of permutations: 999
## 
## adonis2(formula = as.dist(distance_matrix) ~ adonis_data$Location + adonis_data$Location * adonis_data$Week, strata = adonis_data$PID)
##                                        Df SumOfSqs      R2      F Pr(>F)   
## adonis_data$Location                    1    2.870 0.02181 5.7291  0.029 * 
## adonis_data$Week                        1    0.853 0.00648 1.7034  0.010 **
## adonis_data$Location:adonis_data$Week   1    0.152 0.00115 0.3034  0.795   
## Residual                              255  127.734 0.97056                 
## Total                                 258  131.610 1.00000                 
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
# Model M5
adonis2(as.dist(distance_matrix) ~ adonis_data$Visit_Age + 
          adonis_data$HIV_Status + 
          adonis_data$Treatment_stat, 
        strata = adonis_data$PID)
## Permutation test for adonis under reduced model
## Terms added sequentially (first to last)
## Blocks:  strata 
## Permutation: free
## Number of permutations: 999
## 
## adonis2(formula = as.dist(distance_matrix) ~ adonis_data$Visit_Age + adonis_data$HIV_Status + adonis_data$Treatment_stat, strata = adonis_data$PID)
##                             Df SumOfSqs      R2      F Pr(>F)  
## adonis_data$Visit_Age        1    2.721 0.02067 5.4688  0.086 .
## adonis_data$HIV_Status       1    1.045 0.00794 2.1004  0.114  
## adonis_data$Treatment_stat   1    0.991 0.00753 1.9928  0.051 .
## Residual                   255  126.853 0.96386                
## Total                      258  131.610 1.00000                
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1

Differential Abundance Analysis

Get Data

# create phyloseq object 
pseq <- qiime2R::qza_to_phyloseq(features = "data/ANCOMBCData/tax_filtered_281.qza", 
                                 taxonomy = "data/ANCOMBCData/classification.qza", 
                                 metadata = "data/subset_Metadata281.tsv", 
                                 tree = "data/ANCOMBCData/tree_Zim.qza")

# Change factored variable in metadata 
meta <- pseq %>% sam_data() %>% as.data.frame()
meta$Cohort <- factor(meta$Cohort, levels = c("HC", "Naïve", "Exp"))
sample_data(pseq) <- sample_data(meta)

# get baseline 
pseq.baseline <- pseq
baseline <- pseq %>% sample_data() %>% as.data.frame()
baseline <- baseline[baseline$Week == 0]
sample_data(pseq.baseline) <- sample_data(baseline)

# only look at cohort A
pseq.A <- pseq
cohortA <- pseq %>% sample_data() %>%  as.data.frame()
cohortA <- cohortA[cohortA$Cohort == "Naïve"]
sample_data(pseq.A) <- sample_data(cohortA) 

Run ANCOMBC

# Model M6
out <- ancombc2(
  data = pseq.baseline, 
  tax_level = 'Genus',
  fix_formula = 'Location + HIV1_viral_outcome + Cohort',
  group = 'Cohort'
)

# Model M7
# out <- ancombc2(
#   data = pseq, 
#   tax_level = 'Genus',
#   fix_formula = 'Location + Week + HIV1_viral_outcome + Cohort',
#   rand_formula = '(1 | PID)', 
#   group = 'Cohort'
# )

# Model M8
# out <- ancombc2(
#   data = pseq.A, 
#   tax_level = 'Genus',
#   fix_formula = 'Location + Week + HIV1_viral_outcome',
#   rand_formula = '(1 | PID)'
# )

View ANCOMBC results

# Results 
res <- out$res

## Location results 
df_location <- res %>% 
  dplyr::select(taxon, contains("Location")) 

## Week results 
df_week <- res %>% 
  dplyr::select(taxon, contains("Week"))

## Viral Outcome results 
df_vo <- res %>% 
  dplyr::select(taxon, contains("Viral"))

## Cohort results
df_cohort <- res %>% 
  dplyr::select(taxon, contains("Cohort"))

Plot

# put results data in correct format 
df_cohort_fig <- df_cohort %>% 
  dplyr::filter(diff_CohortExp == 1 | diff_CohortNaïve == 1) %>%  
  dplyr::mutate(lfc_CohortNaïve = ifelse(diff_CohortNaïve == 1, 
                                  lfc_CohortNaïve, 0), 
         lfc_CohortExp = ifelse(diff_CohortExp == 1, 
                               lfc_CohortExp, 0) 
  ) %>% 
  dplyr::transmute(taxon, 
            `Exp vs HC` = round(lfc_CohortExp, 2),
            `Naïve vs HC` = round(lfc_CohortNaïve, 2)) %>% 
  tidyr::pivot_longer(cols = `Exp vs HC`:`Naïve vs HC`, 
               names_to = "group", values_to = "value") %>% 
  dplyr::arrange(taxon)

# plot 
lo <- floor(min(df_cohort_fig$value))
up <- ceiling(max(df_cohort_fig$value))
mid <- (lo + up)/2
df_cohort_fig %>%
  mutate(taxon = paste(substr(taxon, 7, 100000L))) %>% 
  ggplot(aes(x = factor(group, level = c("Naïve vs HC","Exp vs HC")), y = taxon, fill = value)) + 
  geom_tile(color = "black") +
  scale_fill_gradient2(low = "deepskyblue3", high = "red2", mid = "white", 
                       na.value = "white", midpoint = mid, limit = c(lo, up),
                       name = NULL) +
  geom_text(aes(group, taxon, label = value), color = "black", size = 6) +
  labs(x = NULL, y = NULL) +
  ggtitle("Baseline") + 
  theme_minimal() +
  theme(plot.title = element_text(hjust = 0.5), 
        text = element_text(size = 18)) 

Integrative Analysis of Immune Markers and Microbiome

Building Predictive Models for Immune Markers

Data Preprocessing

# get microbiome diversity data 
shannon_df <- read_qza("data/DiversityData/shannon_vector.qza")
shannon_df <- shannon_df$data
shannon_df <- tibble::rownames_to_column(shannon_df, "SampleID")

w_uni <- read_qza("data/DiversityData/weighted_unifrac_pcoa_results.qza")$data
w_uni_pcoa <- as_tibble(w_uni$Vectors)
w_uni_pcoa <- w_uni_pcoa %>% 
  dplyr::select(SampleID, PC1, PC2, PC3, PC4)

# add to metadata
Metadata281 <- Metadata281 %>% 
  left_join(shannon_df, by = "SampleID") %>% 
  left_join(w_uni_pcoa, by = "SampleID") 

# get immune marker table and feature table 
IM_data <- Metadata281 %>% dplyr::select(SampleID, all_of(immune_markers)) 
features  <- Metadata281 %>% 
  dplyr::select(SampleID, Cohort, Location, Gender, BMI, BMI_Cat, HIV_Status, Week, Visit_Age,
         Bristol_Score, Education_Level, Water_Source_Num, Cotrimoxazole_HIV, 
         Cotri_Length_Month, Normal_work_transportation, Manual_work_job,
         Manual_chores_home, Head_of_household, LenghtOnArt_yrs) %>% 
  left_join(shannon_df, by = "SampleID") %>% 
  left_join(w_uni_pcoa, by = "SampleID") 

# some preprocessing to address NAs 
# there were 20 samples without microbiome informations due to quality control
features$Cotrimoxazole_HIV[features$Cotrimoxazole_HIV == "NA"] <- NA
features$Cotrimoxazole_HIV[is.na(features$Cotrimoxazole_HIV)] <- "No"
features$Cotrimoxazole_HIV <- droplevels(features$Cotrimoxazole_HIV)
features$Normal_work_transportation[features$Normal_work_transportation == "NA"] <- NA
features$Normal_work_transportation <- droplevels(features$Normal_work_transportation)
features$BMI_Cat[is.na(features$BMI_Cat)] <- "Moderate_thinnes"
features$Head_of_household <- droplevels(features$Head_of_household)

Feature Selection

# backwards stepwise regression feature selection
formulas <- vector(mode="list")

for(marker in immune_markers){
  print(marker)
  # merge immune marker with features 
  IM_one <- IM_data %>% 
    dplyr::select(SampleID, marker)
  step_data <- features %>% 
    left_join(IM_one, by = "SampleID") %>% 
    dplyr::select(-SampleID) %>% 
    na.omit()
  
  ## we want the model to have location 
  f <- as.formula(paste0(marker," ~ Location"))
  myM <- lm(f, data = step_data)
  
  ## model with everything 
  fB <- as.formula(paste0(marker, " ~."))
  biggest <- lm(fB, data = step_data)
  
  ## stepwise regression, direction = "backward"
  bwd <- step(biggest, 
              trace = 1, 
              direction = "backward", 
              scope = list(lower = myM, upper = biggest))
  
  bwd_formula <- paste0(bwd$call)[2]
  formulas[[marker]] <- bwd_formula
}

Model Building & Testing

# for models involving PLWH we viral load 
# for models involving ART exp PLWH we added length on ART 

# made models dataset 
lm_data <- as.data.frame(matrix(nrow = 0, ncol = 4))
colnames(lm_data) <- c("Marker", "Week", "Cohort", "Formula")

weeks <- c(0, 24)
Cohorts <- c("Naïve", "Exp", "HC")

for(marker in immune_markers){
  for(week in weeks){
    for(cohort in Cohorts){
      if(cohort == "Naïve"){
        form <- paste0(formulas[marker], "+ HIV1_viral_load")
      }
      if(cohort == "Exp"){
        form <- paste0(formulas[marker], "+ HIV1_viral_load + LenghtOnArt_yrs")
      }
      if(cohort == "HC"){
        form <- paste0(formulas[marker])
      }
      r <- c(marker, week, cohort, form)
      lm_data[nrow(lm_data)+1, ] <- r
    }
  }
}

# run models and make results 
sub_d <- Metadata281 %>% 
  filter(Cohort == "HC" & Week == 24)
f <- as.formula(CRP_mg_L ~ Location + BMI + Visit_Age + PC1 )
summary(lm(f, data = sub_d))

Plotting

lm_table <- read_excel("data/PredictiveModeling/LinearModelInfo.xlsx")

lm_table$Tested[lm_table$Tested == "Yes"] <- "Included in Model"
lm_table$Cohort <- factor(lm_table$Cohort, levels = c("Naïve", "Exp", "HC"))
lm_table$Tested <- factor(lm_table$Tested, levels = c("Up", "Up (Model NS)", "Down", 
                                                      "Down (Model NS)", "Included in Model"))

im_labels <- c("CD4+CD38+HLA-DR+ (%)", 
                "CD8+CD38+HLA-DR+ (%)", 
                "CD4+PD1+ (%)", 
                "CD8+PD1+ (%)",
                "CD4+CD103+ (%)",
                "CD8+CD103+ (%)",
                "IL-6 (pg/mL)", 
                "CRP (mg/L)", 
                "CD4+ (%)")
lm_table$`Immune Marker` <- factor(lm_table$`Immune Marker`, levels = im_labels)

cols <- colnames(lm_table)
lm_table[cols] <- lapply(lm_table[cols], factor)



ggplot(lm_table, aes(x = Predictor, y = `Immune Marker`)) + 
  geom_point(aes(color = Tested), size = 6) + 
  scale_color_manual(values = c("Up" = "red2",
                                "Up (Model NS)"="peachpuff",
                                "Down"="deepskyblue3", 
                                "Down (Model NS)" = "paleturquoise", 
                                "Included in Model" = "lightgrey")) + 
  theme_bw() + 
  theme(text = element_text(size = 18), 
        axis.text.x = element_text(angle = 45, vjust = 1, hjust=1), 
        legend.title = element_blank()) +
  ylab("Immune Marker") +
  facet_grid(Cohort~Week)

Network Analysis of Immune and Microbial Associations

Format Mb Data

library(MicrobiotaProcess)

# making a pretty taxa table 
tax <- read_qza("data/NetworkAnalysis/dataFrom/classification.qza")
  
#head(tax$taxatab)
tax$taxatab$sequence = seq_along(rownames(tax$taxatab))

getNiceTaxon <- function(rowVector){
  newVector <- c()
  for(rowValue in 1:length(rowVector)){
    #print(rowVector[rowValue])
    if(substr(rowVector[rowValue], 1, 6) != "module"){
     newName <- paste(tax$taxatab[rowVector[rowValue],], collapse=";")

      #newName <- tax$taxatab[rowVector[rowValue],]
      newName <- tail(strsplit(newName, split = ";")[[1]], 3)
      newName <- paste0(newName[1], ".", newName[2],"_",newName[3])
      newVector <- c(newVector, c(newName))
    }else{
      newVector <- c(newVector, c(rowVector[rowValue]))
    }
  }
  return(newVector)
}  
 

# Filter SCNIC output for ASVs that are not seen in less than 20% of samples
# before loading in convert SCNIC biom to TSV
# remove first line in the TSV 

SCNIC_output <- read_delim("data/NetworkAnalysis/dataFrom/table_collapse.txt", 
                           delim = "\t", escape_double = FALSE, 
                           trim_ws = TRUE)
colnames(SCNIC_output)[1] <- "OTU_ID"

# convert to proper format 
t_SO <- data.frame(t(SCNIC_output[,-1]))
colnames(t_SO) <- SCNIC_output$OTU_ID

#Compute relative abundance

reads = rowSums(t_SO) #not same, so not rarefied
scRA = t_SO/reads
rowSums(scRA) #sums to 1


# remove ASVs prevalent in less than 20% of samples 
# goes from 8853 to 237 ASVs/modules
# goes from 338 to 106 taxa/modules 

count0s<- function(col) {
  zeros <- sum(col==0)
  perc <- length(col)*0.2
  length(col) - zeros > perc 
}


scRAthin = scRA[,apply(scRA, 2, count0s)]

head(t_SO[,"ae988b9b4c6c1e2466fa26cd1e2ec5c7"])


newCols = getNiceTaxon(colnames(scRAthin))
colnames(scRAthin) = newCols

scRAthin$SampleID <- row.names(scRAthin)
scRAthin = cbind(reads, scRAthin)

write.csv(scRAthin, "data/NetworkAnalysis/dataOut/mb.csv", quote=FALSE, row.names=FALSE)
write.csv(colnames(scRAthin), "data/NetworkAnalysis/dataOut/mbH.csv", quote=FALSE, row.names=FALSE)

Combine Immune and Microbiome data

im = read.csv("data/NetworkAnalysis/dataOut/immuneMarkersImputed.csv")
mb = read.csv("data/NetworkAnalysis/dataOut/mb.csv")
h = read.csv("data/NetworkAnalysis/dataOut/mbH.csv")
colnames(mb) = h$x

#Add prefixes to data

colnames(im)[5:ncol(im)] = paste("im_", colnames(im)[5:ncol(im)], sep="")
colnames(mb)[2:(ncol(mb)-1)] = paste("mb_", colnames(mb)[2:(ncol(mb)-1)], sep="")

#Merge data
#Have less data with im than mb
d = merge(im, mb, by.x="SampleID", by.y="SampleID")


#separate into Week0 and Week24
d0 = subset(d, Week==0) #127 rows
d24 = subset(d, Week==24) #113.  Delta would be pretty good

#Save files and column names
write.csv(d0, "data/NetworkAnalysis/dataOut/imMbW0.csv", quote=FALSE, row.names=FALSE)
write.csv(colnames(d0), "data/NetworkAnalysis/dataOut/imMbW0_h.csv", quote=FALSE, row.names=FALSE)


write.csv(d24, "data/NetworkAnalysis/dataOut/imMbW24.csv", quote=FALSE, row.names=FALSE)
write.csv(colnames(d24), "data/NetworkAnalysis/dataOut/imMbW24_h.csv", quote=FALSE, row.names=FALSE)

Derive and Filter Immune:Microbe regressions

##########Parameters
tag = "3sW0"

dataOut = paste("data/NetworkAnalysis/dataOut/models_", tag, "June2023.csv", sep="")

##########Begin processing
dd = read.csv("data/NetworkAnalysis/dataOut/imMbW0.csv")
h = read.csv("data/NetworkAnalysis/dataOut/imMbW0_h.csv")
colnames(dd) = h$x

#dd = read.csv("data/NetworkAnalysis/dataOut/imMbW24.csv")
#h = read.csv("data/NetworkAnalysis/dataOut/imMbW24_h.csv")
#colnames(dd) = h$x

#Which arm is reference?  HC
dd$Cohort = factor(dd$Cohort, levels = c("Naive", "Exp", "HC"))

coCol = which(colnames(dd)=="Cohort")

#Rename this to Grp
colnames(dd)[coCol] = "Grp"

#read count column
rc = which(colnames(dd)=="reads")

#must know where in file which "ome" starts and stops.  These are column numbers

myOmes = c("im_", "mb_")

omeCols = lapply(myOmes, function(x) grep(paste("^", x, sep=""), colnames(dd)))

cc = combinations(n=length(myOmes),r=2)

#want to regress, e.g.: im ~ mb

#create function regress
regress = function(oA, oB, assays)
{
    #oA = o1
    #oB = o2
    
    for (h in oA)
    {   
        #print(h)
        #h = 6
        ana1 = colnames(dd)[h]
        print(ana1)
        
        for (i in oB)
        {    
            #i=15
            #print(paste(h,i))
            ana2 = colnames(dd)[i]
            print(paste(ana1,ana2, sep=","))
            #subset for complete cases.
            #not thin yet, so this works    
            dcc = dd[complete.cases(dd[, c(h,i,rc,coCol)]), c(h,i,rc,coCol) ]
                        
            #print(nrow(dcc))
            n=nrow(dcc)
            nNot0 = sum(dcc[,ana2] > 0)
            if(min(table(dcc$Grp)) > 10)
            {
                                        
                    #3 slope, readcount in model
                    m = lm(dcc[,ana1] ~ dcc$reads + dcc$Grp + dcc[,ana2]*dcc$Grp)
                    adjRsq = summary(m)$adj.r.squared
                    dffitsMax = max(abs(dffits(m)))
                    fParts = summary(m)$fstatistic
                     pF = pf(fParts[1], fParts[2], fParts[3], lower.tail=FALSE)
                    tt = summary(m)$coefficients
                    if (nrow(tt) == 7)
                    {
                        pRc = tt[2,4]
                        pNaive = tt[5,4]
                          pExp = tt[6,4]
                          pHC = tt[7,4]
                          ExpSlope = tt[6,1] + tt[5,1]
                          HCSlope = tt[7,1] + tt[5,1]
                          q13 = quantile(dcc[,ana2], c(.25,.75))
                          NaiveIqrE = tt[5,1] * (q13[2] - q13[1])
                          HCIqrE = HCSlope * (q13[2] - q13[1])
                          ExpIqrE = ExpSlope * (q13[2] - q13[1])                                                
                        #append to output file
                        
                        write(paste(assays, ana1, ana2, n, nNot0, adjRsq, pF, pNaive, pExp, pHC, NaiveIqrE,  ExpIqrE, HCIqrE, dffitsMax, pRc, sep=","), dataOut, append=TRUE)

                        
                    }
            } 
        }   
    }
}

#Create header row for output file.
#Writing these results directly to file performs well, and sets stage for permutation testing.
#Should something go wrong in a long job, it is possible to restart without losing the data previously generated, since it's in the output file.
header = paste("assays", "ana1", "ana2", "n", "nNot0", "adjRsq", "pF", "pNaive", "pExp", "pHC", "IqrNaive",  "IqrExp", "IqrHC", "maxInfluence", "pRc", sep=",")

write(header, dataOut, append=FALSE)
#Add timer for general information
tm0 = proc.time()

for (k in 1:nrow(cc))
{

  #k = 2
  print(paste("processing row ", k))
  #omeA = myOmes[1]
  oA = omeCols[[cc[k,1]]]
  #shorten array for testing purposes
  #oA = omeCols[[cc[k,1]]][1:10]

  oB = omeCols[[cc[k,2]]]
  #omeCols[[oA]]
  #omeCols[[oB]]
  regress(oA, oB, gsub("_", "", paste(myOmes[cc[k,1]], myOmes[cc[k,2]], sep=".")))
}
proc.time() - tm0





#Explore results
d = read.csv(dataOut)
d$pAdjFDR = p.adjust(d$pF, method="fdr")


dBest = subset(d, pAdjFDR < 0.2 & maxInfluence < 4 & pNaive < 0.05 & (pExp < 0.05 | pHC < 0.05) & adjRsq > 0.15)

dBestStrict = subset(d, pAdjFDR < 0.2 & maxInfluence < 2 & pNaive < 0.05 & (pExp < 0.05 | pHC < 0.05) & adjRsq > 0.25)

sort(table(c(dBest$ana1, dBest$ana2)))

#bos=best of show
bosOut = paste("data/NetworkAnalysis/dataOut/bos_", tag, "June2023.csv", sep="")
write.csv(dBest, bosOut, quote=FALSE, row.names=FALSE)


sort(table(c(dBestStrict$ana1, dBestStrict$ana2)))
bosOutStrict = paste("data/NetworkAnalysis/dataOut/bos_", tag, "Strict_June2023.csv", sep="")
write.csv(dBestStrict, bosOutStrict, quote=FALSE, row.names=FALSE)

Create Network Diagram

d = read.csv("data/NetworkAnalysis/dataOut/bos_3sW0Strict_June2023.csv", stringsAsFactors=FALSE)
gg = graph.data.frame(d[,c("ana1","ana2")], directed=FALSE)

#retrieve niceNames
niceNames = read.csv("data/NetworkAnalysis/dataFrom/bosNamesWithNice.csv")
niceNames$nice = gsub("\\n", "\n", niceNames$nice, fixed=TRUE)

#perform lookup
featureToNice = list()
featureToNice[niceNames$orig] = as.character(niceNames$nice)
myNiceNames = unlist(sapply(V(gg)$name, function(x) featureToNice[[x]] ))



#1 solid
d$slope = "1"
#2 dotted
d$slope[d$IqrNaive < 0] = "2"

#red positive, blue negative
edgeColors = c("red2", "deepskyblue3")

myRgb = col2rgb(edgeColors)

myTint = 0.45
newColors = myRgb + (255 - myRgb) * myTint

newColorsRgb = rgb(t(newColors), maxColorValue=255)

d$edgeColor = newColorsRgb[1]
d$edgeColor[d$IqrNaive < 0] = newColorsRgb[2]


#Vertex shape:  circle normally, square for module
myShapes = rep("circle",length(V(gg)))
myShapes[grep("module",V(gg)$name)] = "square"

assays = c("im", "mb")

myColorsRedPink = c('#feebe2','#fa9fb5')


myRgb = col2rgb(myColorsRedPink)

myTint = 0.45
newColors = myRgb + (255 - myRgb) * myTint

newColorsRgb = rgb(t(newColors), maxColorValue=255)

myColors = newColorsRgb


V(gg)$color=myColors[1]
for(i in 2:length(assays))
{
  myExpr = paste("^", assays[i], sep="")
  V(gg)$color[grep(myExpr, V(gg)$name, fixed=FALSE)] = myColors[i]
}
myColor = V(gg)$color



myEdgeWidth = -1*log10(d$pNaive)
myEdgeLty = as.numeric(d$slope)

#To adjust label position, use distance, degree pairs
#Set vertex.label.degree and vertex.label.dist in the plot command
myDist = rep(0,length(V(gg)))
myDeg = rep(0,length(V(gg)))

myDist[which(myNiceNames=="g_Clostridia_vadinBB60_group")] = .5
myDeg[which(myNiceNames=="g_Clostridia_vadinBB60_group")] = -pi/2

myDist[which(myNiceNames=="g_Blautia")] = .5
myDeg[which(myNiceNames=="g_Blautia")] = pi/2

myDist[which(myNiceNames=="g_Ruminococcus_torques_group")] = 3
myDeg[which(myNiceNames=="g_Ruminococcus_torques_group")] = pi

myDist[which(myNiceNames=="s_Massiliprevotella_massiliensis")] = 2.5
myDeg[which(myNiceNames=="s_Massiliprevotella_massiliensis")] = pi

myDist[which(myNiceNames=="g_Solobacterium")] = .3
myDeg[which(myNiceNames=="g_Solobacterium")] = pi/2


set.seed(20)
lay = layout_with_fr(gg)

par(mar = rep(2, 4))

plot(gg, vertex.label.family="sans", vertex.label= myNiceNames,  vertex.shape = myShapes, vertex.label.cex=.7, vertex.label.font=1, vertex.label.color="black", edge.lty=1, edge.label=NA, edge.label.font=1, edge.label.cex=.8, edge.label.color=NA, edge.label.family="sans", edge.curved=.1, edge.width=myEdgeWidth, edge.color=d$edgeColor, vertex.label.dist=myDist, vertex.label.degree=myDeg, vertex.frame.color="darkgray", layout=lay, vertex.color=V(gg)$color, vertex.frame.lty=2, vertex.width=4)

legend(x="topleft", assays, pch=21,
       col="#777777", pt.bg=myColors, pt.cex=1.3, cex=.8, bty="n", ncol=1)

Derive and Filter Immune:Microbe Regressions, Exp v HC

Create Subsets of Data for Exp and HC only

dd = read.csv("data/NetworkAnalysis/dataOut/imMbW0.csv")
h = read.csv("data/NetworkAnalysis/dataOut/imMbW0_h.csv")
colnames(dd) = h$x

ds = subset(dd, Cohort != "Naive") #75

write.csv(ds, "data/NetworkAnalysis/dataOut/imMbW0_ExpHC.csv", quote=FALSE, row.names=FALSE)

dd = read.csv("data/NetworkAnalysis/dataOut/imMbW24.csv")
h = read.csv("data/NetworkAnalysis/dataOut/imMbW24_h.csv")
colnames(dd) = h$x

ds = subset(dd, Cohort != "Naive") #69

write.csv(ds, "data/NetworkAnalysis/dataOut/imMbW24_ExpHC.csv", quote=FALSE, row.names=FALSE)

#No need to write headers since those stay the same

Perform Regressions

##########Parameters
#tag = "3sW0_ExpHC"
tag = "3sW24_ExpHC"

dataOut = paste("data/NetworkAnalysis/dataOut/models_", tag, "June2023.csv", sep="")



##########Begin processing

if (grepl("W0", tag))
{
  dd = read.csv("data/NetworkAnalysis/dataOut/imMbW0_ExpHC.csv")
  h = read.csv("data/NetworkAnalysis/dataOut/imMbW0_h.csv")
  colnames(dd) = h$x
} else
{
  dd = read.csv("data/NetworkAnalysis/dataOut/imMbW24_ExpHC.csv")
  h = read.csv("data/NetworkAnalysis/dataOut/imMbW24_h.csv")
  colnames(dd) = h$x
}

coCol = which(colnames(dd)=="Cohort")

#Rename this to Grp
colnames(dd)[coCol] = "Grp"


#read count column
rc = which(colnames(dd)=="reads")

#must know where in file which "ome" starts and stops.  These are column numbers

myOmes = c("im_", "mb_")

omeCols = lapply(myOmes, function(x) grep(paste("^", x, sep=""), colnames(dd)))

cc = combinations(n=length(myOmes),r=2)

#create function regress
regress = function(oA, oB, assays)
{
    #oA = o1
    #oB = o2
    for (h in oA)
    {   
        #print(h)
        #h = 6
        ana1 = colnames(dd)[h]
        #print(ana1)
        
        for (i in oB)
        {    
            #i=15
            #print(paste(h,i))
            ana2 = colnames(dd)[i]
            #print(paste(ana1,ana2, sep=","))
            #subset for complete cases.
            #not thin yet, so this works    
            dcc = dd[complete.cases(dd[, c(h,i,rc,coCol)]), c(h,i,rc,coCol) ]
                        
            #print(nrow(dcc))
            n=nrow(dcc)
            nNot0 = sum(dcc[,ana2] > 0)
            if(min(table(dcc$Grp)) > 10)
            {
                                        
                    #3 slope, readcount in model
                    m = lm(dcc[,ana1] ~ dcc$reads + dcc$Grp + dcc[,ana2]*dcc$Grp)
                    adjRsq = summary(m)$adj.r.squared
                    dffitsMax = max(abs(dffits(m)))
                    fParts = summary(m)$fstatistic
                     pF = pf(fParts[1], fParts[2], fParts[3], lower.tail=FALSE)
                    tt = summary(m)$coefficients
                    #cheap trick to catch bad model
                    if (nrow(tt) == 5)
                    {
                    
                          pRc = tt[2,4]
                      pExp = tt[4,4]
                      pHC = tt[4,4]
                      ExpSlope = tt[4,1]
                      HCSlope = tt[5,1] + ExpSlope
                      q13 = quantile(dcc[,ana2], c(.25,.75))
                      HCIqrE = HCSlope * (q13[2] - q13[1])
                      ExpIqrE = ExpSlope * (q13[2] - q13[1])                                                
                    
                      #append to output file
                          write(paste(assays, ana1, ana2, n, nNot0, adjRsq, pF,  pExp, pHC, ExpIqrE, HCIqrE, dffitsMax, pRc, sep=","), dataOut, append=TRUE)
                    }
                    
            }
        }   
    }
}


#Create header row for output file.
#Writing these results directly to file performs well, and sets stage for permutation testing.
#Should something go wrong in a long job, it is possible to restart without losing the data previously generated, since it's in the output file.
header = paste("assays", "ana1", "ana2", "n", "nNot0", "adjRsq", "pF",  "pExp", "pHC", "IqrExp", "IqrHC", "maxInfluence", "pRc", sep=",")

write(header, dataOut, append=FALSE)
#Add timer for general information
tm0 = proc.time()

for (k in 1:nrow(cc))
{

  #k = 2
  #print(paste("processing row ", k))
  #omeA = myOmes[1]
  oA = omeCols[[cc[k,1]]]
  #shorten array for testing purposes
  #oA = omeCols[[cc[k,1]]][1:10]

  oB = omeCols[[cc[k,2]]]
  #omeCols[[oA]]
  #omeCols[[oB]]
  assays = gsub("_", "", paste(myOmes[cc[k,1]], myOmes[cc[k,2]], sep="."))
  regress(oA, oB, assays)
}
proc.time() - tm0

Compute p-Value of HC Slope Different than 0 and Filter Results

#tag = "3sW0_ExpHC"
tag = "3sW24_ExpHC"


inFile = paste("data/NetworkAnalysis/dataOut/models_", tag, "June2023.csv", sep="")
outFile = paste("data/NetworkAnalysis/dataOut//models_", tag, "Contrasts.csv", sep="")


models = read.csv(inFile, stringsAsFactors=TRUE)


if (grepl("W0", tag))
{
    dd = read.csv("data/NetworkAnalysis/dataOut/imMbW0_ExpHC.csv")
    h = read.csv("data/NetworkAnalysis/dataOut/imMbW0_h.csv")
    colnames(dd) = h$x
} else
{
    dd = read.csv("data/NetworkAnalysis/dataOut/imMbW24_ExpHC.csv")
    h = read.csv("data/NetworkAnalysis/dataOut/imMbW24_h.csv")
    colnames(dd) = h$x
}

dd$Cohort = factor(dd$Cohort)

#nts:  doing contrasts this way corrects for the multiple comparisons by default
#below is contrast for slope of HC different than 0 (reference slope + HC offset)
#can specify method, including "none"
contr <- rbind("SlopeC2"=c(0,0,0,1,1))
               

models$pSHC=NA

for (i in 1:nrow(models))
{
    myRow = i
    #myRow = 5
    
    ana1 = as.character(models$ana1[myRow])
    ana2 = as.character(models$ana2[myRow])
    
    myCols = c("SampleID","Cohort",ana1,ana2,"reads")
    
    #Extract a subset of the data for building the model
    #Identify read column based on the ana2
    ss = as.data.frame(dd[,myCols])
    ssc = ss[complete.cases(ss),]
        
    #Rename the relevant columns to a generic "Ana1" and "Ana2"
    #This makes the regression model a little clearer than the subscript approach.  
    colnames(ssc)[3] = "Ana1"
    colnames(ssc)[4] = "Ana2"
        
    m = lm(Ana1 ~ reads + Cohort + Ana2*Cohort , data=ssc)
    test = glht(m, linfct=contr)
        
    pvals = summary(test, adjusted(type="none"))$test$pvalues
            
    models$pSHC[i] = pvals[1]
                
}


models$pFAdj = p.adjust(models$pF, method="fdr")
write.csv(models, outFile, quote=FALSE, row.names=FALSE)

models = read.csv(outFile)


#At least one of the slopes for Exp or HC is not 0, and the difference between slopes is not 0

dBest = droplevels(subset(models, pFAdj < 0.2 & maxInfluence < 2 & (pExp < 0.05 | pSHC < 0.05) & pHC < 0.05 & adjRsq > 0.2))
nrow(dBest)

sort(table(c(dBest$ana1, dBest$ana2)))

#save dBest
outFileBos = paste("../dataOut/bos_", tag, "Contrasts.csv", sep="")
write.csv(dBest, outFileBos, quote=FALSE, row.names=FALSE)