Import Needed Packages

library(readxl)             # version 1.4.2
library(dplyr)              # version 1.1.2
library(lubridate)          # version 1.9.2
library(RColorBrewer)       # version 1.1-3
library(FSA)                # version 0.9.4
library(ggh4x)              # version 0.2.4
library(fixest)             # version 0.11.1  
library(ggplot2)            # version 3.4.2
library(qiime2R)            # version 0.99.6
library(cowplot)            # version 1.1.1
library(readr)              # version 2.1.4
library(ggpubr)             # version 0.6.0
library(ggrepel)            # version 0.9.3
library(vegan)              # version 2.6-4
library(ANCOMBC)            # version 2.0.2 
library(phyloseq)           # version 1.42.0
library(tidyr)              # version 1.3.0
library(microViz)           # version 0.10.8
library(jsonlite)           # version 1.8.4
library(MicrobiotaProcess)  # version 1.12.3
library(gtools)             # version 3.9.4
library(igraph)             # version 1.4.2
library(stringr)            # version 1.5.0
library(multcomp)           # version 1.4-25

set.seed(5)

Data Read In and Wrangle

Metadata281 <- read_excel("data/Metadata281.xlsx")

# Rename some columns 
colnames(Metadata281)[colnames(Metadata281) == 'CD4+_count'] <- 'CD4_count'
colnames(Metadata281)[colnames(Metadata281) == 'Cohort'] <- 'Cohort_Description'
colnames(Metadata281)[colnames(Metadata281) == 'HIV-1_viral_load'] <- 'HIV1_viral_load'
colnames(Metadata281)[colnames(Metadata281) == 'IL-6_pg_mL'] <- 'IL6_pg_mL'

# Create viral outcome column
Metadata281$HIV1_viral_load[Metadata281$HIV1_viral_load == 'ND'] <- 0
Metadata281$HIV1_viral_load <- as.numeric(Metadata281$HIV1_viral_load)
Metadata281$HIV1_viral_outcome <- 'Suppressed'

Metadata281$HIV1_viral_outcome[Metadata281$Cohort_Short == 'A' & 
                                Metadata281$Visit == 3 &
                                Metadata281$HIV1_viral_load > 200] <- 'Viremic'
Metadata281$HIV1_viral_outcome[Metadata281$Cohort_Short == 'B' & 
                                Metadata281$Visit == 2 &
                                Metadata281$HIV1_viral_load > 200] <- 'Viremic'
Metadata281$HIV1_viral_outcome <- as.factor(Metadata281$HIV1_viral_outcome)

# Calculate length on ART column 
Metadata281$ARTStart[Metadata281$Cohort_Short == "B"] <- Metadata281$ART_Start_Date[Metadata281$Cohort_Short == "B"]
Metadata281$ARTStart <- as.Date(Metadata281$ARTStart, "%Y") 
lubridate::day(Metadata281$ARTStart) <- 1
lubridate::month(Metadata281$ARTStart) <- 6
Metadata281$Visit_date <- as.Date(Metadata281$Visit_date, "%m/%d/%y")

Metadata281$LenghtOnArt_yrs[Metadata281$Cohort_Short == "A"] <- 0.5
Metadata281$LenghtOnArt_yrs[Metadata281$Cohort_Short == "C"] <- 0.0
Metadata281$LenghtOnArt_yrs[Metadata281$Cohort_Short == "B"] <- time_length(Metadata281$Visit_date[Metadata281$Cohort_Short == "B"] - Metadata281$ARTStart[Metadata281$Cohort_Short == "B"], "years")

# Make cleaner columns for visualizations
Metadata281$Location[Metadata281$Arm_Short == "MA"] <- "Urban"
Metadata281$Location[Metadata281$Arm_Short == "MT"] <- "Rural"
Metadata281$Cohort[Metadata281$Cohort_Short == "A"] <- "Naïve"
Metadata281$Cohort[Metadata281$Cohort_Short == "B"] <- "Exp"
Metadata281$Cohort[Metadata281$Cohort_Short == "C"] <- "HC"
Metadata281$Week[Metadata281$Visit == 2] <- 0
Metadata281$Week[Metadata281$Visit == 3] <- 24
Metadata281$Location <- as.factor(Metadata281$Location)
Metadata281$Cohort <- as.factor(Metadata281$Cohort)
Metadata281$Cohort <- ordered(Metadata281$Cohort, levels = c("Naïve", "Exp", "HC"))
Metadata281$Week <- as.factor(Metadata281$Week)

# make correct factored variables 
fact_cols <- c("Gender", "HIV_Status", "Bristol_Score", "Water_Source_Num", 
               "Cotrimoxazole_HIV", "Manual_work_job", "Manual_chores_home",
               "Head_of_household", "Education_Level", "Normal_work_transportation")
Metadata281[fact_cols] <- lapply(Metadata281[fact_cols], factor) 
Metadata281$BMI_Cat <- ordered(Metadata281$BMI_Cat, levels = c("Severe_thinness", "Moderate_thinnes", "Mild_thinness", "Normal", "Overweight", "Obese"))

# make sure immune markers are numeric 
immune_markers <- c("CD4_Percent",
                    "CD4_HLADRpos_CD38pos_Percent", 
                    "CD8_HLADRpos_CD38pos_Percent", 
                    "CD4_PD1_Percent", 
                    "CD8_PD1_Percent",
                    "CD4_CD103_Percent",
                    "CD8_CD103_Percent",
                    "IL6_pg_mL", 
                    "CRP_mg_L")
for (marker in immune_markers){
  Metadata281[[marker]] <- as.numeric(Metadata281[[marker]])
}

SupplementalFig3 <- Metadata281

# Impute means for immune markers where needed 

for(marker in immune_markers){ 
  Metadata281[[marker]][is.na(Metadata281[[marker]])] <- mean(Metadata281[[marker]], na.rm = TRUE)
}

# Make sure to remove sample with no immune data 
Metadata281 <- Metadata281 %>% 
  filter(SampleID != "ZIM033.3")

# Keep only samples with two visits  
Metadata281 <- Metadata281[Metadata281$PID %in% Metadata281$PID[duplicated(Metadata281$PID)],]

# Remove patients that were viremic 
vir_PID <- Metadata281 %>%  
  filter(HIV1_viral_outcome == "Viremic") %>% 
  dplyr::select(SampleID, PID, Cohort_Short, HIV1_viral_outcome)
PIDs <- unique(vir_PID$PID)
Metadata281$HIV1_viral_outcome[Metadata281$PID %in% PIDs] <- 'Viremic'

# get figure 1 inter data 
inter_fig1 <- Metadata281 %>% 
  filter(!(HIV1_viral_outcome == "Viremic" & Cohort_Short == "B")) %>% 
  filter(Visit == 2)

# remove all viremmic patients from metadata
Metadata281 <- subset(Metadata281, !(PID %in% PIDs))

# Get dataframe of just baseline measurements 
baseline <- Metadata281 %>% filter(Week == 0)

# Creating a color palettes
visitColors <- palette(brewer.pal(n = 5, name = "BrBG")) 
cohortColors <- palette(brewer.pal(n = 5, name = "PiYG"))

Cross-sectional and Longitudinal Analysis of Immune Phenotype

Inter and Intra Cohort Immune Analyses

# Figure 1 

sig_df <- data.frame(matrix(ncol = 5, nrow = 0))
colnames(sig_df) <- c("marker", "comparison", "pval", "direction", "patients")

categories <- c("All", "Urban", "Rural")
markers <- c("CD4_HLADRpos_CD38pos_Percent", 
             "CD8_HLADRpos_CD38pos_Percent", 
              "CD4_PD1_Percent", 
              "CD8_PD1_Percent",
              "CD4_CD103_Percent",
              "CD8_CD103_Percent",
              "IL6_pg_mL", 
              "CRP_mg_L")

for (category in categories){
  if(category != "All"){
    heatmap_df <- Metadata281 %>% filter(Location == category)
    heatmap_baseline <- inter_fig1 %>% filter(Location == category)
  }else{
    heatmap_df <- Metadata281
    heatmap_baseline <- inter_fig1
  }

  for (marker in markers){
  patients <- category
  
  # inter cohort tests 
  dt <- dunnTest(heatmap_baseline[[marker]], heatmap_baseline$Cohort_Short, method = "bonferroni")
  c1 <- "Naïve - Exp"
  c2 <- "Naïve - HC"
  c3 <- "Exp - HC"
  p1 <- dt$res$P.adj[1]
  p2 <- dt$res$P.adj[2]
  p3 <- dt$res$P.adj[3]
  
  mean <- heatmap_baseline %>% 
    group_by(Cohort_Short) %>%  
    summarise(avg = mean(eval(as.name(marker))))
  
  avgA <- mean[mean$Cohort_Short == "A",2]$avg
  avgB <- mean[mean$Cohort_Short == "B",2]$avg
  avgC <- mean[mean$Cohort_Short == "C",2]$avg
  
  diffAB <- avgA - avgB
  diffAC <- avgA - avgC
  diffBC <- avgB - avgC 
  
  r1 <- c(marker, c1, p1, diffAB, patients)
  r2 <- c(marker, c2, p2, diffAC, patients)
  r3 <- c(marker, c3, p3, diffBC, patients)
  
  sig_df[nrow(sig_df)+1, ] <- r1
  sig_df[nrow(sig_df)+1, ] <- r2
  sig_df[nrow(sig_df)+1, ] <- r3  
  
  # intra cohort tests 
  sub <- heatmap_df %>% filter(Cohort_Short == "A")
  comparison <- "Naïve: 0 - 24"
  t <- wilcox.test(sub[[marker]] ~ Visit, data = sub, paired = T)
  mean <- sub %>% 
    group_by(Visit) %>%  
    summarise(avg = mean(eval(as.name(marker))))
  avg1 <- mean[mean$Visit == 2,2]$avg
  avg2 <- mean[mean$Visit ==3,2]$avg
  mean_diff <- avg1 - avg2
  r <- c(marker, comparison, t$p.value, mean_diff, patients)
  sig_df[nrow(sig_df)+1, ] <- r
  
  sub <- heatmap_df %>% filter(Cohort_Short == "B")
  comparison <- "Exp: 0 - 24"
  t <- wilcox.test(sub[[marker]] ~ Visit, data = sub, paired = T)
  mean <- sub %>% 
    group_by(Visit) %>%  
    summarise(avg = mean(eval(as.name(marker))))
  avg1 <- mean[mean$Visit == 2,2]$avg
  avg2 <- mean[mean$Visit ==3,2]$avg
  mean_diff <- avg1 - avg2
  r <- c(marker, comparison, t$p.value, mean_diff, patients)
  sig_df[nrow(sig_df)+1, ] <- r
  
  sub <- heatmap_df %>% filter(Cohort_Short == "C")
  comparison <- "HC: 0 - 24"
  t <- wilcox.test(sub[[marker]] ~ Visit, data = sub, paired = T)
  mean <- sub %>% 
    group_by(Visit) %>%  
    summarise(avg = mean(eval(as.name(marker))))
  avg1 <- mean[mean$Visit == 2,2]$avg
  avg2 <- mean[mean$Visit ==3,2]$avg
  mean_diff <- avg1 - avg2
  r <- c(marker, comparison, t$p.value, mean_diff, patients)
  sig_df[nrow(sig_df)+1, ] <- r
  }
}

sig_df$direction <- as.numeric(sig_df$direction)
sig_df$pval <- as.numeric(sig_df$pval)
sig_df$patients <- as.factor(sig_df$patients)

sig_df$Significance <- 'NS'
sig_df$Significance[sig_df$pval <= 0.05] <- '*'
sig_df$Significance[sig_df$pval <= 0.01] <- '**'
sig_df$Significance[sig_df$pval <= 0.001] <- '***'
sig_df$Significance[sig_df$pval <= 0.0001] <- '****'
sig_df$Significance <- as.factor(sig_df$Significance)

sig_df$direction <- sig_df$direction * -1
sig_df$direction[sig_df$pval > 0.05] <- NA

markers_labels <- c("CD4+CD38+HLA-DR+ (%)", 
                    "CD8+CD38+HLA-DR+ (%)", 
                    "CD4+PD1+ (%)", 
                    "CD8+PD1+ (%)",
                    "CD4+CD103+ (%)",
                    "CD8+CD103+ (%)",
                    "IL-6 (pg/mL)", 
                    "CRP (mg/L)")
for (i in 1:length(markers)){
  sig_df$marker[sig_df$marker == markers[i]] <- markers_labels[i]
}
sig_df$category[sig_df$comparison == "Naïve - Exp" |
                sig_df$comparison == "Naïve - HC"  |
                sig_df$comparison == "Exp - HC"] <- "Inter"
sig_df$category[is.na(sig_df$category)] <- "Intra"


comp <- c("Naïve - Exp","Naïve - HC","Exp - HC", "Naïve: 0 - 24", "Exp: 0 - 24", "HC: 0 - 24")
ggplot(sig_df, aes(x = factor(comparison, levels = comp), 
                   y = factor(marker, levels = markers_labels))) + 
  geom_raster(aes(fill=direction))+  
  scale_fill_gradient2(high = "red2", mid = "white", low = "deepskyblue3", na.value = "lightgrey", midpoint = 0,
                       limits = c(-25, 25))+
  geom_text(aes(label=Significance), size = 6) + 
  theme_bw() + 
  xlab("") + 
  ylab("") +
  theme(text = element_text(size = 18), 
        plot.title = element_text(hjust = 0.5), 
        axis.text.x = element_text(angle = 45, vjust = 1, hjust=1), 
        legend.key = element_rect(colour = "black", size = 1), 
        legend.title = element_text(size = 14),
        legend.text=element_text(size=12)) + 
  guides(fill=guide_legend(title="Percent \nDelta", reverse = T)) + 
  facet_nested(~patients + category, scales = "free_x")