####################################################################################
### This workflow processes multiple 10x runs into one integrated Seurat dataset ###
####################################################################################

library(scDblFinder)
library(DropletUtils)
library(harmony)
library(Seurat)
library(tidyverse)
library(viridis)
library(readxl)

### functions
CellRanger_to_Sobj  <- function() {
  
  ### Make Seurat Object List
  rawdata_list <- vector("list", length = length(samples))                 # this will contain our count matrices
  Sobj_list    <- vector("list", length = length(samples))                 # our Seurat objects will be generated in here
  
  
  ### following loop reads in rawdata from cellranger output folders according to the sample name and then creates a Seurat object in the list
  for (i in seq_along(samples)) {
    rawdata_list[[i]] <- Read10X            (data.dir = file.path("CellRanger_Out/", samples[i] , "/outs/filtered_feature_bc_matrix/"))
    Sobj_list   [[i]] <- CreateSeuratObject (rawdata_list[[i]], project = samples[i], min.cells = 1, min.features = 1)
  }
  
  names(Sobj_list) <- samples # assign the correct names to the Sobj_List levels
  
  # rename cells using object names as prefix to prevent duplicates
  for (i in names(Sobj_list)) {
    Sobj_list[[i]] <- RenameCells(Sobj_list[[i]], add.cell.id = i)
  }
  
  # merge into one Seurat Object
  Sobj <<- merge(x = Sobj_list[[1]], y = Sobj_list[-1])
  
}
AssignClusterIdents <- function() {
  
  # make assignment of clusters to celltypes by using the best fit of marker genes
  CellType_Markers_Final <- read_excel("utilities/CellType_Markers_Final.xlsx") #read in cell type marker list
  Avg.Expr <- AverageExpression(Sobj, assays = "RNA", slot = "data", features = CellType_Markers_Final$Marker, group.by = res)
  
  # choose which subclusters have highest avg. expression of selected marker genes
  Assign <- Avg.Expr$RNA %>% 
    as.data.frame() %>% 
    rownames_to_column("Marker") %>% 
    pivot_longer(-1, names_to = res) %>% 
    left_join(CellType_Markers_Final) %>% 
    group_by(Marker) %>% 
    top_n(1, value) %>% 
    group_by(.data[[res]], CellType) %>% 
    summarise(n = n()) %>% 
    group_by(CellType) %>% 
    filter(n > 1) %>% 
    select(-n)
  
  
  # create the new labels
  Idents(Sobj) <- res
  newIdent <- Assign$CellType
  names(newIdent) <- Assign$RNA_snn_res.0.1
  Sobj <- RenameIdents(object = Sobj, newIdent)
  Sobj$CellType <- Idents(Sobj)
  
}
DoubletDetection    <- function() {
  
  # prepare celltype and barcode assignment for merging with sce data
  cluster.assign <- Sobj@meta.data %>% 
    rownames_to_column("Barcode") %>% 
    select(Barcode, Sample = "orig.ident", CellType) %>% 
    mutate(Barcode = substr(Barcode, nchar(Barcode)-17, 40))
  
  
  #### load 10x data as sce object for scDblFinder
  # create vector with file destinations
  p = vector()
  for (i in 1:length(samples)) {
    p[i] <- paste0("CellRanger_Out/", samples[i], "/outs/filtered_feature_bc_matrix/")
  }
  
  sce <- read10xCounts(samples = p, sample.names = samples)
  
  # merge sce cellnames (ordering) with cluster assignemnt
  order <- sce@colData %>% 
    as.data.frame() %>% 
    rownames_to_column("Cell") %>% 
    select(Cell, Sample, Barcode) %>% 
    left_join(cluster.assign)
  
  # add metadata column
  sce@colData$CellType <- order$CellType
  
  # run doublet detection
  sce <- scDblFinder(sce, samples = "Sample", clusters = "CellType")
  
  # add doublet detection results to Sobj
  meta <- sce@colData %>% as_tibble() %>% unite(x, Sample, Barcode) %>% column_to_rownames("x")
  Sobj <<- AddMetaData(Sobj, meta)
  
}
GenerateMetaData    <- function() {
  
  ### assigning Patient_ID/replicate_ID to the Data
  Idents(Sobj) <- "orig.ident"
  Sobj@meta.data$Library <- Idents(Sobj)
  

  Idents(Sobj) <- "orig.ident"
  new.cluster.ids <- str_remove(levels(Sobj), "_2")
  names(new.cluster.ids) <- levels(Sobj)
  Sobj <- RenameIdents(Sobj, new.cluster.ids)
  Sobj@meta.data$Sample <- Idents(Sobj)
  
  
  ### assigning Treatment groups to the Data
  Idents(Sobj) <- "Sample"
  new.cluster.ids <- substring(levels(Sobj), 1, 2) #only keep CF or TM
  names(new.cluster.ids) <- levels(Sobj)
  Sobj <- RenameIdents(Sobj, new.cluster.ids)
  Sobj@meta.data$Type <- Idents(Sobj)
  
  
  ## assigning broader Groups on 0.1 resolution
  Idents(Sobj) <- "CellType"
  new.cluster.ids <- levels(Sobj)
  new.cluster.ids <- recode(new.cluster.ids, 
                            "LUM_HR-pos" = "Epithelial",
                            "LUM_HR-neg" = "Epithelial",
                            "Basal"      = "Epithelial",
                            "Fibroblast" = "Stroma",
                            "Adipocyte"  = "Stroma",
                            "Blood_EC"   = "Vascular",
                            "Lymph_EC"   = "Vascular",
                            "Vasc.Acc."  = "Vascular",
                            "Myeloid"    = "Immune",
                            "Lymphoid"   = "Immune"
  )
  
  names(new.cluster.ids) <- levels(Sobj)
  Sobj <- RenameIdents(Sobj, new.cluster.ids)
  Sobj@meta.data$Group <- Idents(Sobj)
  
  
  LibraryMetrics <- read_excel("utilities/LibraryMetrics.xlsx", skip = 0, sheet = 1) %>% filter(Sample %in% Sobj$orig.ident)
  levels <- LibraryMetrics$Sample
  
  # Batch
  Idents(Sobj) <- "orig.ident"
  newIdent <- LibraryMetrics$Batch
  names(newIdent) <- LibraryMetrics$Sample
  Sobj <- RenameIdents(object = Sobj, newIdent)
  Sobj$Batch <- Idents(Sobj)
  
  # PrepMethod
  Idents(Sobj) <- "orig.ident"
  newIdent <- LibraryMetrics$PrepMethod
  names(newIdent) <- LibraryMetrics$Sample
  Sobj <- RenameIdents(object = Sobj, newIdent)
  Sobj$Prep <- Idents(Sobj)
  
  # cDNA Avg
  Idents(Sobj) <- "orig.ident"
  newIdent <- LibraryMetrics$`Lib avg.`
  names(newIdent) <- LibraryMetrics$Sample
  Sobj <- RenameIdents(object = Sobj, newIdent)
  Sobj$cDNA_Avg <- Idents(Sobj)

  }


setwd("~/Path/To/Working/Directory/")

### Preparation of our file lists that will contain our seurat objects
samples <- list.files("./CellRanger_Out/")    # folder structure should be:  Cellranger_Out/Sample_Name/filtered_feature_bc_matrix

####################################################################################

### read in 10X data and create a merged Seurat object with unique barcodes
CellRanger_to_Sobj()

### build some metadata columns for further downstream processing
GenerateMetaData()


### standard preprocessing for initial clustering which helps with Doublet detection
DefaultAssay (Sobj) <- "RNA"

Sobj <- NormalizeData        (Sobj, verbose = T)
Sobj <- FindVariableFeatures (Sobj, selection.method = "vst", nfeatures = 5000, verbose = T)
Sobj <- ScaleData            (Sobj, vars.to.regress = c("nCount_RNA", "percent.mt"), verbose = T) 
Sobj <- RunPCA               (Sobj, npcs = 50, verbose = T)
Sobj <- RunHarmony           (Sobj, "Batch")
Sobj <- RunUMAP              (Sobj, reduction  = "harmony", dims = 1:50)
Sobj <- FindNeighbors        (Sobj, reduction  = "harmony", dims = 1:50)
Sobj <- FindClusters         (Sobj, resolution = c(0.05, 0.1, 0.2, 0.5))


### choose the resolution that fits the data best and assign cluster identities based on peviously determined markers
res <- "RNA_snn_res.0.1"
AssignClusterIdents()

# confirm that everything looks good
VlnPlot(Sobj,
        features = c("ANKRD30A", "ELF5", "TP63", "VWF", "FLT4", "PLIN1", "COL6A3", "IL7R", "CD163", "RGS6", "CD69"), 
        pt.size = 0, group.by = "CellType")


### Now we can use this assignment to improve doublet detection in the next step
DoubletDetection()

# Quick check
DimPlot(Sobj, group.by = "scDblFinder.class")
VlnPlot(Sobj, group.by = "scDblFinder.class", features = "nCount_RNA", split.by = "orig.ident")


### Standard filtering 
# mito genes
Sobj[["percent.mt"]] <- PercentageFeatureSet(Sobj, pattern = "^MT-")

# VlnPlot of UMI and mito distribution
VlnPlot(Sobj, features = c("nFeature_RNA", "nCount_RNA", "percent.mt"), ncol = 3)
VlnPlot(Sobj, features = c("nCount_RNA"), ncol = 1, group.by = res) + ylim(0, 5000)

# filtering out high mito and high UMI cells as well as doublets as dected by scDblFinder
Sobj <- subset(Sobj, subset = percent.mt < 2.5 & nCount_RNA < 30000 & scDblFinder.class == "singlet")


# adding cell cycle scores
s.genes   <- cc.genes$s.genes
g2m.genes <- cc.genes$g2m.genes
Sobj <- CellCycleScoring(Sobj, s.features = s.genes, g2m.features = g2m.genes, set.ident = TRUE)
rm(s.genes, g2m.genes)


### repeat clusering on final cells after QC filtering and doublet removal
DefaultAssay (Sobj) <- "RNA"

Sobj <- NormalizeData        (Sobj, verbose = T)
Sobj <- FindVariableFeatures (Sobj, selection.method = "vst", nfeatures = 5000, verbose = T)
Sobj <- ScaleData            (Sobj, vars.to.regress = c("nCount_RNA", "percent.mt"), verbose = T) 
Sobj <- RunPCA               (Sobj, npcs = 50, verbose = T)
Sobj <- RunHarmony           (Sobj, "Batch")
Sobj <- RunUMAP              (Sobj, reduction  = "harmony", dims = 1:50)
Sobj <- FindNeighbors        (Sobj, reduction  = "harmony", dims = 1:50)
Sobj <- FindClusters         (Sobj, resolution = c(0.05, 0.1, 0.2, 0.5))


Sobj %>% saveRDS("Sobj_Final_Scaled.rds")