# **MODIFY THIS CHUNK**

project_id <- "mizrak_data" # determines the name of the cache folder
doc_id     <- "02" # determines name of the subfolder of `figures` where pngs/pdfs are saved
out        <- paste0("../processed_data/", doc_id)
figout     <- paste0("figures/", doc_id, "/")
cache      <- paste0("~/tmp/", project_id, "/", doc_id, "/")

1 Overview

Load expression matrix obtained from GEO, and create a Seurat object in order to run basic analyses (normalization, dimensionality reduction, clustering, derivation of cluster markers, etc).

2 Libraries

library(tidyr)
library(dplyr)
library(readr)
library(magrittr)
library(glue)
library(purrr)
library(ggplot2)
library(cytobox)
library(cowplot)
library(Seurat)

source("../../../sjessa/from_hydra/HGG-G34/samples/functions.R")
source("../../../sjessa/from_hydra/misc/sjlib.R")
ggplot2::theme_set(cytobox::theme_min(border_colour = "black", border_size = 0.5))

3 Read data

id  <- read.table("../data/GSE109447_13055_cells_id.txt.gz", stringsAsFactors = FALSE)

# It seems that to get the right cell #s, we leave out c("cell")
label <- read.table("../data/GSE109447_Rep2_13055cells_Basic.txt.gz", stringsAsFactors = FALSE) %>% 
  filter(!(V1 %in% c("+", "Cell", "Cell+", "Mural")))

barcodes <- readxl::read_xlsx("../data/GSE109447_13055_Cell_Barcodes.xlsx", col_names = FALSE)
## New names:
## * `` -> ...1
mat <- data.table::fread("../data/GSE109447_13055_cells_matrix.txt.gz", data.table = FALSE)

mat[1:5, 1:5]
# Separated out the rows containing gene name duplciates
gene_instances <- table(mat[, 2])
duplicates     <- names(gene_instances)[gene_instances > 1]
mat_dups       <- mat[mat[, 2] %in% duplicates, ]
mat_dups$V1    <- NULL

# Sum counts per duplicated gene
mat_dups_summed <- purrr::map(duplicates, function(gene) {
  
  mat_dups[mat_dups$V2 %in% gene, ] %>% 
    select(-V2) %>% 
    colSums()
  
})

mat_dups_summed <- do.call(rbind, mat_dups_summed)
rownames(mat_dups_summed) <- duplicates

# Prep the matrix of non-duplicated counts in the same format
mat_nodups    <- mat[!(mat[, 2] %in% duplicates), ]
mat_nodups$V1 <- NULL
rownames(mat_nodups) <- mat_nodups$V2
mat_nodups$V2 <- NULL
mat_nodups <- as.matrix(mat_nodups)

# Put the two matrices back together
mat_fixed <- rbind(mat_dups_summed, mat_nodups)

# Put it in the same order as the original gene list
mat_fixed <- mat_fixed[unique(mat[, 2]), ]
colnames(mat_fixed) <- paste0(id[[1]], "_", barcodes$...1)

# Create metadata
metadata <- data.frame(ID = id[[1]],
                       Sample = id %>% separate(V1, into = c("rep", "number"), sep = "\\_") %>% pull(rep),
                       Cluster = label[[1]])
rownames(metadata) <- paste0(id[[1]], "_", barcodes$...1)

# Palette
palette <- c("FlatRep2" = "#CBC9E2",
             "FsepRep2" = "#9E9AC8",
             "MsepRep2" = "#BAE4B3",
             "MlatRep2" = "#74C476")

4 QC

4.1 Create the Seurat object

seurat <- CreateSeuratObject(raw.data = mat_fixed,
                             min.cells = 3,
                             min.genes = 0,
                             is.expr = 0,
                             project = "Mizrak_VSVZ_Rep2",
                             meta.data = metadata)

# Add mitochondrial content to the metadata
mito_genes <- grep("^mt-", rownames(seurat@data), value = TRUE)
percent_mito <- Matrix::colSums(seurat@data[mito_genes, ])/Matrix::colSums(seurat@data) * 100
seurat <- AddMetaData(seurat, percent_mito, "percent_mito")

# Add cell cycle score
seurat <- addCellCycle(seurat, species = "m_musculus")

4.2 QC metrics

ngene_lim <- 3500
numi_lim  <- 10000
mito_lim  <- 10

plot_grid(VlnPlot(seurat, c("nGene"), cols.use = palette,       
                  do.return = TRUE, group.by = "Sample", point.size.use = -1, x.lab.rot = TRUE) + ylim(NA, ngene_lim),
          VlnPlot(seurat, c("nUMI"),  cols.use = palette,           
                  do.return = TRUE, group.by = "Sample", point.size.use = -1, x.lab.rot = TRUE) + ylim(NA, numi_lim),
          VlnPlot(seurat, c("percent_mito"),cols.use = palette,   
                  do.return = TRUE, group.by = "Sample", point.size.use = -1, x.lab.rot = TRUE) + ylim(NA, mito_lim),
          ncol = 3)
## Scale for 'y' is already present. Adding another scale for 'y', which
## will replace the existing scale.
## Scale for 'y' is already present. Adding another scale for 'y', which
## will replace the existing scale.
## Scale for 'y' is already present. Adding another scale for 'y', which
## will replace the existing scale.

5 Dimensionality reduction and clustering

(process_params <- data.frame(
  n_pcs = 10,
  resolution = 0.6,
  seed = 100
))

seurat <- seurat %>%
  NormalizeData(normalization.method = "LogNormalize", scale.factor = 10000) %>%
  FindVariableGenes(mean.function = ExpMean, dispersion.function = LogVMR,
                    x.low.cutoff = 0.0125, x.high.cutoff = 3, y.cutoff = 0.5) %>%
  ScaleData(vars.to.regress = c("nUMI", "percent_mito")) %>%
  RunPCA(pc.genes = .@var.genes, pcs.compute = 100, do.print = FALSE) %>%
  RunTSNE(dims.use = 1:process_params$n_pcs, do.fast = TRUE, seed.use = process_params$seed) %>% 
  FindClusters(dims.use = 1:process_params$n_pcs, resolution = process_params$resolution, random.seed = process_params$seed)
## Regressing out: nUMI, percent_mito
## Scaling data matrix

# Set the identity to the provided cluster
seurat <- SetAllIdent(seurat, "Cluster")

seurat <- set_qual_pal(seurat)

saveRDS(seurat, glue("{out}-seurat.Rds"))
write_tsv(process_params, glue("{out}-process_params.tsv"))

5.1 Visualization and QC

How many cells per cluster?

table(seurat@ident)
## 
##      aNSC+TAC+NB       Astrocytes              COP      Endothelial 
##             1437             3516              223              472 
##        Ependymal       Fibroblast        Microglia           Neuron 
##              517              557             1613             2089 
## Oligodendrocytes              OPC                T 
##             2148              341              142
plot_grid(cytobox::pca(seurat, title = "PCA", legend = TRUE),
          tsne(seurat, title = "tSNE", legend = TRUE),
          tsne(seurat, title = "nGene", colour_by = "nGene", colours = rdbu, colour_by_type = "continuous", label = FALSE),
          tsne(seurat, title = "nUMI", colour_by = "nUMI", colours = rdbu, colour_by_type = "continuous", label = FALSE),
          tsne(seurat, title = "percent_mito", colour_by = "percent_mito", colours = rdbu, colour_by_type = "continuous", label = FALSE),
          tsne(seurat, title = "G1_S_score", colour_by = "G1_S_score", colours = ylrd, colour_by_type = "continuous", label = FALSE),
          tsne(seurat, title = "G2_M_score", colour_by = "G2_M_score", colours = ylrd, colour_by_type = "continuous", label = FALSE))

5.2 PCA

plot_grid(cytobox::pca(seurat, dim1 = 1, dim2 = 2),
          cytobox::pca(seurat, dim1 = 1, dim2 = 3),
          cytobox::pca(seurat, dim1 = 1, dim2 = 4), ncol = 3)

VizPCA(seurat, 1:4, nCol = 4, font.size = 0.9)

getVarianceExplained(seurat)
## $percent.var.explained
##  [1] 10.522409  8.125522  6.870937  5.983236  5.649987  3.829027  3.518382
##  [8]  2.133000  1.890025  1.698974
## 
## $cum.var.explained
##  [1] 10.52241 18.64793 25.51887 31.50210 37.15209 40.98112 44.49950
##  [8] 46.63250 48.52253 50.22150

6 Markers

Call markers based on the provided cluster labels:

seurat_markers <- FindAllMarkers(seurat)

write_tsv(seurat_markers, glue("{out}-cluster_markers.tsv"))

Plot them:

seurat_markers <- readr::read_tsv(glue("{out}-cluster_markers.tsv"))
## Parsed with column specification:
## cols(
##   p_val = col_double(),
##   avg_logFC = col_double(),
##   pct.1 = col_double(),
##   pct.2 = col_double(),
##   p_val_adj = col_double(),
##   cluster = col_character(),
##   gene = col_character()
## )
DT::datatable(seurat_markers %>%
                group_by(cluster) %>%
                top_n(100, avg_logFC), filter = "top")
top15 <- seurat_markers %>% group_by(cluster) %>% top_n(15, avg_logFC)

DoHeatmap(object = seurat, genes.use = top15$gene,
          slim.col.label = TRUE, remove.key = TRUE,
          col.low = "#2166AC",
          col.mid = "#E5E0DC",
          col.high = "#B2182B")

rownames(seurat_markers) <- NULL

7 Session info

sessionInfo()
## R version 3.5.0 (2018-04-23)
## Platform: x86_64-redhat-linux-gnu (64-bit)
## Running under: CentOS Linux 7 (Core)
## 
## Matrix products: default
## BLAS/LAPACK: /var/chroots/hydrars-centos-7/usr/lib64/R/lib/libRblas.so
## 
## locale:
##  [1] LC_CTYPE=en_US.UTF-8 LC_NUMERIC=C         LC_TIME=C           
##  [4] LC_COLLATE=C         LC_MONETARY=C        LC_MESSAGES=C       
##  [7] LC_PAPER=C           LC_NAME=C            LC_ADDRESS=C        
## [10] LC_TELEPHONE=C       LC_MEASUREMENT=C     LC_IDENTIFICATION=C 
## 
## attached base packages:
## [1] stats     graphics  grDevices utils     datasets  methods   base     
## 
## other attached packages:
##  [1] Seurat_2.3.4  Matrix_1.2-14 cowplot_0.9.4 cytobox_0.6.1 ggplot2_3.1.0
##  [6] purrr_0.3.4   glue_1.4.1    magrittr_1.5  readr_1.3.1   dplyr_0.7.7  
## [11] tidyr_0.8.2  
## 
## loaded via a namespace (and not attached):
##   [1] Rtsne_0.15          colorspace_1.4-0    ellipsis_0.3.1     
##   [4] class_7.3-14        modeltools_0.2-22   ggridges_0.5.1     
##   [7] mclust_5.4.2        htmlTable_1.13.1    base64enc_0.1-3    
##  [10] rstudioapi_0.11     proxy_0.4-22        npsurv_0.4-0       
##  [13] bit64_0.9-7         flexmix_2.3-14      mvtnorm_1.0-10     
##  [16] codetools_0.2-15    splines_3.5.0       R.methodsS3_1.7.1  
##  [19] lsei_1.2-0          robustbase_0.93-2   knitr_1.21         
##  [22] jsonlite_1.7.0      Formula_1.2-3       ica_1.0-2          
##  [25] cluster_2.0.7-1     kernlab_0.9-27      png_0.1-7          
##  [28] R.oo_1.22.0         httr_1.4.1          compiler_3.5.0     
##  [31] backports_1.1.3     assertthat_0.2.0    lazyeval_0.2.1     
##  [34] lars_1.2            acepack_1.4.1       htmltools_0.3.6    
##  [37] tools_3.5.0         igraph_1.2.2        bindrcpp_0.2.2     
##  [40] gtable_0.2.0        reshape2_1.4.3      RANN_2.6           
##  [43] Rcpp_1.0.0          trimcluster_0.1-2.1 vctrs_0.3.1        
##  [46] gdata_2.18.0        ape_5.2             nlme_3.1-137       
##  [49] iterators_1.0.10    fpc_2.1-11.1        lmtest_0.9-36      
##  [52] gbRd_0.4-11         xfun_0.12           stringr_1.3.1      
##  [55] irlba_2.3.3         lifecycle_0.2.0     gtools_3.8.1       
##  [58] DEoptimR_1.0-8      zoo_1.8-4           MASS_7.3-49        
##  [61] scales_1.0.0        hms_0.4.2           doSNOW_1.0.16      
##  [64] parallel_3.5.0      RColorBrewer_1.1-2  yaml_2.2.0         
##  [67] reticulate_1.10     pbapply_1.4-0       gridExtra_2.3      
##  [70] segmented_0.5-3.0   rpart_4.1-13        latticeExtra_0.6-28
##  [73] stringi_1.2.4       foreach_1.4.4       randomForest_4.6-14
##  [76] checkmate_1.9.1     caTools_1.17.1.1    bibtex_0.4.2       
##  [79] Rdpack_0.10-1       SDMTools_1.1-221    rlang_0.4.6        
##  [82] pkgconfig_2.0.2     dtw_1.20-1          prabclus_2.2-7     
##  [85] bitops_1.0-6        evaluate_0.12       lattice_0.20-35    
##  [88] ROCR_1.0-7          bindr_0.1.1         htmlwidgets_1.3    
##  [91] bit_1.1-14          tidyselect_1.1.0    plyr_1.8.4         
##  [94] R6_2.3.0            snow_0.4-3          gplots_3.0.1.1     
##  [97] Hmisc_4.2-0         pillar_1.4.4        foreign_0.8-70     
## [100] withr_2.1.2         mixtools_1.1.0      fitdistrplus_1.0-14
## [103] survival_2.41-3     nnet_7.3-12         tsne_0.1-3         
## [106] tibble_3.0.1        hdf5r_1.0.0         crayon_1.3.4       
## [109] KernSmooth_2.23-15  rmarkdown_1.11      viridis_0.5.1      
## [112] grid_3.5.0          data.table_1.12.0   metap_1.1          
## [115] digest_0.6.25       diptest_0.75-7      R.utils_2.7.0      
## [118] stats4_3.5.0        munsell_0.5.0       viridisLite_0.3.0