# **MODIFY THIS CHUNK**
project_id <- "mizrak_data" # determines the name of the cache folder
doc_id <- "02" # determines name of the subfolder of `figures` where pngs/pdfs are saved
out <- paste0("../processed_data/", doc_id)
figout <- paste0("figures/", doc_id, "/")
cache <- paste0("~/tmp/", project_id, "/", doc_id, "/")
Load expression matrix obtained from GEO, and create a Seurat object in order to run basic analyses (normalization, dimensionality reduction, clustering, derivation of cluster markers, etc).
library(tidyr)
library(dplyr)
library(readr)
library(magrittr)
library(glue)
library(purrr)
library(ggplot2)
library(cytobox)
library(cowplot)
library(Seurat)
source("../../../sjessa/from_hydra/HGG-G34/samples/functions.R")
source("../../../sjessa/from_hydra/misc/sjlib.R")
ggplot2::theme_set(cytobox::theme_min(border_colour = "black", border_size = 0.5))
id <- read.table("../data/GSE109447_13055_cells_id.txt.gz", stringsAsFactors = FALSE)
# It seems that to get the right cell #s, we leave out c("cell")
label <- read.table("../data/GSE109447_Rep2_13055cells_Basic.txt.gz", stringsAsFactors = FALSE) %>%
filter(!(V1 %in% c("+", "Cell", "Cell+", "Mural")))
barcodes <- readxl::read_xlsx("../data/GSE109447_13055_Cell_Barcodes.xlsx", col_names = FALSE)
## New names:
## * `` -> ...1
mat <- data.table::fread("../data/GSE109447_13055_cells_matrix.txt.gz", data.table = FALSE)
mat[1:5, 1:5]
# Separated out the rows containing gene name duplciates
gene_instances <- table(mat[, 2])
duplicates <- names(gene_instances)[gene_instances > 1]
mat_dups <- mat[mat[, 2] %in% duplicates, ]
mat_dups$V1 <- NULL
# Sum counts per duplicated gene
mat_dups_summed <- purrr::map(duplicates, function(gene) {
mat_dups[mat_dups$V2 %in% gene, ] %>%
select(-V2) %>%
colSums()
})
mat_dups_summed <- do.call(rbind, mat_dups_summed)
rownames(mat_dups_summed) <- duplicates
# Prep the matrix of non-duplicated counts in the same format
mat_nodups <- mat[!(mat[, 2] %in% duplicates), ]
mat_nodups$V1 <- NULL
rownames(mat_nodups) <- mat_nodups$V2
mat_nodups$V2 <- NULL
mat_nodups <- as.matrix(mat_nodups)
# Put the two matrices back together
mat_fixed <- rbind(mat_dups_summed, mat_nodups)
# Put it in the same order as the original gene list
mat_fixed <- mat_fixed[unique(mat[, 2]), ]
colnames(mat_fixed) <- paste0(id[[1]], "_", barcodes$...1)
# Create metadata
metadata <- data.frame(ID = id[[1]],
Sample = id %>% separate(V1, into = c("rep", "number"), sep = "\\_") %>% pull(rep),
Cluster = label[[1]])
rownames(metadata) <- paste0(id[[1]], "_", barcodes$...1)
# Palette
palette <- c("FlatRep2" = "#CBC9E2",
"FsepRep2" = "#9E9AC8",
"MsepRep2" = "#BAE4B3",
"MlatRep2" = "#74C476")
seurat <- CreateSeuratObject(raw.data = mat_fixed,
min.cells = 3,
min.genes = 0,
is.expr = 0,
project = "Mizrak_VSVZ_Rep2",
meta.data = metadata)
# Add mitochondrial content to the metadata
mito_genes <- grep("^mt-", rownames(seurat@data), value = TRUE)
percent_mito <- Matrix::colSums(seurat@data[mito_genes, ])/Matrix::colSums(seurat@data) * 100
seurat <- AddMetaData(seurat, percent_mito, "percent_mito")
# Add cell cycle score
seurat <- addCellCycle(seurat, species = "m_musculus")
ngene_lim <- 3500
numi_lim <- 10000
mito_lim <- 10
plot_grid(VlnPlot(seurat, c("nGene"), cols.use = palette,
do.return = TRUE, group.by = "Sample", point.size.use = -1, x.lab.rot = TRUE) + ylim(NA, ngene_lim),
VlnPlot(seurat, c("nUMI"), cols.use = palette,
do.return = TRUE, group.by = "Sample", point.size.use = -1, x.lab.rot = TRUE) + ylim(NA, numi_lim),
VlnPlot(seurat, c("percent_mito"),cols.use = palette,
do.return = TRUE, group.by = "Sample", point.size.use = -1, x.lab.rot = TRUE) + ylim(NA, mito_lim),
ncol = 3)
## Scale for 'y' is already present. Adding another scale for 'y', which
## will replace the existing scale.
## Scale for 'y' is already present. Adding another scale for 'y', which
## will replace the existing scale.
## Scale for 'y' is already present. Adding another scale for 'y', which
## will replace the existing scale.
(process_params <- data.frame(
n_pcs = 10,
resolution = 0.6,
seed = 100
))
seurat <- seurat %>%
NormalizeData(normalization.method = "LogNormalize", scale.factor = 10000) %>%
FindVariableGenes(mean.function = ExpMean, dispersion.function = LogVMR,
x.low.cutoff = 0.0125, x.high.cutoff = 3, y.cutoff = 0.5) %>%
ScaleData(vars.to.regress = c("nUMI", "percent_mito")) %>%
RunPCA(pc.genes = .@var.genes, pcs.compute = 100, do.print = FALSE) %>%
RunTSNE(dims.use = 1:process_params$n_pcs, do.fast = TRUE, seed.use = process_params$seed) %>%
FindClusters(dims.use = 1:process_params$n_pcs, resolution = process_params$resolution, random.seed = process_params$seed)
## Regressing out: nUMI, percent_mito
## Scaling data matrix
# Set the identity to the provided cluster
seurat <- SetAllIdent(seurat, "Cluster")
seurat <- set_qual_pal(seurat)
saveRDS(seurat, glue("{out}-seurat.Rds"))
write_tsv(process_params, glue("{out}-process_params.tsv"))
How many cells per cluster?
table(seurat@ident)
##
## aNSC+TAC+NB Astrocytes COP Endothelial
## 1437 3516 223 472
## Ependymal Fibroblast Microglia Neuron
## 517 557 1613 2089
## Oligodendrocytes OPC T
## 2148 341 142
plot_grid(cytobox::pca(seurat, title = "PCA", legend = TRUE),
tsne(seurat, title = "tSNE", legend = TRUE),
tsne(seurat, title = "nGene", colour_by = "nGene", colours = rdbu, colour_by_type = "continuous", label = FALSE),
tsne(seurat, title = "nUMI", colour_by = "nUMI", colours = rdbu, colour_by_type = "continuous", label = FALSE),
tsne(seurat, title = "percent_mito", colour_by = "percent_mito", colours = rdbu, colour_by_type = "continuous", label = FALSE),
tsne(seurat, title = "G1_S_score", colour_by = "G1_S_score", colours = ylrd, colour_by_type = "continuous", label = FALSE),
tsne(seurat, title = "G2_M_score", colour_by = "G2_M_score", colours = ylrd, colour_by_type = "continuous", label = FALSE))
plot_grid(cytobox::pca(seurat, dim1 = 1, dim2 = 2),
cytobox::pca(seurat, dim1 = 1, dim2 = 3),
cytobox::pca(seurat, dim1 = 1, dim2 = 4), ncol = 3)
VizPCA(seurat, 1:4, nCol = 4, font.size = 0.9)
getVarianceExplained(seurat)
## $percent.var.explained
## [1] 10.522409 8.125522 6.870937 5.983236 5.649987 3.829027 3.518382
## [8] 2.133000 1.890025 1.698974
##
## $cum.var.explained
## [1] 10.52241 18.64793 25.51887 31.50210 37.15209 40.98112 44.49950
## [8] 46.63250 48.52253 50.22150
Call markers based on the provided cluster labels:
seurat_markers <- FindAllMarkers(seurat)
write_tsv(seurat_markers, glue("{out}-cluster_markers.tsv"))
Plot them:
seurat_markers <- readr::read_tsv(glue("{out}-cluster_markers.tsv"))
## Parsed with column specification:
## cols(
## p_val = col_double(),
## avg_logFC = col_double(),
## pct.1 = col_double(),
## pct.2 = col_double(),
## p_val_adj = col_double(),
## cluster = col_character(),
## gene = col_character()
## )
DT::datatable(seurat_markers %>%
group_by(cluster) %>%
top_n(100, avg_logFC), filter = "top")
top15 <- seurat_markers %>% group_by(cluster) %>% top_n(15, avg_logFC)
DoHeatmap(object = seurat, genes.use = top15$gene,
slim.col.label = TRUE, remove.key = TRUE,
col.low = "#2166AC",
col.mid = "#E5E0DC",
col.high = "#B2182B")
rownames(seurat_markers) <- NULL
sessionInfo()
## R version 3.5.0 (2018-04-23)
## Platform: x86_64-redhat-linux-gnu (64-bit)
## Running under: CentOS Linux 7 (Core)
##
## Matrix products: default
## BLAS/LAPACK: /var/chroots/hydrars-centos-7/usr/lib64/R/lib/libRblas.so
##
## locale:
## [1] LC_CTYPE=en_US.UTF-8 LC_NUMERIC=C LC_TIME=C
## [4] LC_COLLATE=C LC_MONETARY=C LC_MESSAGES=C
## [7] LC_PAPER=C LC_NAME=C LC_ADDRESS=C
## [10] LC_TELEPHONE=C LC_MEASUREMENT=C LC_IDENTIFICATION=C
##
## attached base packages:
## [1] stats graphics grDevices utils datasets methods base
##
## other attached packages:
## [1] Seurat_2.3.4 Matrix_1.2-14 cowplot_0.9.4 cytobox_0.6.1 ggplot2_3.1.0
## [6] purrr_0.3.4 glue_1.4.1 magrittr_1.5 readr_1.3.1 dplyr_0.7.7
## [11] tidyr_0.8.2
##
## loaded via a namespace (and not attached):
## [1] Rtsne_0.15 colorspace_1.4-0 ellipsis_0.3.1
## [4] class_7.3-14 modeltools_0.2-22 ggridges_0.5.1
## [7] mclust_5.4.2 htmlTable_1.13.1 base64enc_0.1-3
## [10] rstudioapi_0.11 proxy_0.4-22 npsurv_0.4-0
## [13] bit64_0.9-7 flexmix_2.3-14 mvtnorm_1.0-10
## [16] codetools_0.2-15 splines_3.5.0 R.methodsS3_1.7.1
## [19] lsei_1.2-0 robustbase_0.93-2 knitr_1.21
## [22] jsonlite_1.7.0 Formula_1.2-3 ica_1.0-2
## [25] cluster_2.0.7-1 kernlab_0.9-27 png_0.1-7
## [28] R.oo_1.22.0 httr_1.4.1 compiler_3.5.0
## [31] backports_1.1.3 assertthat_0.2.0 lazyeval_0.2.1
## [34] lars_1.2 acepack_1.4.1 htmltools_0.3.6
## [37] tools_3.5.0 igraph_1.2.2 bindrcpp_0.2.2
## [40] gtable_0.2.0 reshape2_1.4.3 RANN_2.6
## [43] Rcpp_1.0.0 trimcluster_0.1-2.1 vctrs_0.3.1
## [46] gdata_2.18.0 ape_5.2 nlme_3.1-137
## [49] iterators_1.0.10 fpc_2.1-11.1 lmtest_0.9-36
## [52] gbRd_0.4-11 xfun_0.12 stringr_1.3.1
## [55] irlba_2.3.3 lifecycle_0.2.0 gtools_3.8.1
## [58] DEoptimR_1.0-8 zoo_1.8-4 MASS_7.3-49
## [61] scales_1.0.0 hms_0.4.2 doSNOW_1.0.16
## [64] parallel_3.5.0 RColorBrewer_1.1-2 yaml_2.2.0
## [67] reticulate_1.10 pbapply_1.4-0 gridExtra_2.3
## [70] segmented_0.5-3.0 rpart_4.1-13 latticeExtra_0.6-28
## [73] stringi_1.2.4 foreach_1.4.4 randomForest_4.6-14
## [76] checkmate_1.9.1 caTools_1.17.1.1 bibtex_0.4.2
## [79] Rdpack_0.10-1 SDMTools_1.1-221 rlang_0.4.6
## [82] pkgconfig_2.0.2 dtw_1.20-1 prabclus_2.2-7
## [85] bitops_1.0-6 evaluate_0.12 lattice_0.20-35
## [88] ROCR_1.0-7 bindr_0.1.1 htmlwidgets_1.3
## [91] bit_1.1-14 tidyselect_1.1.0 plyr_1.8.4
## [94] R6_2.3.0 snow_0.4-3 gplots_3.0.1.1
## [97] Hmisc_4.2-0 pillar_1.4.4 foreign_0.8-70
## [100] withr_2.1.2 mixtools_1.1.0 fitdistrplus_1.0-14
## [103] survival_2.41-3 nnet_7.3-12 tsne_0.1-3
## [106] tibble_3.0.1 hdf5r_1.0.0 crayon_1.3.4
## [109] KernSmooth_2.23-15 rmarkdown_1.11 viridis_0.5.1
## [112] grid_3.5.0 data.table_1.12.0 metap_1.1
## [115] digest_0.6.25 diptest_0.75-7 R.utils_2.7.0
## [118] stats4_3.5.0 munsell_0.5.0 viridisLite_0.3.0