if (!require("pacman")) install.packages("pacman")
## Loading required package: pacman
library(pacman)
p_load(knitr, dplyr, reshape2, spdep, raster, doParallel, scales, viridis, plot.matrix,
data.table, dynamicTreeCut, RColorBrewer, ggplot2)
Alter this path to the location containing the “Code” and “Data” folders
knitr::opts_knit$set(root.dir = 'E:/SpatialProteomicsCode')
#Define function to read maxquant proteingroups table
readMQ <- function(fp) {
read.csv(fp, sep = "\t", stringsAsFactors = FALSE, header = TRUE, quote = "")
}
#Define function to median centre columns of a dataframe
normalise <- function(protein.data) {
sapply(protein.data, function(y) y - median(as.numeric(y), na.rm = TRUE))
}
#Define function to calculate Moran's Index in parallel
moran.Parallel <- function(protein.data, geneNames, nPixels, cores, multiPlate = FALSE, nsim = 999, nrow = 16, ncol = 24,
impute = FALSE, lag.max = (nrow*ncol)/2, plot = FALSE,
cutoff = 8) {
registerDoParallel(cores = cores)
fill.na <- function(x, i=5) {
if( is.na(x)[i] ) {
return( round(mean(x, na.rm=TRUE),0) )
} else {
#return( round(x[i],0) )
return(x[i])
}
}
a <- foreach(gene = iter(geneNames), .combine = "rbind", .packages = c("raster")) %dopar% {
errorFlag <- FALSE
filteredGenes <- dplyr::filter(protein.data, Gene.names == gene)
vec <- as.numeric(filteredGenes[1,][1:nPixels])
if (all(is.na(vec))) {
return(data.frame(gene = gene, moran = NA, p.value = NA, valid.values = 0))
}
if (multiPlate == TRUE) {
plate1 = matrix(rev(vec[1:96]), nrow = 8, ncol = 12, byrow = T)
plate2 = matrix(rev(vec[97:192]), nrow = 8, ncol = 12, byrow = T)
plate3 = matrix(rev(vec[193:288]), nrow = 8, ncol = 12, byrow = T)
plate4 = matrix(rev(vec[289:384]), nrow = 8, ncol = 12, byrow = T)
m = rbind(cbind(plate1,plate2),cbind(plate3,plate4))
r <- raster::raster(m)
}
else {
r <- raster(matrix(vec, nrow = nrow, ncol = ncol, byrow = T))
}
if (impute == TRUE) {
r <- raster::focal(r, w = matrix(1,3,3), fun = fill.na, pad = TRUE, na.rm = FALSE)
}
if (plot == TRUE) {
if (impute == FALSE) {
plot(as.matrix(r), xlab = '', ylab = '', xaxt = 'n', na.col = 'white',
col = viridis(24), main = paste(gene))
}
if (impute == TRUE) {
plot(as.matrix(r), xlab = '', ylab = '', xaxt = 'n', na.col = 'white',
col = viridis(24), main = paste(gene, "(Imputed)"))
}
}
if (sum(!is.na(raster::as.matrix(r))) < cutoff) {
return(data.frame(gene = gene, moran = NA, p.value = NA, valid.values = sum(!is.na(raster::as.matrix(r)))))
}
w <- spdep::poly2nb(raster::rasterToPolygons(r, na.rm = F))
ww <- spdep::nb2listw(w, style = 'B')
tryCatch({
mi.mc <- spdep::moran.mc(raster::values(r),ww,nsim=nsim,na.action = na.omit, zero.policy = TRUE)
},
error = function(e){
errorFlag <<- TRUE
message("* Caught an error on gene ", gene)
})
if (errorFlag) {
return(data.frame(gene = gene, moran = NA, p.value = NA, valid.values = NA))
}
return(data.frame(gene = gene, moran = mi.mc$statistic, p.value = mi.mc$p.value, valid.values = sum(!is.na(raster::as.matrix(r)))))
}
stopImplicitCluster()
rownames(a) <- NULL
return(a)
}
#Define function to plot protein intensity in sampled grid layout
plotGeneConcat <- function(protein.data, geneName, nPixels = 384, ncol = 24, nrow = 16,
impute = FALSE, colPalette = viridis(24), na.col = "white",
scaled = FALSE, subset.data = FALSE, index = c(1:nPixels), ...) {
filteredGenes <- filter(protein.data, Gene.names == geneName)
vec <- as.numeric(filteredGenes[1,][1:nPixels])
if (all(is.na(vec))) {
next
}
if (subset.data == TRUE) {
vec2 <- c(rep(NaN,384))
vec2[index] <- vec[index]
vec <- vec2
}
if (scaled == TRUE) {
vec <- scales::rescale(vec, to=c(0,1))
vec <- vec * 100
}
plate1 = matrix(rev(vec[1:96]), nrow = 8, ncol = 12, byrow = T)
plate2 = matrix(rev(vec[97:192]), nrow = 8, ncol = 12, byrow = T)
plate3 = matrix(rev(vec[193:288]), nrow = 8, ncol = 12, byrow = T)
plate4 = matrix(rev(vec[289:384]), nrow = 8, ncol = 12, byrow = T)
m = rbind(cbind(plate1,plate2),cbind(plate3,plate4))
r <- raster(m)
if (impute == TRUE) {
r <- focal(r, w = matrix(1,3,3), fun = fill.na, pad = TRUE, na.rm = FALSE)
}
plot(as.matrix(r), xlab = '', ylab = '', xaxt = 'n', na.col = na.col,
#col = colPalette, main = paste(geneName), ...)
col = colPalette, ...)
}
#Define function to plot cluster assignment in sampled grid layout
plotClusters <- function(vec, nPixels, ncol, nrow, multiplate = FALSE, print.matrix = FALSE,
title = NULL, byrow = TRUE,
#col = colorRampPalette(brewer.pal(12, "Set3"))(n.col),
col = colorRampPalette(brewer.pal(9, "Set1"))(n.col), na.col = "white",
...) {
if (all(is.na(vec))) {
next
}
n.col = length(unique(as.numeric(vec)[!is.na(as.numeric(vec))]))
if (multiplate == TRUE) {
plate1 = matrix(rev(vec[1:96]), nrow = 8, ncol = 12, byrow = byrow)
plate2 = matrix(rev(vec[97:192]), nrow = 8, ncol = 12, byrow = byrow)
plate3 = matrix(rev(vec[193:288]), nrow = 8, ncol = 12, byrow = byrow)
plate4 = matrix(rev(vec[289:384]), nrow = 8, ncol = 12, byrow = byrow)
m = rbind(cbind(plate1,plate2),cbind(plate3,plate4))
r <- raster(m)
} else {
r <- raster(matrix(vec, nrow = nrow, ncol = ncol, byrow = byrow))
}
if (print.matrix == TRUE) {
print(as.matrix(r))
}
plot(as.matrix(r), xlab = '', ylab = '', xaxt = 'n', na.col = na.col,
col = col, main = title, ...)
#col = colorRampPalette(c("gray", "black"))(n.col), main = title, ...)
}
mq.output <- readMQ("Data/MaxQuant Result - 833 um.txt")
dim(mq.output)
## [1] 5758 3107
filterColumns <- c("Only.identified.by.site", "Reverse", "Potential.contaminant")
mq.filtered <- filter_at(mq.output, vars(all_of(filterColumns)), all_vars(. != "+"))
dim(mq.filtered)
## [1] 5373 3107
quantCols <- names(mq.filtered[,grepl("LFQ.intensity", names(mq.filtered))])
idCols <- names(mq.filtered[,c("Protein.IDs","Majority.protein.IDs",
"Protein.names","Gene.names")])
data <- mq.filtered[,c(quantCols,idCols)]
data[quantCols] <- log2(data[,quantCols])
data <- data %>% mutate_if(is.numeric, list(~na_if(., -Inf))) # remove infinite values from log-transform
data.melt <- reshape2::melt(data)
## Using Protein.IDs, Majority.protein.IDs, Protein.names, Gene.names as id variables
intensityDistribution <- ggplot(data.melt, aes(x = variable, y = value)) +
geom_boxplot(notch = FALSE, outlier.shape = NA) +
scale_x_discrete(labels = NULL, breaks = NULL) + labs(x = "Sample", y = "Intensity") +
theme_classic()
plot(intensityDistribution)
## Warning: Removed 1127388 rows containing non-finite values (stat_boxplot).
data.norm <- cbind(normalise(data[,quantCols]),data[,idCols])
data.norm.melt <- reshape2::melt(data.norm)
## Using Protein.IDs, Majority.protein.IDs, Protein.names, Gene.names as id variables
intensityDistribution.norm <- ggplot(data.norm.melt, aes(x = variable, y = value)) +
geom_boxplot(notch = FALSE, outlier.shape = NA) +
scale_x_discrete(labels = NULL, breaks = NULL) + labs(x = "Sample", y = "Normalised Intensity") +
theme_classic()
plot(intensityDistribution.norm)
## Warning: Removed 1127388 rows containing non-finite values (stat_boxplot).
Pre-calculated values are included for speed. To recalculate values, comment/uncomment lines as described below Adjust ‘cores’ to the number of logical cores to split the workload over
#Uncomment this block to re-calculate, adjust cores to appropriate value based on computer hardware
# data.moran <- moran.Parallel(data.norm,
# data.norm$Gene.names,
# nPixels = 384,
# multiPlate = TRUE,
# nrow = 16,
# ncol = 24,
# cores = 4
# )
#saveRDS(data.moran, "Data/data-moran-recalculated.rds")
# remove this line if recalculating, otherwise pre-calculated values will load
data.moran <- readRDS("Data/data-moran.rds")
titles <- c("Peripherin", "Glycogen phosphorylase, liver form", "Histone H4",
"Haemoglobin", "Aspartate beta-Hydroxylase", "CD45")
i <- 0
for (gene in c('PRPH','PYGL','HIST1H4A','HBB', 'ASPH', 'PTPRC')) {
i <- i + 1
par(mar = c(2,2,2,6))
res <- plotGeneConcat(data.norm, gene, scaled = TRUE, main = paste(titles[i]), cex = 12,
impute = F, axis.col=NULL, axis.row=NULL,
asp = T, col = viridis(48), na.col = "gray70", fmt.key = "%.0f %%",
key=list(side = 4, las = 1, tick = FALSE, labels = FALSE),
polygon.key = list(border = NA))
}
As some sampled voxels were either empty or contained only haemorrhage, these need to be removed
voxel.type <- c(rep('Tissue',84),rep('Haemorrhage',2),rep('Tissue',10), #plate1
rep('Tissue',52),rep('Haemorrhage',3),rep('Tissue',5),'Empty',rep('Tissue',2),rep('Haemorrhage',5),rep('Tissue',4),'Empty','Tissue',rep('Haemorrhage',9),'Tissue','Empty',rep('Haemorrhage',11), #plate2
rep(c(rep('Tissue',11),'Empty'),7),rep('Tissue',12), #plate3
'Empty',rep('Tissue',11),'Empty',rep('Tissue',11),rep('Tissue',72))#plate4
plate1 = matrix(rev(voxel.type[1:96]), nrow = 8, ncol = 12, byrow = T)
plate2 = matrix(rev(voxel.type[97:192]), nrow = 8, ncol = 12, byrow = T)
plate3 = matrix(rev(voxel.type[193:288]), nrow = 8, ncol = 12, byrow = T)
plate4 = matrix(rev(voxel.type[289:384]), nrow = 8, ncol = 12, byrow = T)
voxel.type.m = rbind(cbind(plate1,plate2),cbind(plate3,plate4))
par(mar = c(2,0,2,6))
plot(voxel.type.m, axis.col=NULL, axis.row=NULL, xlab='', ylab='', main = '', asp = T,
col = c("#FFFFCC","#A1DAB4","#225EA8"), breaks = c("Tissue","Haemorrhage","Empty"))
rm(plate1,plate2,plate3,plate4)
tissue <- names(data.norm[,seq(1:384)[voxel.type == 'Tissue']])
tissue.index <- seq(1:384)[voxel.type == 'Tissue']
tissue.empty <- names(data.norm[,seq(1:384)[voxel.type == 'Empty']])
tissue.empty.index <- seq(1:384)[voxel.type == 'Empty']
tissue.haemorrhage <- names(data.norm[,seq(1:384)[voxel.type == 'Haemorrhage']])
tissue.haemorrhage.index <- seq(1:384)[voxel.type == 'Haemorrhage']
data.tissue <- data[, voxel.type == "Tissue"]
data.norm.tissue <- data.norm[, voxel.type == "Tissue"]
Distance matrix and hierarchical clustering
data.tissue.t <- transpose(data.tissue)
rownames(data.tissue.t) <- colnames(data.tissue)
colnames(data.tissue.t) <- data.tissue.t["Gene.names", ]
dist.tissue <- dist(data.tissue.t[1:342,])
hclust.tissue <- hclust(dist.tissue, method = "average")
data.norm.tissue.t <- transpose(data.norm.tissue)
rownames(data.norm.tissue.t) <- colnames(data.norm.tissue)
colnames(data.norm.tissue.t) <- data.norm.tissue.t["Gene.names", ]
dist.tissue.norm <- dist(data.norm.tissue.t[1:342,])
hclust.tissue.norm <- hclust(dist.tissue, method = "average")
Assign clusters to dendrogram and plot clusters onto grid sampled
dynamicClusters <- cutreeDynamic(hclust.tissue.norm, distM = as.matrix(dist.tissue.norm),
minClusterSize = 1,
method = "hybrid",
cutHeight = 100)
## ..done.
temp <- c(rep(NaN,384))
temp[tissue.index] <- dynamicClusters
par(mar = c(2,2,2,2))
plotClusters(temp,
title = "",
axis.col=NULL, axis.row=NULL,
na.col = "gray70", na.print = FALSE,
nrow = data.cluster.nrow[7],
ncol = data.cluster.ncol[7],
multiplate = TRUE,
fmt.cell='%.0f', cex = 1.25,
key = NULL,
asp = T
)
#reinsert dropped voxels
clusters <- data.frame(sample = tissue.index, cluster = dynamicClusters)
empty <- c(1:384)[!c(1:384) %in% clusters$sample]
df.empty <- data.frame(sample = empty, cluster = "Empty")
clusters <- rbind(clusters, df.empty)
clusters <- clusters[order(clusters$sample),]