postscript("hw4plot1a.ps", horizontal=F)
par(pty="s")
hist(resultsA$thetastar-resultsB$thetastar)
#dev.off()
t.test(A,B, var.equal=T)
source("https://edoras.sdsu.edu/~babailey/stat672/bootstrap.r")
ice <- read.table("http://edoras.sdsu.edu/~babailey/stat700/ice.dat", header=T)
ice <- na.omit(ice)
attach(ice)
set.seed(6)
resultsA <- bootstrap(ice$A, 1000, mean)
resultsB <- bootstrap(ice$B, 1000, mean)
quantile(resultsA$thetastar-resultsB$thetastar, prob=c(0.025, 0.975))
#postscript("hw4plot1a.ps", horizontal=F)
#par(pty="s")
hist(resultsA$thetastar-resultsB$thetastar)
#dev.off()
t.test(A,B, var.equal=T)
dev.off()
dev.off()
source("https://edoras.sdsu.edu/~babailey/stat672/bootstrap.r")
ice <- read.table("http://edoras.sdsu.edu/~babailey/stat700/ice.dat", header=T)
ice <- na.omit(ice)
attach(ice)
set.seed(6)
resultsA <- bootstrap(ice$A, 1000, mean)
resultsB <- bootstrap(ice$B, 1000, mean)
quantile(resultsA$thetastar-resultsB$thetastar, prob=c(0.025, 0.975))
#postscript("hw4plot1a.ps", horizontal=F)
#par(pty="s")
hist(resultsA$thetastar-resultsB$thetastar)
#dev.off()
t.test(A,B, var.equal=T)
source("https://edoras.sdsu.edu/~babailey/stat672/bootstrap.r")
ice <- read.table("http://edoras.sdsu.edu/~babailey/stat700/ice.dat", header=T)
ice <- na.omit(ice)
attach(ice)
set.seed(6)
resultsA <- bootstrap(ice$A, 1000, mean)
resultsB <- bootstrap(ice$B, 1000, mean)
quantile(resultsA$thetastar-resultsB$thetastar, prob=c(0.025, 0.975))
#postscript("hw4plot1a.ps", horizontal=F)
#par(pty="s")
hist(resultsA$thetastar-resultsB$thetastar)
#dev.off()
t.test(A,B, var.equal=T)
source("https://edoras.sdsu.edu/~babailey/stat672/bootstrap.r")
ice <- read.table("http://edoras.sdsu.edu/~babailey/stat700/ice.dat", header=T)
ice <- na.omit(ice)
attach(ice)
set.seed(6)
resultsA <- bootstrap(ice$A, 1000, mean)
resultsB <- bootstrap(ice$B, 1000, mean)
quantile(resultsA$thetastar-resultsB$thetastar, prob=c(0.025, 0.975))
#postscript("hw4plot1a.ps", horizontal=F)
#par(pty="s")
hist(resultsA$thetastar-resultsB$thetastar)
#dev.off()
t.test(A,B, var.equal=T)
#!/usr/bin/env Rscript
######### checking dependencies, installing libraries. #######
insPacs = installed.packages()[,"Package"]
if(!("statmod" %in% insPacs)) install.packages("statmod", repos = "http://cran.us.r-project.org")
if(!("ggplot2" %in% insPacs)) install.packages("ggplot2", repos = "http://cran.us.r-project.org")
if(!("gridExtra" %in% insPacs)) install.packages("gridExtra", repos = "http://cran.us.r-project.org")
if(!("preprocessCore" %in% insPacs)) install.packages("preprocessCore", repos = "http://cran.us.r-project.org")
if(!("ggfortify" %in% insPacs)) install.packages("ggfortify", repos = "http://cran.us.r-project.org")
if(!("edgeR" %in% insPacs)){
source("https://bioconductor.org/biocLite.R")
biocLite("edgeR")}
library(preprocessCore)
library(statmod)
library(edgeR)
library(gridExtra)
library(ggfortify)
library(ggplot2)
library(preprocessCore)
library(statmod)
library(edgeR)
library(gridExtra)
library(ggfortify)
library(ggplot2)
inFile = "../demos/d1.Yarui/data.tsv"
fgs=c("cis1","cis2","cis3","cis4","cis5")
bgs=c("ctr1","ctr2")
qnorm = 0
min_cpm = 5
min_cpm_ratio = 0.5
negLabel = "negative"
testLabel = "test"
qcutoffs = c(0, 0.001, 0.005, 0.01, 0.05, 0.1) # pvalue guidelines help to aim at these qcutoffs.
### read data.
dat = read.table(inFile, header = T)
### derived values and more warnings
nCol = dim(dat)[2]
allExps = c(2:(nCol-1))
## check qnorm parameter.
# convert qnorm string input to list
str2lists <- function(inStr){
a = strsplit(inStr,";")[[1]]
b = strsplit(a,",")
return(b)
}
if(grepl(",", qnorm, fixed=TRUE)){
qnormLists = str2lists(qnorm)
qnormFlat = unlist(qnormLists)
if(all(qnormFlat %in% colnames(dat)[allExps]) == FALSE) stop("Specified qnorm experiments not in read count header!")
for (qnormList in qnormLists){
qnormArray = unlist(qnormList)
if(length(qnormArray)<=1) stop("Needs 2 or more specified experiments for each qnorm group!")
}
}
## check number of rep
if(length(fgs)==1 || length(bgs)==1){
cat("#### Warning: ####\n
no biological replicates provided, using pooled fg/bg for dispersion estimation\n
For details, checkout estimateGLMCommonDisp in edgeR.\n\n")
hasRep = 0
}else{
hasRep = 1
}
################## preprocessing #####################
### quantile normalization of reads
qnormFun <- function(df){
mat = data.matrix(df)
df.qnorm = as.data.frame(normalize.quantiles(mat))
rownames(df.qnorm) = rownames(df)
colnames(df.qnorm) = colnames(df)
return(df.qnorm)
}
## qnorm at given named columns, in place.
qnormCols <- function(df, cols){
df[,cols] = qnormFun(df[,cols])
return(df)
}
## run qnorm
if(qnorm=="1"){
cat("qnorm within fgs and within bgs\n")
if(length(fgs)>1) dat = qnormCols(dat, fgs)
if(length(bgs)>1) dat = qnormCols(dat, bgs)
}else if(grepl(",", qnorm, fixed=TRUE)){
cat("qnorm in the specified columns\n")
for(qnormList in str2lists(qnorm)){
qnormArray = unlist(qnormList)
dat = qnormCols(dat, qnormArray)
}
}
### boxplot on all reads. ###
cat("Plotting distribution of sgRNA reads ...\n")
X = data.frame(dat[, allExps])
X_stack = stack(X)
colnames(X_stack) <- c("Read","Exp")
distr1 <- ggplot(X_stack, aes(x=Exp, y=log10(Read+1))) +
geom_violin() +
geom_boxplot(width = 0.1, outlier.shape = NA) +
xlab("Experiments") +
ylab("Reads counts (log10(Read+1))") +
theme_classic()
### PCA on all reads. ###
cat("Running PCA ...\n")
## pca1 # view by sgRNAs
ppca1 <- autoplot(prcomp(X), data=dat, colour = colnames(dat)[dim(dat)[2]],
loadings = T,
loadings.colour = 'blue',
loadings.label = T,
size = 0.5) + theme_classic()
## pca2 # view by experiments
tX <- as.data.frame(t(X))
ppca2 <- autoplot(prcomp(tX),
label = TRUE,
loadings.label.repel=T,
shape = FALSE) + theme_classic()
### keep sgRNA that "expressed in at least one conditions" ### following guideline from edgeR
## the CRISPY cpm filter, scalable and fixes unbalanced issure
row.use.fgs = (rowMeans(data.frame(dat[,c(fgs)] > min_cpm)) >= min_cpm_ratio)
row.use.bgs = (rowMeans(data.frame(dat[,c(bgs)] > min_cpm)) >= min_cpm_ratio)
row.use = (row.use.fgs | row.use.bgs)
dat.filtered = dat[row.use, ]
nsgRNA = dim(dat)[1]
nsgRNA.filtered = dim(dat.filtered)[1]
msg = paste0("min_cpm: ", round(min_cpm,2), "\n",
"min_cpm_ratio: ", round(min_cpm_ratio,2), "\n",
nsgRNA.filtered, "/", nsgRNA," (", round(nsgRNA.filtered/nsgRNA*100,2), "%)")
### cumulative percentile ### only use fg and bg ###
## average
X = data.frame(FGs = rowMeans(data.frame(dat.filtered[,c(fgs)])),
BGs = rowMeans(data.frame(dat.filtered[,c(bgs)])))
X_stack = stack(X)
colnames(X_stack) <- c("Read","Exp")
distr2 <- ggplot(X_stack, aes(x=log10(Read+1), color=Exp)) +
stat_ecdf(geom = "line") +
xlab("Filtered normalized reads counts -- Log10(N+1)") +
ylab("Cumulative frequency") +
annotate("text",x=max(log10(X_stack$Read+1)),y=0,label=msg, hjust=1, vjust = 0) +
theme_classic()
## new way to see distro.
X = data.frame(FGs = rowMeans(log10(data.frame(dat.filtered[,c(fgs)])+1)),
BGs = rowMeans(log10(data.frame(dat.filtered[,c(bgs)])+1)),
Group = dat.filtered[,nCol])
distr3 <- ggplot(X, aes(x=BGs, y=FGs, color=Group)) +
geom_point(alpha = 0.3, size = 0.5) +
geom_point(size = 0.5, data = subset(X, Group!=testLabel)) +
xlab("Filtered normalized reads counts in BGs -- Log10(N+1)") +
ylab("Filtered normalized reads counts in FGs") +
theme_classic()
### negative binomial test ###
cat("Negative binomial test on sgRNAs ...\n")
status = as.factor(dat.filtered[,nCol])
group = c(rep("fg", length(fgs)), rep("bg", length(bgs)))
design = model.matrix(~group)
reads = dat.filtered[,c(fgs,bgs)]
dlist = DGEList(as.matrix(reads))
if(hasRep==1){
d = estimateDisp(calcNormFactors(dlist), design)
fit = glmQLFit(d, design, robust=TRUE)
results = glmQLFTest(fit)
}else{
d = estimateGLMCommonDisp(dlist, method="deviance", robust=TRUE, subset=NULL)
fit <- glmFit(d, design)
results <- glmLRT(fit)
}
tab = cbind(results$table, status)
rownames(tab) = dat.filtered[,1]
tab = tab[order(tab$PValue),]  # sort by pval
df
setwd(dirname(rstudioapi::getActiveDocumentContext()$path))
inFile = "../demos/d1.Yarui/data.tsv"
fgs=c("cis1","cis2","cis3","cis4","cis5")
bgs=c("ctr1","ctr2")
qnorm = 0
min_cpm = 5
min_cpm_ratio = 0.5
dat = read.table(inFile, header = T)
inFile
ls
ls()
setwd(dirname(rstudioapi::getActiveDocumentContext()$path))
setwd(dirname(rstudioapi::getActiveDocumentContext()$path))
inFile = "../demos/d1.Yarui/data.tsv"
fgs=c("cis1","cis2","cis3","cis4","cis5")
bgs=c("ctr1","ctr2")
qnorm = 0
min_cpm = 5
min_cpm_ratio = 0.5
negLabel = "negative"
testLabel = "test"
qcutoffs = c(0, 0.001, 0.005, 0.01, 0.05, 0.1) # pvalue guidelines help to aim at these qcutoffs.
### read data.
dat = read.table(inFile, header = T)
### derived values and more warnings
nCol = dim(dat)[2]
allExps = c(2:(nCol-1))
## check qnorm parameter.
# convert qnorm string input to list
str2lists <- function(inStr){
a = strsplit(inStr,";")[[1]]
b = strsplit(a,",")
return(b)
}
if(grepl(",", qnorm, fixed=TRUE)){
qnormLists = str2lists(qnorm)
qnormFlat = unlist(qnormLists)
if(all(qnormFlat %in% colnames(dat)[allExps]) == FALSE) stop("Specified qnorm experiments not in read count header!")
for (qnormList in qnormLists){
qnormArray = unlist(qnormList)
if(length(qnormArray)<=1) stop("Needs 2 or more specified experiments for each qnorm group!")
}
}
## check number of rep
if(length(fgs)==1 || length(bgs)==1){
cat("#### Warning: ####\n
no biological replicates provided, using pooled fg/bg for dispersion estimation\n
For details, checkout estimateGLMCommonDisp in edgeR.\n\n")
hasRep = 0
}else{
hasRep = 1
}
################## preprocessing #####################
### quantile normalization of reads
qnormFun <- function(df){
mat = data.matrix(df)
df.qnorm = as.data.frame(normalize.quantiles(mat))
rownames(df.qnorm) = rownames(df)
colnames(df.qnorm) = colnames(df)
return(df.qnorm)
}
## qnorm at given named columns, in place.
qnormCols <- function(df, cols){
df[,cols] = qnormFun(df[,cols])
return(df)
}
## run qnorm
if(qnorm=="1"){
cat("qnorm within fgs and within bgs\n")
if(length(fgs)>1) dat = qnormCols(dat, fgs)
if(length(bgs)>1) dat = qnormCols(dat, bgs)
}else if(grepl(",", qnorm, fixed=TRUE)){
cat("qnorm in the specified columns\n")
for(qnormList in str2lists(qnorm)){
qnormArray = unlist(qnormList)
dat = qnormCols(dat, qnormArray)
}
}
### boxplot on all reads. ###
cat("Plotting distribution of sgRNA reads ...\n")
X = data.frame(dat[, allExps])
X_stack = stack(X)
colnames(X_stack) <- c("Read","Exp")
distr1 <- ggplot(X_stack, aes(x=Exp, y=log10(Read+1))) +
geom_violin() +
geom_boxplot(width = 0.1, outlier.shape = NA) +
xlab("Experiments") +
ylab("Reads counts (log10(Read+1))") +
theme_classic()
### PCA on all reads. ###
cat("Running PCA ...\n")
## pca1 # view by sgRNAs
ppca1 <- autoplot(prcomp(X), data=dat, colour = colnames(dat)[dim(dat)[2]],
loadings = T,
loadings.colour = 'blue',
loadings.label = T,
size = 0.5) + theme_classic()
## pca2 # view by experiments
tX <- as.data.frame(t(X))
ppca2 <- autoplot(prcomp(tX),
label = TRUE,
loadings.label.repel=T,
shape = FALSE) + theme_classic()
### keep sgRNA that "expressed in at least one conditions" ### following guideline from edgeR
## the CRISPY cpm filter, scalable and fixes unbalanced issure
row.use.fgs = (rowMeans(data.frame(dat[,c(fgs)] > min_cpm)) >= min_cpm_ratio)
row.use.bgs = (rowMeans(data.frame(dat[,c(bgs)] > min_cpm)) >= min_cpm_ratio)
row.use = (row.use.fgs | row.use.bgs)
dat.filtered = dat[row.use, ]
nsgRNA = dim(dat)[1]
nsgRNA.filtered = dim(dat.filtered)[1]
msg = paste0("min_cpm: ", round(min_cpm,2), "\n",
"min_cpm_ratio: ", round(min_cpm_ratio,2), "\n",
nsgRNA.filtered, "/", nsgRNA," (", round(nsgRNA.filtered/nsgRNA*100,2), "%)")
### cumulative percentile ### only use fg and bg ###
## average
X = data.frame(FGs = rowMeans(data.frame(dat.filtered[,c(fgs)])),
BGs = rowMeans(data.frame(dat.filtered[,c(bgs)])))
X_stack = stack(X)
colnames(X_stack) <- c("Read","Exp")
distr2 <- ggplot(X_stack, aes(x=log10(Read+1), color=Exp)) +
stat_ecdf(geom = "line") +
xlab("Filtered normalized reads counts -- Log10(N+1)") +
ylab("Cumulative frequency") +
annotate("text",x=max(log10(X_stack$Read+1)),y=0,label=msg, hjust=1, vjust = 0) +
theme_classic()
## new way to see distro.
X = data.frame(FGs = rowMeans(log10(data.frame(dat.filtered[,c(fgs)])+1)),
BGs = rowMeans(log10(data.frame(dat.filtered[,c(bgs)])+1)),
Group = dat.filtered[,nCol])
distr3 <- ggplot(X, aes(x=BGs, y=FGs, color=Group)) +
geom_point(alpha = 0.3, size = 0.5) +
geom_point(size = 0.5, data = subset(X, Group!=testLabel)) +
xlab("Filtered normalized reads counts in BGs -- Log10(N+1)") +
ylab("Filtered normalized reads counts in FGs") +
theme_classic()
### negative binomial test ###
cat("Negative binomial test on sgRNAs ...\n")
status = as.factor(dat.filtered[,nCol])
group = c(rep("fg", length(fgs)), rep("bg", length(bgs)))
design = model.matrix(~group)
reads = dat.filtered[,c(fgs,bgs)]
dlist = DGEList(as.matrix(reads))
if(hasRep==1){
d = estimateDisp(calcNormFactors(dlist), design)
fit = glmQLFit(d, design, robust=TRUE)
results = glmQLFTest(fit)
}else{
d = estimateGLMCommonDisp(dlist, method="deviance", robust=TRUE, subset=NULL)
fit <- glmFit(d, design)
results <- glmLRT(fit)
}
tab = cbind(results$table, status)
rownames(tab) = dat.filtered[,1]
tab = tab[order(tab$PValue),]  # sort by pval
## preview and output
cat("preview top results:\n")
topTags(results)
outTsv=paste0(outDir, "/", paste0(prefix, ".sgRNA.tsv"))
write.table(format(tab,digits =4), file=outTsv, quote=FALSE, sep='\t')
## plotting QC
# highlight postive sgRNA with pvalue < cutoff.
if(pvalCut > 0){
tab$status <- as.character(tab$status)
# default. only positive FC
if(direction == 1){
filteredIdx = (tab$PValue < pvalCut & tab$status==testLabel & tab$logFC>0)
posSgrnaMsg = paste0("Enriched sgRNA p < ", pvalCut)
}
# only negative FC
if(direction == -1){
filteredIdx = (tab$PValue < pvalCut & tab$status==testLabel & tab$logFC<0)
posSgrnaMsg = paste0("Depleted sgRNA p < ", pvalCut)
}
tab[filteredIdx, "status"] = posSgrnaMsg
}
# fc.vs.cpm. The MA plot
p1 <- ggplot(tab, aes(x=logCPM,y=logFC, colour = status)) +
geom_point(size = 0.5) +
geom_point(size = 0.5, data = subset(tab, status!=testLabel)) +
theme_classic()
# pval.vs.fc
p2 <- ggplot(tab, aes(x=logFC,y=PValue, colour = status)) +
geom_point(size = 0.5) +
geom_point(size = 0.5, data = subset(tab, status!=testLabel)) +
theme_classic()
## plot pval distribution to guide pvalue cutoff choice.
tab_pvals = tab[,c("PValue","logFC","status")]
pvalDistro <- ggplot(tab_pvals, aes(x=status, y=-log10(PValue))) +
geom_violin() +
geom_boxplot(width = 0.1) +
xlab("sgRNA group") +
ylab("-log10(pval)") +
scale_y_continuous(breaks=seq(0,max(-log10(tab_pvals$PValue)),1)) +
theme(panel.grid.major.y = element_line(colour = "black", linetype = "dashed"),
panel.grid.major.x = element_blank(),
panel.grid.minor.x = element_blank(),
panel.background = element_rect(fill = "white",colour = "black"))
qval2pval <- function(qcutoffs, pvals, status, testLabel, negLabel){
df.pval = data.frame(pvals= pvals,status = status)
df.pval = subset(df.pval, status %in% c(testLabel, negLabel))
df.pval$status = as.factor(as.character(df.pval$status))
df.pval.sorted = df.pval[order(df.pval$pvals),]
N_recNeg = cumsum(df.pval.sorted$status == negLabel)
N_allNeg = sum(status==negLabel)
dfm = data.frame(df.pval.sorted,
FDR = N_recNeg/N_allNeg)
idx = 1:nrow(dfm)
df.q2p = data.frame()
for(qcutoff in qcutoffs){
pcutoff = dfm[max(idx[dfm$FDR <= qcutoff]),]$pval
pcutoff = round(pcutoff,5)
df.q2p = rbind(df.q2p, data.frame(qcutoff = qcutoff,
pcutoff = pcutoff))
}
return(df.q2p)
}
msg = qval2pval(qcutoffs, tab$PValue, tab$status, testLabel, negLabel)
q2pGuideTab = tableGrob(msg, rows = NULL)
grid.arrange(distr1, ppca1, ppca2, distr2, distr3, p1, p2, pvalDistro, q2pGuideTab, nrow = 3)
ggplot(tab_pvals, aes(x=status, y=logFC)) +
geom_violin() +
geom_boxplot(width = 0.1) +
xlab("sgRNA group") +
ylab("log(FC)") +
scale_y_continuous(breaks=seq(0,max(log10(tab_pvals$logFC)),1)) +
theme(panel.grid.major.y = element_line(colour = "black", linetype = "dashed"),
panel.grid.major.x = element_blank(),
panel.grid.minor.x = element_blank(),
panel.background = element_rect(fill = "white",colour = "black"))
log10(tab_pvals$logFC)
max(tab_pvals$logFC)
ggplot(tab_pvals, aes(x=status, y=logFC)) +
geom_violin() +
geom_boxplot(width = 0.1) +
xlab("sgRNA group") +
ylab("log(FC)") +
scale_y_continuous(breaks=seq(0,max(tab_pvals$logFC),1)) +
theme(panel.grid.major.y = element_line(colour = "black", linetype = "dashed"),
panel.grid.major.x = element_blank(),
panel.grid.minor.x = element_blank(),
panel.background = element_rect(fill = "white",colour = "black"))
ggplot(tab_pvals, aes(x=status, y=logFC)) +
geom_violin() +
geom_boxplot(width = 0.1) +
xlab("sgRNA group") +
ylab("log(FC)") +
scale_y_continuous(breaks=seq(min(tab_pvals$logFC),max(tab_pvals$logFC),1)) +
theme(panel.grid.major.y = element_line(colour = "black", linetype = "dashed"),
panel.grid.major.x = element_blank(),
panel.grid.minor.x = element_blank(),
panel.background = element_rect(fill = "white",colour = "black"))
ggplot(tab_pvals, aes(x=status, y=logFC)) +
geom_violin() +
geom_boxplot(width = 0.1) +
xlab("sgRNA group") +
ylab("log(FC)") +
scale_y_continuous(breaks=seq(floor(min(tab_pvals$logFC)),max(tab_pvals$logFC),1)) +
theme(panel.grid.major.y = element_line(colour = "black", linetype = "dashed"),
panel.grid.major.x = element_blank(),
panel.grid.minor.x = element_blank(),
panel.background = element_rect(fill = "white",colour = "black"))
tab
setwd(dirname(rstudioapi::getActiveDocumentContext()$path))
inFile = "../tmp.4"
method = "RRA"
dat = read.table(inFile)
loci = paste(dat[,1], dat[,2], dat[,3], sep="_")
signalLists = dat[,4] ## -log(pval) * sign(log(fg/bg))  ## so the higher is stronger enrichment in fg.
signalLists = lapply(strsplit(as.character(signalLists), ","), unlist)
df <- plyr::ldply(signalLists, rbind)
mat <- as.matrix(df)
mat2 <- apply(mat, 2, as.numeric)
rownames(mat2) = loci
rraResult <- aggregateRanks(rmat = mat2, method = method, full=T)
CountSgRNA = data.frame(rowSums(!is.na(mat2)))
library(RobustRankAggreg)
rraResult <- aggregateRanks(rmat = mat2, method = method, full=T)
CountSgRNA = data.frame(rowSums(!is.na(mat2)))
#results$Count = CountSgRNA
result = merge(rraResult, CountSgRNA, by = 0)[,-1]
colnames(result)=c("Name","Score","Count")
result <- subset(result, Score<1)
result$Score = p.adjust(result$Score, method = "fdr") # change the pval score to FDR; But this reduces resolution of region peaks.
result <- subset(result, Score<1)
rraResult
CountSgRNA
