## ---------------------------
##
## Purpose of script: Analyses of genomic datasets from manuscript "AGO1x prevents dsRNA-induced interferon signaling to promote proliferation of breast cancer cells" (doi:YYY)
##
## Author: Dr. Joao C Guimaraes
##
## Date Created: 2020-02-27
##
## Email: joaoguima@gmail.com
##
## Tested with:
##	- R 3.3.3 [macOS 10.15.2 : x86_64-apple-darwin13.4.0 (64-bit)]
##	- DESeq2_1.12.4
##	- biomaRt_2.28.0
##
## ---------------------------

##### Load/Install dependencies #####
if (!require('DESeq2')) install.packages('DESeq2'); library('DESeq2')
if (!require('biomaRt')) install.packages('biomaRt'); library('biomaRt')

##### Create folders to save output #####
# Both output folders live directly under the working directory.
# showWarnings = TRUE: dir.create() warns when a folder cannot be created
# (for instance because it already exists).
invisible(lapply(c("figures", "results"), function(folder) {
	dir.create(file.path(".", folder), showWarnings = TRUE)
}))

##### Auxiliary function #####

# Merge a list of data frames on their shared "gene" column.
#
# Args:
#   listDF: list of data frames; each one must contain a "gene" column.
#
# Returns:
#   One data frame holding the full outer join of all inputs on "gene"
#   (a gene missing from some inputs is still kept), with every NA in the
#   merged table replaced by 0.
mergeDF <- function(listDF){
	# Fold the list from left to right. all = TRUE keeps genes that are absent
	# from either side of a merge. TRUE is spelled out on purpose: `T` is an
	# ordinary variable and can be reassigned, which would silently turn the
	# outer join into an inner join.
	tmpDF <- Reduce(
		function(merged, df) merge(merged, df, by = "gene", all = TRUE),
		listDF
	)
	# Missing values (including the gaps introduced by the outer join) become 0
	tmpDF[is.na(tmpDF)] <- 0
	return(tmpDF)
}

#########
##### Load gene counts tables of mRNA-seq samples from different cell lines
#########

# Read the raw count and TPM tables; the first CSV column supplies the row names
data_counts <- read.csv(
	"./data/rnaseq.hela-cas9-triplicates/txome/hela.rnaseq.raw.counts.csv",
	row.names = 1
)
data_tpm <- read.csv(
	"./data/rnaseq.hela-cas9-triplicates/txome/hela.rnaseq.tpm.csv",
	row.names = 1
)

# Mean TPM over the three replicates of each cell line
data_tpm_avg <- data.frame(row.names = rownames(data_tpm))
data_tpm_avg$hela.gfp <- rowMeans(data_tpm[, c("ago1gfp_r1", "ago1gfp_r2", "ago1gfp_r3")])
data_tpm_avg$hela.w1a <- rowMeans(data_tpm[, c("ago1w1a_r1", "ago1w1a_r2", "ago1w1a_r3")])
data_tpm_avg$hela.w6a <- rowMeans(data_tpm[, c("ago1w6a_r1", "ago1w6a_r2", "ago1w6a_r3")])
# log10 scale; the 0.001 pseudocount keeps zero-TPM rows finite
data_tpm_avg <- log10(data_tpm_avg + 0.001)


## Sample Reproducibility
# PCA of all samples
# One fill colour per cell line, picked by index from R's built-in colors()
col_map <- data.frame(
	"ago1gfp" = paste(colors()[323]),
	"ago1w1a" = paste(colors()[586]),
	"ago1w6a" = paste(colors()[186]),
	stringsAsFactors = FALSE
)
# Split each sample name into the part before "_r<n>" and the replicate number
celltype <- gsub("_r[123]", "", colnames(data_tpm))
replicate <- gsub("[^_]+_r", "", colnames(data_tpm))

# PCA on log10(TPM + 0.001) with samples as rows; no rescaling of the variables
log_data <- log10(data_tpm + 0.001)
fit <- prcomp(t(log_data), scale. = FALSE)
# Fraction of the total variance carried by components 1 and 2 (2 decimals)
pc1_var_e <- round(fit$sdev[1]^2 / sum(fit$sdev^2), 2)
pc2_var_e <- round(fit$sdev[2]^2 / sum(fit$sdev^2), 2)

# Samples in the PC1/PC2 plane: fill colour = cell line, label = replicate number
plot(
	fit$x[, 1:2],
	pch = 21, cex = 3, cex.axis = 1.2, cex.lab = 1.3,
	bg = paste(col_map[celltype]),
	xlab = paste0("Component 1 (", pc1_var_e * 100, "% variance explained)"),
	ylab = paste0("Component 2 (", pc2_var_e * 100, "% variance explained)"),
	main = "mRNA-seq"
)
text(fit$x[, 1:2], labels = replicate, col = "black", cex = 0.8)
legend(
	"bottomleft",
	legend = names(col_map), pt.bg = paste(col_map),
	pch = 21, pt.cex = 1.5, inset = 0.02, bty = "n"
)


#########
##### Differential expression analyses: W1A vs Control (gfp)
#########

## Estimate w1a/gfp fold-changes using DESeq2
# Raw counts of the three control (gfp) and the three W1A replicates
countTable <- data_counts[, c(
	"ago1gfp_r1", "ago1gfp_r2", "ago1gfp_r3",
	"ago1w1a_r1", "ago1w1a_r2", "ago1w1a_r3"
)]
# Condition label of a sample = its column name without the replicate suffix
colData <- data.frame(
	condition = gsub("_r[123]*", "", colnames(countTable)),
	row.names = colnames(countTable)
)
dds <- DESeqDataSetFromMatrix(countData = countTable, colData = colData, design = ~ condition)
dds <- DESeq(dds)
resDDS <- results(dds)
# Left as a bare top-level call so the column metadata is shown when auto-printed
mcols(resDDS, use.names = TRUE)

# Keep only the genes that received an adjusted p-value
res <- data.frame(resDDS)
res <- res[!is.na(res$padj), ]

# Hits: |log2 fold-change| > 1 and adjusted p-value < 0.01
up <- rownames(res[res$padj < 0.01 & res$log2FoldChange > 1, ])
dw <- rownames(res[res$padj < 0.01 & res$log2FoldChange < -1, ])
length(up)
length(dw)

# Comparison-specific copies; `res`, `up` and `dw` are reused by the code below
res_rnaseq_w1a <- res
up_rnaseq_w1a <- up
dw_rnaseq_w1a <- dw

# Save to file, then continue with the table as read back from disk
write.csv(res_rnaseq_w1a, file = "./results/hela.rnaseq.diff.exp.w1a.vs.control.csv")
res_rnaseq_w1a <- read.csv("./results/hela.rnaseq.diff.exp.w1a.vs.control.csv", row.names = 1)

# Scatter plot of gene expression: control vs W1A mean log10 TPM.
# All genes are drawn in grey; the `up` and `dw` gene sets are overlaid in colour.
pdf(
	"./figures/hela.rnaseq.w1a.vs.control.tpm.scatterplot.pdf",
	width = 8, height = 8, paper = "special"
)
plot(
	data_tpm_avg$hela.gfp, data_tpm_avg$hela.w1a,
	xlab = "mRNA expression in HeLa Control (TPM, log10)",
	ylab = "mRNA expression in HeLa W1A (TPM, log10)",
	pch = 20, las = 1, cex.axis = 1.3, cex.lab = 1.3, col = "grey"
)
# Identity line (intercept 0, slope 1)
abline(0, 1, col = "black", lty = 2)
points(data_tpm_avg[up, "hela.gfp"], data_tpm_avg[up, "hela.w1a"], col = colors()[35], pch = 20)
points(data_tpm_avg[dw, "hela.gfp"], data_tpm_avg[dw, "hela.w1a"], col = colors()[131], pch = 20)
dev.off()

# Create table with gene names
# Ask Ensembl BioMart for the gene name and description of every ID in the
# W1A results table (the IDs are used as values of the "refseq_mrna" filter)
mart <- useMart("ENSEMBL_MART_ENSEMBL", dataset = "hsapiens_gene_ensembl", host = "www.ensembl.org")
att <- c("refseq_mrna", "external_gene_name", "description")
genes <- getBM(
	attributes = att,
	filters = "refseq_mrna",
	values = rownames(res_rnaseq_w1a),
	mart = mart,
	uniqueRows = TRUE
)
# Discard annotation rows that carry no RefSeq ID
genes <- genes[genes$refseq_mrna != "", ]

# Join results and annotation on the ID (only IDs present in both are kept)
res_rnaseq_w1a_names <- merge(res_rnaseq_w1a, genes, by.x = "row.names", by.y = "refseq_mrna")
# An ID may come back with several annotation rows: keep the first one
res_rnaseq_w1a_names <- res_rnaseq_w1a_names[!duplicated(res_rnaseq_w1a_names$Row.names), ]
# Move the ID from merge()'s "Row.names" column back into the row names
rownames(res_rnaseq_w1a_names) <- res_rnaseq_w1a_names$Row.names
res_rnaseq_w1a_names$Row.names <- NULL
# Gene name first, followed by the fold-change statistics
res_rnaseq_w1a_names <- res_rnaseq_w1a_names[, c("external_gene_name", "log2FoldChange", "lfcSE", "pvalue", "padj")]

# Expression of ribosomal proteins
# Tab-separated table with a header row; its refseq_mrna column lists the IDs
tab <- read.table("./data/auxiliary/human.rp.genes.all.csv", sep = "\t", header = TRUE)
# Only the listed IDs that are present in the W1A results table
geneList <- intersect(paste(tab$refseq_mrna), rownames(res_rnaseq_w1a))

# log2 fold-changes of all genes next to those of the ribosomal protein subset
pdf(
	"./figures/hela.rnaseq.w1a.vs.control.rps.boxplot.pdf",
	width = 8, height = 8, paper = "special"
)
boxplot(
	res_rnaseq_w1a[, "log2FoldChange"],
	res_rnaseq_w1a[geneList, "log2FoldChange"],
	names = c("All", "Ribosomal proteins"),
	ylab = "mRNA W1A/Control fold-change (log2)",
	cex.lab = 1.3, col = "grey", ylim = c(-3, 3)
)
# Bare top-level call: the test result is only shown through auto-printing
wilcox.test(res_rnaseq_w1a[, "log2FoldChange"], res_rnaseq_w1a[geneList, "log2FoldChange"])
dev.off()


#########
##### Differential expression analyses: W6A vs Control (gfp)
#########

## Estimate w6a/gfp fold-changes using DESeq2
# Raw counts of the three control (gfp) and the three W6A replicates
countTable <- data_counts[, c(
	"ago1gfp_r1", "ago1gfp_r2", "ago1gfp_r3",
	"ago1w6a_r1", "ago1w6a_r2", "ago1w6a_r3"
)]
# Condition label of a sample = its column name without the replicate suffix
colData <- data.frame(
	condition = gsub("_r[123]*", "", colnames(countTable)),
	row.names = colnames(countTable)
)
dds <- DESeqDataSetFromMatrix(countData = countTable, colData = colData, design = ~ condition)
dds <- DESeq(dds)
resDDS <- results(dds)
# Left as a bare top-level call so the column metadata is shown when auto-printed
mcols(resDDS, use.names = TRUE)

# Keep only the genes that received an adjusted p-value
res <- data.frame(resDDS)
res <- res[!is.na(res$padj), ]

# Hits: |log2 fold-change| > 1 and adjusted p-value < 0.01
up <- rownames(res[res$padj < 0.01 & res$log2FoldChange > 1, ])
dw <- rownames(res[res$padj < 0.01 & res$log2FoldChange < -1, ])
length(up)
length(dw)

# Comparison-specific copies; `res`, `up` and `dw` are reused by the code below
res_rnaseq_w6a <- res
up_rnaseq_w6a <- up
dw_rnaseq_w6a <- dw

# Save to file, then continue with the table as read back from disk
write.csv(res_rnaseq_w6a, file = "./results/hela.rnaseq.diff.exp.w6a.vs.control.csv")
res_rnaseq_w6a <- read.csv("./results/hela.rnaseq.diff.exp.w6a.vs.control.csv", row.names = 1)

# Scatter plot of gene expression: control vs W6A mean log10 TPM.
# All genes are drawn in grey; the `up` and `dw` gene sets are overlaid in colour.
pdf(
	"./figures/hela.rnaseq.w6a.vs.control.tpm.scatterplot.pdf",
	width = 8, height = 8, paper = "special"
)
plot(
	data_tpm_avg$hela.gfp, data_tpm_avg$hela.w6a,
	xlab = "mRNA expression in HeLa Control (TPM, log10)",
	ylab = "mRNA expression in HeLa W6A (TPM, log10)",
	pch = 20, las = 1, cex.axis = 1.3, cex.lab = 1.3, col = "grey"
)
# Identity line (intercept 0, slope 1)
abline(0, 1, col = "black", lty = 2)
points(data_tpm_avg[up, "hela.gfp"], data_tpm_avg[up, "hela.w6a"], col = colors()[35], pch = 20)
points(data_tpm_avg[dw, "hela.gfp"], data_tpm_avg[dw, "hela.w6a"], col = colors()[131], pch = 20)
dev.off()

# Create table with gene names
# Ask Ensembl BioMart for the gene name and description of every ID in the
# W6A results table (the IDs are used as values of the "refseq_mrna" filter)
mart <- useMart("ENSEMBL_MART_ENSEMBL", dataset = "hsapiens_gene_ensembl", host = "www.ensembl.org")
att <- c("refseq_mrna", "external_gene_name", "description")
genes <- getBM(
	attributes = att,
	filters = "refseq_mrna",
	values = rownames(res_rnaseq_w6a),
	mart = mart,
	uniqueRows = TRUE
)
# Discard annotation rows that carry no RefSeq ID
genes <- genes[genes$refseq_mrna != "", ]

# Join results and annotation on the ID (only IDs present in both are kept)
res_rnaseq_w6a_names <- merge(res_rnaseq_w6a, genes, by.x = "row.names", by.y = "refseq_mrna")
# An ID may come back with several annotation rows: keep the first one
res_rnaseq_w6a_names <- res_rnaseq_w6a_names[!duplicated(res_rnaseq_w6a_names$Row.names), ]
# Move the ID from merge()'s "Row.names" column back into the row names
rownames(res_rnaseq_w6a_names) <- res_rnaseq_w6a_names$Row.names
res_rnaseq_w6a_names$Row.names <- NULL
# Gene name first, followed by the fold-change statistics
res_rnaseq_w6a_names <- res_rnaseq_w6a_names[, c("external_gene_name", "log2FoldChange", "lfcSE", "pvalue", "padj")]

# Expression of ribosomal proteins
# Tab-separated table with a header row; its refseq_mrna column lists the IDs
tab <- read.table("./data/auxiliary/human.rp.genes.all.csv", sep = "\t", header = TRUE)
# Only the listed IDs that are present in the W6A results table
geneList <- intersect(paste(tab$refseq_mrna), rownames(res_rnaseq_w6a))

# log2 fold-changes of all genes next to those of the ribosomal protein subset
pdf(
	"./figures/hela.rnaseq.w6a.vs.control.rps.boxplot.pdf",
	width = 8, height = 8, paper = "special"
)
boxplot(
	res_rnaseq_w6a[, "log2FoldChange"],
	res_rnaseq_w6a[geneList, "log2FoldChange"],
	names = c("All", "Ribosomal proteins"),
	ylab = "mRNA W6A/Control fold-change (log2)",
	cex.lab = 1.3, col = "grey", ylim = c(-3, 3)
)
# Bare top-level call: the test result is only shown through auto-printing
wilcox.test(res_rnaseq_w6a[, "log2FoldChange"], res_rnaseq_w6a[geneList, "log2FoldChange"])
dev.off()


#########
##### Differential expression analyses: Comparison between mutant cell lines
#########

# Pair the W1A and W6A results by row name; columns shared by both tables get
# a "_w1a" / "_w6a" suffix
res_rnaseq_mutants <- merge(
	res_rnaseq_w1a, res_rnaseq_w6a,
	by = "row.names", suffixes = c("_w1a", "_w6a")
)

# W6A vs W1A log2 fold-changes, with identity line and correlation coefficient
pdf(
	"./figures/hela.rnaseq.w6a.vs.w1a.scatter.plot.pdf",
	width = 8, height = 8, paper = "special"
)
plot(
	res_rnaseq_mutants$log2FoldChange_w1a, res_rnaseq_mutants$log2FoldChange_w6a,
	xlab = "mRNA fold-change in HeLa W1A/Control (log2)",
	ylab = "mRNA fold-change in HeLa W6A/Control (log2)",
	pch = 20, las = 1, cex.axis = 1.3, cex.lab = 1.3,
	col = rgb(0.5, 0.5, 0.5, 0.5)
)
abline(0, 1, col = "black", lty = 2)
# cor.test() with its default method; the estimate is printed on the plot
ct <- cor.test(res_rnaseq_mutants$log2FoldChange_w1a, res_rnaseq_mutants$log2FoldChange_w6a)
text(-4, 4, paste("R =", round(ct$estimate, 2)))
dev.off()

# Same pairing for the tables that carry gene names
res_rnaseq_mutants_names <- merge(
	res_rnaseq_w1a_names, res_rnaseq_w6a_names,
	by = "row.names", suffixes = c("_w1a", "_w6a")
)
# Keep a single gene-name column: drop the W6A copy
res_rnaseq_mutants_names$external_gene_name_w6a <- NULL
# Move the ID from merge()'s "Row.names" column back into the row names
rownames(res_rnaseq_mutants_names) <- res_rnaseq_mutants_names$Row.names
res_rnaseq_mutants_names$Row.names <- NULL
# The first remaining column is the W1A gene-name column
colnames(res_rnaseq_mutants_names)[1] <- "gene_name"

# Save to file
write.csv(res_rnaseq_mutants_names, file = "./results/hela.rnaseq.diff.exp.w6a.vs.w1a.with.gnames.csv")
