## ---------------------------
##
## Purpose of script: Analyses of genomic datasets from manuscript "AGO1x prevents dsRNA-induced interferon signaling to promote proliferation of breast cancer cells" (doi:YYY)
##
## Author: Dr. Joao C Guimaraes
##
## Date Created: 2020-02-27
##
## Email: joaoguima@gmail.com
##
## Tested with:
##	- R 3.3.3 [macOS 10.15.2 : x86_64-apple-darwin13.4.0 (64-bit)]
##	- DESeq2_1.12.4
##	- biomaRt_2.28.0
##
## ---------------------------

##### Load/Install dependencies #####
# DESeq2 and biomaRt are distributed through Bioconductor rather than CRAN,
# so a plain install.packages() call cannot install them; missing Bioconductor
# packages are therefore installed through BiocManager. requireNamespace()
# is used for the "is it installed?" check so nothing is attached twice.
if (!requireNamespace("DESeq2", quietly = TRUE)) {
	if (!requireNamespace("BiocManager", quietly = TRUE)) install.packages("BiocManager")
	BiocManager::install("DESeq2", update = FALSE)
}
if (!requireNamespace("biomaRt", quietly = TRUE)) {
	if (!requireNamespace("BiocManager", quietly = TRUE)) install.packages("BiocManager")
	BiocManager::install("biomaRt", update = FALSE)
}
# VennDiagram is a CRAN package
if (!requireNamespace("VennDiagram", quietly = TRUE)) install.packages("VennDiagram")

# library() stops with an error if a package is still unavailable
library("DESeq2")
library("biomaRt")
library("VennDiagram")

##### Create folders to save output #####
# Output folders for plots ("figures") and tables ("results"), created
# relative to the working directory; showWarnings = TRUE keeps the warning
# that dir.create() raises when a folder cannot be created (e.g. it exists).
invisible(lapply(c("figures", "results"), function(out_dir) {
	dir.create(file.path(".", out_dir), showWarnings = TRUE)
}))

##### Auxiliary function #####

# Merge multiple DF
# Outer-joins all data frames of a list on their shared "gene" column and
# replaces every NA cell of the merged table (e.g. the gaps created by the
# outer join) with 0.
#
# Args:
#   listDF: list of data frames, each of which has a column named "gene".
#
# Returns:
#   A single data frame containing the union of all "gene" values, with the
#   columns of every input data frame and NA cells set to 0.
mergeDF <- function(listDF){
	tmpDF = NULL
	for(df in listDF){
		if(is.null(tmpDF)){
			# first data frame seeds the result
			tmpDF = df
		} else {
			# all=TRUE requests a full outer join. TRUE is spelled out because
			# `T` is an ordinary variable that can be reassigned elsewhere,
			# which would silently turn this into an inner join.
			tmpDF = merge(tmpDF,df,by="gene",all=TRUE)
		}
	}
	tmpDF[is.na(tmpDF)]=0
	return(tmpDF)
}

#########
##### Load gene counts tables of mRNA-seq samples from different cell lines
#########

# Read the mRNA-seq raw count and TPM tables; the first CSV column supplies
# the row names
data_counts <- read.csv(
	"./data/rnaseq.mdb231-cas9-triplicates/txome/mda.mb.231.rnaseq.raw.counts.csv",
	row.names = 1
)
data_tpm <- read.csv(
	"./data/rnaseq.mdb231-cas9-triplicates/txome/mda.mb.231.rnaseq.tpm.csv",
	row.names = 1
)


# TPMs (average over the three replicate columns of each cell line),
# then log10-transformed after adding a pseudocount of 0.001
data_tpm_avg <- data.frame(row.names = rownames(data_tpm))
data_tpm_avg$mdb231.gfp <- rowMeans(data_tpm[, paste0("ago1gfp_r", 1:3)])
data_tpm_avg$mdb231.w1a <- rowMeans(data_tpm[, paste0("ago1w1a_r", 1:3)])
data_tpm_avg$mdb231.w6a <- rowMeans(data_tpm[, paste0("ago1w6a_r", 1:3)])
data_tpm_avg <- log10(data_tpm_avg + 0.001)

## Sample Reproducibility
# PCA of all samples
# Fill colour of each cell line, stored as a one-row data frame whose column
# names are the cell-line labels
col_map <- data.frame(
	"ago1gfp" = paste(colors()[323]),
	"ago1w1a" = paste(colors()[586]),
	"ago1w6a" = paste(colors()[186]),
	stringsAsFactors = FALSE
)
# Split the sample column names into the part before "_r<n>" (cell type)
# and the part after "_r" (replicate label)
celltype <- c(gsub("_r[123]", "", colnames(data_tpm)))
replicate <- c(gsub("[^_]+_r", "", colnames(data_tpm)))

# PCA analysis on log10(TPM + 0.001), with samples as rows
log_data <- log10(data_tpm + 0.001)
fit <- prcomp(t(log_data), scale = FALSE)
# Share of the total variance carried by components 1 and 2 (2 decimals)
pc1_var_e <- round((fit$sdev^2 / sum(fit$sdev^2))[1], 2)
pc2_var_e <- round((fit$sdev^2 / sum(fit$sdev^2))[2], 2)

# PCA plot: points filled by cell type, labelled with the replicate
plot(
	fit$x[, 1:2],
	pch = 21,
	xlab = paste0("Component 1 (", pc1_var_e * 100, "% variance explained)"),
	ylab = paste0("Component 2 (", pc2_var_e * 100, "% variance explained)"),
	cex.axis = 1.2, cex.lab = 1.3, cex = 3,
	bg = paste(col_map[celltype]),
	main = "mRNA-seq"
)
text(fit$x[, 1:2], replicate, col = "black", cex = 0.8)
legend(
	"bottomleft",
	legend = names(col_map), pt.bg = paste(col_map),
	inset = 0.02, pch = 21, pt.cex = 1.5, bty = "n"
)


#########
##### Differential expression analyses: W1A vs Control (gfp)
#########

## Estimate w1a/control fold-changes using DESeq2
# Raw counts of the control (gfp) and W1A triplicates
countTable <- data_counts[, c(paste0("ago1gfp_r", 1:3), paste0("ago1w1a_r", 1:3))]
# The condition label is the column name without its "_r<n>" suffix. The
# factor levels are given explicitly, control first, so that the reference
# level (and therefore the sign of log2FoldChange, W1A over control) does not
# depend on the alphabetical order of the labels.
colData <- data.frame(
	row.names = colnames(countTable),
	condition = factor(
		gsub("_r[123]*", "", colnames(countTable)),
		levels = c("ago1gfp", "ago1w1a")
	)
)
dds <- DESeqDataSetFromMatrix(countTable, colData, formula(~ condition))
dds <- DESeq(dds)
resDDS <- results(dds)
mcols(resDDS, use.names = TRUE)

# Keep only the genes for which an adjusted p-value is available
res <- data.frame(resDDS)
res <- res[!is.na(res$padj), ]

# Up-/down-regulated genes: |log2 fold-change| > 1 and adjusted p < 0.01
up <- rownames(res[res$log2FoldChange > 1 & res$padj < 0.01, ])
dw <- rownames(res[res$log2FoldChange < -1 & res$padj < 0.01, ])
length(up)
length(dw)

res_rnaseq_w1a <- res
up_rnaseq_w1a <- up
dw_rnaseq_w1a <- dw

# Save to file, then read the table back from the same file
write.csv(res_rnaseq_w1a, "./results/mda.mb.231.rnaseq.diff.exp.w1a.vs.control.csv")
res_rnaseq_w1a <- read.csv("./results/mda.mb.231.rnaseq.diff.exp.w1a.vs.control.csv", row.names = 1)

# Scatter plot of gene expression: average log10 TPM in control (x) against
# W1A (y); genes currently listed in `up` / `dw` are overplotted in colour
pdf(
	"./figures/mda.mb.231.rnaseq.w1a.vs.control.tpm.scatterplot.pdf",
	width = 8, height = 8, paper = "special"
)
plot(
	data_tpm_avg$mdb231.gfp, data_tpm_avg$mdb231.w1a,
	pch = 20, las = 1, cex.axis = 1.3, cex.lab = 1.3,
	xlab = "mRNA expression in MDA-MB-231 Control (TPM, log10)",
	ylab = "mRNA expression in MDA-MB-231 W1A (TPM, log10)",
	col = "grey"
)
# dashed identity line
abline(0, 1, col = "black", lty = 2)
points(data_tpm_avg[up, "mdb231.gfp"], data_tpm_avg[up, "mdb231.w1a"], col = colors()[35], pch = 20)
points(data_tpm_avg[dw, "mdb231.gfp"], data_tpm_avg[dw, "mdb231.w1a"], col = colors()[131], pch = 20)
dev.off()

# Create table with gene names: query Ensembl BioMart for the gene name and
# description of every RefSeq mRNA id present in the W1A result table
mart <- useMart("ENSEMBL_MART_ENSEMBL", dataset = "hsapiens_gene_ensembl", host = "www.ensembl.org")
att <- c("refseq_mrna", "external_gene_name", "description")
genes <- getBM(att, "refseq_mrna", rownames(res_rnaseq_w1a), mart, uniqueRows = TRUE)
# drop annotation rows without a RefSeq id
genes <- genes[genes$refseq_mrna != "", ]

# Attach the annotation to the results (inner join on the RefSeq id), keep the
# first row per transcript and restore the transcript ids as row names
res_rnaseq_w1a_names <- merge(res_rnaseq_w1a, genes, by.x = "row.names", by.y = "refseq_mrna")
res_rnaseq_w1a_names <- res_rnaseq_w1a_names[!duplicated(res_rnaseq_w1a_names$Row.names), ]
rownames(res_rnaseq_w1a_names) <- res_rnaseq_w1a_names$Row.names
res_rnaseq_w1a_names$Row.names <- NULL
res_rnaseq_w1a_names <- res_rnaseq_w1a_names[, c("external_gene_name", "log2FoldChange", "lfcSE", "pvalue", "padj")]

# Expression of ribosomal proteins: compare the W1A/control fold-changes of
# the listed ribosomal protein transcripts with those of all transcripts
tab <- read.table("./data/auxiliary/human.rp.genes.all.csv", sep = "\t", header = TRUE)
# ribosomal protein transcripts that are present in the result table
geneList <- intersect(paste(tab$refseq_mrna), rownames(res_rnaseq_w1a))

pdf(
	"./figures/mda.mb.231.rnaseq.w1a.vs.control.rps.boxplot.pdf",
	width = 8, height = 8, paper = "special"
)
boxplot(
	res_rnaseq_w1a[, "log2FoldChange"],
	res_rnaseq_w1a[geneList, "log2FoldChange"],
	names = c("All", "Ribosomal proteins"),
	ylab = "mRNA W1A/Control fold-change (log2)",
	cex.lab = 1.3, col = "grey", ylim = c(-3, 3)
)
wilcox.test(
	res_rnaseq_w1a[, "log2FoldChange"],
	res_rnaseq_w1a[geneList, "log2FoldChange"]
)
dev.off()


#########
##### Differential expression analyses: W6A vs Control (gfp)
#########

## Estimate w6a/control fold-changes using DESeq2
# Raw counts of the control (gfp) and W6A triplicates
countTable <- data_counts[, c(paste0("ago1gfp_r", 1:3), paste0("ago1w6a_r", 1:3))]
# The condition label is the column name without its "_r<n>" suffix. The
# factor levels are given explicitly, control first, so that the reference
# level (and therefore the sign of log2FoldChange, W6A over control) does not
# depend on the alphabetical order of the labels.
colData <- data.frame(
	row.names = colnames(countTable),
	condition = factor(
		gsub("_r[123]*", "", colnames(countTable)),
		levels = c("ago1gfp", "ago1w6a")
	)
)
dds <- DESeqDataSetFromMatrix(countTable, colData, formula(~ condition))
dds <- DESeq(dds)
resDDS <- results(dds)
mcols(resDDS, use.names = TRUE)

# Keep only the genes for which an adjusted p-value is available
res <- data.frame(resDDS)
res <- res[!is.na(res$padj), ]

# Up-/down-regulated genes: |log2 fold-change| > 1 and adjusted p < 0.01
up <- rownames(res[res$log2FoldChange > 1 & res$padj < 0.01, ])
dw <- rownames(res[res$log2FoldChange < -1 & res$padj < 0.01, ])
length(up)
length(dw)

res_rnaseq_w6a <- res
up_rnaseq_w6a <- up
dw_rnaseq_w6a <- dw

# Save to file, then read the table back from the same file
write.csv(res_rnaseq_w6a, "./results/mda.mb.231.rnaseq.diff.exp.w6a.vs.control.csv")
res_rnaseq_w6a <- read.csv("./results/mda.mb.231.rnaseq.diff.exp.w6a.vs.control.csv", row.names = 1)

# Scatter plot of gene expression: average log10 TPM in control (x) against
# W6A (y); genes currently listed in `up` / `dw` are overplotted in colour
pdf(
	"./figures/mda.mb.231.rnaseq.w6a.vs.control.tpm.scatterplot.pdf",
	width = 8, height = 8, paper = "special"
)
plot(
	data_tpm_avg$mdb231.gfp, data_tpm_avg$mdb231.w6a,
	pch = 20, las = 1, cex.axis = 1.3, cex.lab = 1.3,
	xlab = "mRNA expression in MDA-MB-231 Control (TPM, log10)",
	ylab = "mRNA expression in MDA-MB-231 W6A (TPM, log10)",
	col = "grey"
)
# dashed identity line
abline(0, 1, col = "black", lty = 2)
points(data_tpm_avg[up, "mdb231.gfp"], data_tpm_avg[up, "mdb231.w6a"], col = colors()[35], pch = 20)
points(data_tpm_avg[dw, "mdb231.gfp"], data_tpm_avg[dw, "mdb231.w6a"], col = colors()[131], pch = 20)
dev.off()

# Create table with gene names: query Ensembl BioMart for the gene name and
# description of every RefSeq mRNA id present in the W6A result table
mart <- useMart("ENSEMBL_MART_ENSEMBL", dataset = "hsapiens_gene_ensembl", host = "www.ensembl.org")
att <- c("refseq_mrna", "external_gene_name", "description")
genes <- getBM(att, "refseq_mrna", rownames(res_rnaseq_w6a), mart, uniqueRows = TRUE)
# drop annotation rows without a RefSeq id
genes <- genes[genes$refseq_mrna != "", ]

# Attach the annotation to the results (inner join on the RefSeq id), keep the
# first row per transcript and restore the transcript ids as row names
res_rnaseq_w6a_names <- merge(res_rnaseq_w6a, genes, by.x = "row.names", by.y = "refseq_mrna")
res_rnaseq_w6a_names <- res_rnaseq_w6a_names[!duplicated(res_rnaseq_w6a_names$Row.names), ]
rownames(res_rnaseq_w6a_names) <- res_rnaseq_w6a_names$Row.names
res_rnaseq_w6a_names$Row.names <- NULL
res_rnaseq_w6a_names <- res_rnaseq_w6a_names[, c("external_gene_name", "log2FoldChange", "lfcSE", "pvalue", "padj")]

# Expression of ribosomal proteins: compare the W6A/control fold-changes of
# the listed ribosomal protein transcripts with those of all transcripts
tab <- read.table("./data/auxiliary/human.rp.genes.all.csv", sep = "\t", header = TRUE)
# ribosomal protein transcripts that are present in the result table
geneList <- intersect(paste(tab$refseq_mrna), rownames(res_rnaseq_w6a))

pdf(
	"./figures/mda.mb.231.rnaseq.w6a.vs.control.rps.boxplot.pdf",
	width = 8, height = 8, paper = "special"
)
boxplot(
	res_rnaseq_w6a[, "log2FoldChange"],
	res_rnaseq_w6a[geneList, "log2FoldChange"],
	names = c("All", "Ribosomal proteins"),
	ylab = "mRNA W6A/Control fold-change (log2)",
	cex.lab = 1.3, col = "grey", ylim = c(-3, 3)
)
wilcox.test(
	res_rnaseq_w6a[, "log2FoldChange"],
	res_rnaseq_w6a[geneList, "log2FoldChange"]
)
dev.off()


#########
##### Differential expression analyses: Comparison between mutant cell lines
#########

# Join the W1A and W6A result tables on their row names; columns that occur
# in both tables get the "_w1a" / "_w6a" suffix
res_rnaseq_mutants <- merge(
	res_rnaseq_w1a, res_rnaseq_w6a,
	by = "row.names", suffixes = c("_w1a", "_w6a")
)

# W1A (x) against W6A (y) log2 fold-changes, with the identity line and the
# correlation estimate returned by cor.test()
pdf(
	"./figures/mda.mb.231.rnaseq.w6a.vs.w1a.scatter.plot.pdf",
	width = 8, height = 8, paper = "special"
)
plot(
	res_rnaseq_mutants$log2FoldChange_w1a, res_rnaseq_mutants$log2FoldChange_w6a,
	pch = 20, las = 1, cex.axis = 1.3, cex.lab = 1.3,
	xlab = "mRNA fold-change in MDA-MB-231 W1A/Control (log2)",
	ylab = "mRNA fold-change in MDA-MB-231 W6A/Control (log2)",
	col = rgb(0.5, 0.5, 0.5, 0.5)
)
abline(0, 1, col = "black", lty = 2)
ct <- cor.test(res_rnaseq_mutants$log2FoldChange_w1a, res_rnaseq_mutants$log2FoldChange_w6a)
# NOTE(review): no xlim/ylim is set for this plot, so the label position
# (-8, 8) is only visible if the data range covers it - confirm
text(-8, 8, paste("R =", round(ct$estimate, 2)))
dev.off()

# Same join for the tables that carry gene names; the W6A copy of the gene
# name is dropped and the remaining first column is renamed "gene_name"
res_rnaseq_mutants_names <- merge(
	res_rnaseq_w1a_names, res_rnaseq_w6a_names,
	by = "row.names", suffixes = c("_w1a", "_w6a")
)
res_rnaseq_mutants_names$external_gene_name_w6a <- NULL
rownames(res_rnaseq_mutants_names) <- res_rnaseq_mutants_names$Row.names
res_rnaseq_mutants_names$Row.names <- NULL
colnames(res_rnaseq_mutants_names)[1] <- "gene_name"

# Save to file
write.csv(res_rnaseq_mutants_names, "./results/mda.mb.231.rnaseq.diff.exp.w6a.and.w1a.with.gnames.csv")


#########
##### Analysis of repetitive elements expression
#########

# Read repetitive element counts
data_rep <- read.csv(
	"./data/rnaseq.mdb231-cas9-triplicates/repetitive.elements/mda.mb.231.rnaseq.rep.raw.counts.csv",
	row.names = 1
)

# Stack the transcriptome counts and the repetitive element counts
data_all <- rbind(data_counts, data_rep)

# Get annotation files for human genes and repetitive elements; only the
# first four columns of the gene annotation are kept so both tables line up
hg19_an <- read.table(
	"./data/auxiliary/human.mrnas.an",
	col.names = c("gene", "cds_st", "cds_end", "length", "uorf_st", "u_orf_end"),
	sep = "\t"
)[, 1:4]
rep_an <- read.table(
	"./data/auxiliary/human.rep.an",
	col.names = c("gene", "cds_st", "cds_end", "length"),
	sep = "\t"
)
all_an <- rbind(hg19_an, rep_an)
# Attach the annotation to the counts, then keep only the count columns and
# the "length" column, with the gene ids as row names
data_all_an <- merge(data_all, all_an, by.x = "row.names", by.y = "gene")
rownames(data_all_an) <- data_all_an$Row.names
data_all_an$cds_st <- NULL
data_all_an$cds_end <- NULL
data_all_an$Row.names <- NULL

# Convert counts to TPM: divide each row by its length in kb, then divide
# each column by its column sum / 1e6. Columns 1-9 are used as count columns.
rpk_mat <- sweep(data_all_an[, 1:9], 1, data_all_an$length / 1000, "/")
scaling_factor <- colSums(rpk_mat) / 1e6
# note: this replaces the TPM table that was loaded from file earlier
data_tpm <- sweep(rpk_mat, 2, scaling_factor, "/")

# Assign RNA class to different rows by row-name prefix: "NM" -> mrna,
# "RN" -> rrna, everything else -> repetitive
data_tpm$class <- NA
data_tpm[grep("^NM", rownames(data_tpm)), "class"] <- "mrna"
data_tpm[grep("^RN", rownames(data_tpm)), "class"] <- "rrna"
data_tpm$class[is.na(data_tpm$class)] <- "repetitive"

# Sum TPM per class, transpose to one row per sample and derive the genotype
# from the sample name (text before the first "_")
data_agg_class <- aggregate(. ~ class, data_tpm, sum)
rownames(data_agg_class) <- data_agg_class$class
data_agg_class$class <- NULL
data_agg_class <- data.frame(t(data_agg_class))
data_agg_class$genotype <- gsub("_.*", "", rownames(data_agg_class))
# Mean per genotype, shown as grouped bars on a log-scaled y axis
mean_tpm <- aggregate(. ~ genotype, data_agg_class, mean)
rownames(mean_tpm) <- mean_tpm$genotype
mean_tpm$genotype <- NULL
barplot(t(mean_tpm), beside = TRUE, log = "y")


#########
##### Load gene counts tables of dsRNA-seq samples from different cell lines
#########

# Read the dsRNA-seq raw count and TPM tables; the first CSV column supplies
# the row names. Both assignments replace the mRNA-seq tables of the same name.
data_counts <- read.csv(
	"./data/dsrnaseq.mdb231-cas9-triplicates/txome/mda.mb.231.dsrnaseq.raw.counts.csv",
	row.names = 1
)
data_tpm <- read.csv(
	"./data/dsrnaseq.mdb231-cas9-triplicates/txome/mda.mb.231.dsrnaseq.tpm.csv",
	row.names = 1
)

# TPMs (average over the three replicate columns of each cell line),
# then log10-transformed after adding a pseudocount of 0.001
data_tpm_avg <- data.frame(row.names = rownames(data_tpm))
data_tpm_avg$mdb231.gfp <- rowMeans(data_tpm[, paste0("ago1gfp_r", 1:3)])
data_tpm_avg$mdb231.w1a <- rowMeans(data_tpm[, paste0("ago1w1a_r", 1:3)])
data_tpm_avg$mdb231.w6a <- rowMeans(data_tpm[, paste0("ago1w6a_r", 1:3)])
data_tpm_avg <- log10(data_tpm_avg + 0.001)

## Sample Reproducibility
# PCA of all samples
# Fill colour of each cell line, stored as a one-row data frame whose column
# names are the cell-line labels
col_map <- data.frame(
	"ago1gfp" = paste(colors()[323]),
	"ago1w1a" = paste(colors()[586]),
	"ago1w6a" = paste(colors()[186]),
	stringsAsFactors = FALSE
)
# Split the sample column names into the part before "_r<n>" (cell type)
# and the part after "_r" (replicate label)
celltype <- c(gsub("_r[123]", "", colnames(data_tpm)))
replicate <- c(gsub("[^_]+_r", "", colnames(data_tpm)))

# PCA analysis on log10(TPM + 0.001), with samples as rows
log_data <- log10(data_tpm + 0.001)
fit <- prcomp(t(log_data), scale = FALSE)
# Share of the total variance carried by components 1 and 2 (2 decimals)
pc1_var_e <- round((fit$sdev^2 / sum(fit$sdev^2))[1], 2)
pc2_var_e <- round((fit$sdev^2 / sum(fit$sdev^2))[2], 2)

# PCA plot: points filled by cell type, labelled with the replicate
plot(
	fit$x[, 1:2],
	pch = 21,
	xlab = paste0("Component 1 (", pc1_var_e * 100, "% variance explained)"),
	ylab = paste0("Component 2 (", pc2_var_e * 100, "% variance explained)"),
	cex.axis = 1.2, cex.lab = 1.3, cex = 3,
	bg = paste(col_map[celltype]),
	main = "dsRNA-seq"
)
text(fit$x[, 1:2], replicate, col = "black", cex = 0.8)
legend(
	"bottomleft",
	legend = names(col_map), pt.bg = paste(col_map),
	inset = 0.02, pch = 21, pt.cex = 1.5, bty = "n"
)


#########
##### Differential expression analyses: W1A vs Control (gfp)
#########

## Estimate w1a/control fold-changes using DESeq2
# Raw dsRNA-seq counts of the control (gfp) and W1A triplicates
countTable <- data_counts[, c(paste0("ago1gfp_r", 1:3), paste0("ago1w1a_r", 1:3))]
# The condition label is the column name without its "_r<n>" suffix. The
# factor levels are given explicitly, control first, so that the reference
# level (and therefore the sign of log2FoldChange, W1A over control) does not
# depend on the alphabetical order of the labels.
colData <- data.frame(
	row.names = colnames(countTable),
	condition = factor(
		gsub("_r[123]*", "", colnames(countTable)),
		levels = c("ago1gfp", "ago1w1a")
	)
)
dds <- DESeqDataSetFromMatrix(countTable, colData, formula(~ condition))
dds <- DESeq(dds)
resDDS <- results(dds)
mcols(resDDS, use.names = TRUE)

# Keep only the genes for which an adjusted p-value is available
res <- data.frame(resDDS)
res <- res[!is.na(res$padj), ]

# Up-/down-regulated genes: |log2 fold-change| > 1 and adjusted p < 0.01
up <- rownames(res[res$log2FoldChange > 1 & res$padj < 0.01, ])
dw <- rownames(res[res$log2FoldChange < -1 & res$padj < 0.01, ])
length(up)
length(dw)

res_dsrnaseq_w1a <- res
up_dsrnaseq_w1a <- up
dw_dsrnaseq_w1a <- dw

# Save to file, then read the table back from the same file
write.csv(res_dsrnaseq_w1a, "./results/mda.mb.231.dsrnaseq.diff.exp.w1a.vs.control.csv")
res_dsrnaseq_w1a <- read.csv("./results/mda.mb.231.dsrnaseq.diff.exp.w1a.vs.control.csv", row.names = 1)


#########
##### Differential expression analyses: W6A vs Control (gfp)
#########

## Estimate w6a/control fold-changes using DESeq2
# Raw dsRNA-seq counts of the control (gfp) and W6A triplicates
countTable <- data_counts[, c(paste0("ago1gfp_r", 1:3), paste0("ago1w6a_r", 1:3))]
# The condition label is the column name without its "_r<n>" suffix. The
# factor levels are given explicitly, control first, so that the reference
# level (and therefore the sign of log2FoldChange, W6A over control) does not
# depend on the alphabetical order of the labels.
colData <- data.frame(
	row.names = colnames(countTable),
	condition = factor(
		gsub("_r[123]*", "", colnames(countTable)),
		levels = c("ago1gfp", "ago1w6a")
	)
)
dds <- DESeqDataSetFromMatrix(countTable, colData, formula(~ condition))
dds <- DESeq(dds)
resDDS <- results(dds)
mcols(resDDS, use.names = TRUE)

# Keep only the genes for which an adjusted p-value is available
res <- data.frame(resDDS)
res <- res[!is.na(res$padj), ]

# Up-/down-regulated genes: |log2 fold-change| > 1 and adjusted p < 0.01
up <- rownames(res[res$log2FoldChange > 1 & res$padj < 0.01, ])
dw <- rownames(res[res$log2FoldChange < -1 & res$padj < 0.01, ])
length(up)
length(dw)

res_dsrnaseq_w6a <- res
up_dsrnaseq_w6a <- up
dw_dsrnaseq_w6a <- dw

# Save to file, then read the table back from the same file
write.csv(res_dsrnaseq_w6a, "./results/mda.mb.231.dsrnaseq.diff.exp.w6a.vs.control.csv")
res_dsrnaseq_w6a <- read.csv("./results/mda.mb.231.dsrnaseq.diff.exp.w6a.vs.control.csv", row.names = 1)


#########
##### Differential expression analyses: Comparison between mutant cell lines
#########

# Join the dsRNA-seq W1A and W6A result tables on their row names; columns
# that occur in both tables get the "_w1a" / "_w6a" suffix
res_dsrnaseq_mutants <- merge(
	res_dsrnaseq_w1a, res_dsrnaseq_w6a,
	by = "row.names", suffixes = c("_w1a", "_w6a")
)

# W1A (x) against W6A (y) log2 fold-changes on fixed [-10, 10] axes, with the
# identity line and the correlation estimate returned by cor.test()
pdf(
	"./figures/mda.mb.231.dsrnaseq.w6a.vs.w1a.scatter.plot.pdf",
	width = 8, height = 8, paper = "special"
)
plot(
	res_dsrnaseq_mutants$log2FoldChange_w1a, res_dsrnaseq_mutants$log2FoldChange_w6a,
	pch = 20, las = 1, cex.axis = 1.3, cex.lab = 1.3,
	xlab = "dsRNA fold-change in MDA-MB-231 W1A/Control (log2)",
	ylab = "dsRNA fold-change in MDA-MB-231 W6A/Control (log2)",
	col = rgb(0.5, 0.5, 0.5, 0.5),
	xlim = c(-10, 10), ylim = c(-10, 10)
)
abline(0, 1, col = "black", lty = 2)
ct <- cor.test(res_dsrnaseq_mutants$log2FoldChange_w1a, res_dsrnaseq_mutants$log2FoldChange_w6a)
text(-8, 8, paste("R =", round(ct$estimate, 2)))
dev.off()


#########
##### Differential expression analyses: Comparison between dsRNA-seq and mRNA-seq
#########

## Estimate w1a/control fold-changes
# mRNA-seq results are reloaded from the saved file; the dsRNA-seq results
# come from the current session
res_rnaseq_w1a <- read.csv("./results/mda.mb.231.rnaseq.diff.exp.w1a.vs.control.csv", row.names = 1)

# Pair both result tables by row name; shared columns get "_mrna" / "_dsrna"
res_delta_dsrnaseq <- merge(
	res_rnaseq_w1a, res_dsrnaseq_w1a,
	suffixes = c("_mrna", "_dsrna"), by = "row.names"
)
# delta = dsRNA log2 fold-change minus mRNA log2 fold-change ...
res_delta_dsrnaseq$delta <- res_delta_dsrnaseq$log2FoldChange_dsrna -
	res_delta_dsrnaseq$log2FoldChange_mrna
# ... followed by z-score normalization of delta
res_delta_dsrnaseq$delta <- (res_delta_dsrnaseq$delta - mean(res_delta_dsrnaseq$delta)) /
	sd(res_delta_dsrnaseq$delta)

# Rows whose z-scored delta lies beyond +/- 2.5
up <- res_delta_dsrnaseq[res_delta_dsrnaseq$delta > 2.5, ]
dw <- res_delta_dsrnaseq[res_delta_dsrnaseq$delta < -2.5, ]
dim(up)
dim(dw)

# mRNA (x) against dsRNA (y) fold-changes, selected rows highlighted
pdf("./figures/mda.mb.231.dsrnaseq.vs.mrnaseq.w1a.vs.control.scatter.plot.pdf", width = 8, height = 8)
plot(
	res_delta_dsrnaseq$log2FoldChange_mrna, res_delta_dsrnaseq$log2FoldChange_dsrna,
	pch = 20, las = 1, cex.axis = 1.3, cex.lab = 1.3,
	xlab = "mRNA fold-change W1A/Control (log2)",
	ylab = "dsRNA fold-change W1A/Control (log2)",
	col = "grey", xlim = c(-12, 12), ylim = c(-12, 12)
)
abline(0, 1, col = "black", lty = 2)
points(up[, c("log2FoldChange_mrna", "log2FoldChange_dsrna")], col = colors()[35], pch = 20)
points(dw[, c("log2FoldChange_mrna", "log2FoldChange_dsrna")], col = colors()[131], pch = 20)
dev.off()

# Ids of the selected rows (the merge stores them in the "Row.names" column)
up_dsrna_mrna_w1a <- up$Row.names
dw_dsrna_mrna_w1a <- dw$Row.names

# Save to file, then read the table back from the same file
write.csv(res_delta_dsrnaseq, "./results/mda.mb.231.dsrnaseq.vs.mrnaseq.diff.exp.w1a.vs.control.csv")
res_delta_dsrnaseq <- read.csv("./results/mda.mb.231.dsrnaseq.vs.mrnaseq.diff.exp.w1a.vs.control.csv", row.names = 1)

## Estimate w6a/control fold-changes
# mRNA-seq results are reloaded from the saved file; the dsRNA-seq results
# come from the current session
res_rnaseq_w6a <- read.csv("./results/mda.mb.231.rnaseq.diff.exp.w6a.vs.control.csv", row.names = 1)

# Pair both result tables by row name; shared columns get "_mrna" / "_dsrna"
res_delta_dsrnaseq <- merge(
	res_rnaseq_w6a, res_dsrnaseq_w6a,
	suffixes = c("_mrna", "_dsrna"), by = "row.names"
)
# delta = dsRNA log2 fold-change minus mRNA log2 fold-change ...
res_delta_dsrnaseq$delta <- res_delta_dsrnaseq$log2FoldChange_dsrna -
	res_delta_dsrnaseq$log2FoldChange_mrna
# ... followed by z-score normalization of delta
res_delta_dsrnaseq$delta <- (res_delta_dsrnaseq$delta - mean(res_delta_dsrnaseq$delta)) /
	sd(res_delta_dsrnaseq$delta)

# Rows whose z-scored delta lies beyond +/- 2.5
up <- res_delta_dsrnaseq[res_delta_dsrnaseq$delta > 2.5, ]
dw <- res_delta_dsrnaseq[res_delta_dsrnaseq$delta < -2.5, ]
dim(up)
dim(dw)

# mRNA (x) against dsRNA (y) fold-changes, selected rows highlighted
pdf("./figures/mda.mb.231.dsrnaseq.vs.mrnaseq.w6a.vs.control.scatter.plot.pdf", width = 8, height = 8)
plot(
	res_delta_dsrnaseq$log2FoldChange_mrna, res_delta_dsrnaseq$log2FoldChange_dsrna,
	pch = 20, las = 1, cex.axis = 1.3, cex.lab = 1.3,
	xlab = "mRNA fold-change W6A/Control (log2)",
	ylab = "dsRNA fold-change W6A/Control (log2)",
	col = "grey", xlim = c(-12, 12), ylim = c(-12, 12)
)
abline(0, 1, col = "black", lty = 2)
points(up[, c("log2FoldChange_mrna", "log2FoldChange_dsrna")], col = colors()[35], pch = 20)
points(dw[, c("log2FoldChange_mrna", "log2FoldChange_dsrna")], col = colors()[131], pch = 20)
dev.off()

# Ids of the selected rows (the merge stores them in the "Row.names" column)
up_dsrna_mrna_w6a <- up$Row.names
dw_dsrna_mrna_w6a <- dw$Row.names

# Save to file, then read the table back from the same file
write.csv(res_delta_dsrnaseq, "./results/mda.mb.231.dsrnaseq.vs.mrnaseq.diff.exp.w6a.vs.control.csv")
res_delta_dsrnaseq <- read.csv("./results/mda.mb.231.dsrnaseq.vs.mrnaseq.diff.exp.w6a.vs.control.csv", row.names = 1)


#########
##### Analysis of enriched/depleted dsRNAs/mRNAs in the mutant cell lines
#########

# Intersection between dsRNAs: ids selected in both the W1A and the W6A
# comparison, separately for the enriched (up) and depleted (dw) sets
up_int <- intersect(up_dsrna_mrna_w1a, up_dsrna_mrna_w6a)
dw_int <- intersect(dw_dsrna_mrna_w1a, dw_dsrna_mrna_w6a)

# Venn diagram of enriched dsRNAs: set sizes and size of their overlap
area1 <- length(up_dsrna_mrna_w1a)
area2 <- length(up_dsrna_mrna_w6a)
n12 <- length(up_int)

pdf("./figures/mda.mb.231.dsrnaseq.vs.mrnaseq.enriched.w1a.vs.w6a.venn.pdf", width = 8, height = 8)
# NOTE(review): ind = TRUE presumably already draws the diagram, so the
# grid.draw() call below may draw it a second time - confirm against the
# VennDiagram documentation
venn.plot <- draw.pairwise.venn(
	area1 = area1, area2 = area2, cross.area = n12,
	category = c("W1A/Control", "W6A/Control"),
	reverse = FALSE, # mirroring
	fill = c(colors()[565], colors()[59]),
	cat.cex = 2, margin = 0.05, ind = TRUE
)
grid.draw(venn.plot)
dev.off()

# Venn diagram of depleted dsRNAs: set sizes and size of their overlap
area1 <- length(dw_dsrna_mrna_w1a)
area2 <- length(dw_dsrna_mrna_w6a)
n12 <- length(dw_int)

pdf("./figures/mda.mb.231.dsrnaseq.vs.mrnaseq.depleted.w1a.vs.w6a.venn.pdf", width = 8, height = 8)
venn.plot <- draw.pairwise.venn(
	area1 = area1, area2 = area2, cross.area = n12,
	category = c("W1A/Control", "W6A/Control"),
	reverse = FALSE, # mirroring
	fill = c(colors()[565], colors()[59]),
	cat.cex = 2, margin = 0.05, ind = TRUE
)
grid.draw(venn.plot)
dev.off()

# Features of enriched/depleted dsRNAs
# Transcript feature table; GC fraction = gc column / transcript length column
h19_mrna_features <- read.table("./data/auxiliary/human.mrnas.features.csv")
h19_mrna_features$gc_p <- h19_mrna_features$gc / h19_mrna_features$tx_len

# Reload the dsRNA-vs-mRNA delta tables of both mutants from the saved files
res_delta_dsrnaseq_w1a <- read.csv("./results/mda.mb.231.dsrnaseq.vs.mrnaseq.diff.exp.w1a.vs.control.csv", row.names = 1)
res_delta_dsrnaseq_w6a <- read.csv("./results/mda.mb.231.dsrnaseq.vs.mrnaseq.diff.exp.w6a.vs.control.csv", row.names = 1)

# Join both mutants on the transcript id column, add the feature table (keyed
# by its row names) and keep only the id, the two deltas and the GC fraction
res_delta_mutants <- merge(
	res_delta_dsrnaseq_w1a, res_delta_dsrnaseq_w6a,
	by.x = "Row.names", by.y = "Row.names", suffixes = c("_w1a", "_w6a")
)
res_delta_mutants <- merge(
	res_delta_mutants, h19_mrna_features,
	by.x = "Row.names", by.y = "row.names"
)
res_delta_mutants <- res_delta_mutants[, c("Row.names", "delta_w1a", "delta_w6a", "gc_p")]

# Save to file: the full table plus the subsets shared-enriched (up_int) and
# shared-depleted (dw_int); the full table is then read back from its file
write.csv(res_delta_mutants, "./results/mda.mb.231.dsrnaseq.vs.mrnaseq.delta.w6a.vs.w1a.csv")
write.csv(
	res_delta_mutants[res_delta_mutants$Row.names %in% up_int, ],
	"./results/mda.mb.231.dsrnaseq.vs.mrnaseq.delta.enriched.w6a.vs.w1a.csv"
)
write.csv(
	res_delta_mutants[res_delta_mutants$Row.names %in% dw_int, ],
	"./results/mda.mb.231.dsrnaseq.vs.mrnaseq.delta.depleted.w6a.vs.w1a.csv"
)
res_delta_mutants <- read.csv("./results/mda.mb.231.dsrnaseq.vs.mrnaseq.delta.w6a.vs.w1a.csv", row.names = 1)

# W1A delta (x) against W6A delta (y) with the fitted regression line and the
# correlation estimate returned by cor.test()
pdf("./figures/mda.mb.231.dsrnaseq.vs.mrnaseq.delta.w6a.vs.w1a.scatter.plot.pdf", width = 8, height = 8)
plot(
	res_delta_mutants$delta_w1a, res_delta_mutants$delta_w6a,
	pch = 20, las = 1, cex.axis = 1.3, cex.lab = 1.3,
	xlab = "dsRNA/mRNA delta W1A/Control (log2)",
	ylab = "dsRNA/mRNA delta W6A/Control (log2)",
	col = "grey", xlim = c(-7, 7), ylim = c(-7, 7)
)
abline(lm(delta_w6a ~ delta_w1a, res_delta_mutants), lty = 2, col = "red")
ct <- cor.test(res_delta_mutants$delta_w1a, res_delta_mutants$delta_w6a)
text(-6, 6, paste("R =", round(ct$estimate, 2)))
dev.off()

# GC fraction of all transcripts vs. the shared-depleted and shared-enriched sets
pdf("./figures/mda.mb.231.dsrnaseq.vs.mrnaseq.delta.w6a.vs.w1a.gc.box.plot.pdf", width = 8, height = 8)
boxplot(
	res_delta_mutants$gc_p,
	res_delta_mutants[res_delta_mutants$Row.names %in% dw_int, "gc_p"],
	res_delta_mutants[res_delta_mutants$Row.names %in% up_int, "gc_p"],
	names = c("All", "Depleted", "Enriched"),
	ylab = "GC content fraction",
	notch = TRUE,
	col = c("darkgrey", "grey", "lightgrey")
)
dev.off()
# Pairwise Wilcoxon tests: all vs enriched, depleted vs enriched, all vs depleted
wilcox.test(
	res_delta_mutants$gc_p,
	res_delta_mutants[res_delta_mutants$Row.names %in% up_int, "gc_p"]
)
wilcox.test(
	res_delta_mutants[res_delta_mutants$Row.names %in% dw_int, "gc_p"],
	res_delta_mutants[res_delta_mutants$Row.names %in% up_int, "gc_p"]
)
wilcox.test(
	res_delta_mutants$gc_p,
	res_delta_mutants[res_delta_mutants$Row.names %in% dw_int, "gc_p"]
)


#########
##### Analysis of repetitive elements expression
#########

# Read repetitive element counts
data_rep <- read.csv(
	"./data/dsrnaseq.mdb231-cas9-triplicates/repetitive.elements/mda.mb.231.dsrnaseq.rep.raw.counts.csv",
	row.names = 1
)

# Reload the dsRNA-seq transcriptome counts and stack them with the
# repetitive element counts
data_counts <- read.csv(
	"./data/dsrnaseq.mdb231-cas9-triplicates/txome/mda.mb.231.dsrnaseq.raw.counts.csv",
	row.names = 1
)
data_all <- rbind(data_counts, data_rep)

# Get annotation files for human genes and repetitive elements; only the
# first four columns of the gene annotation are kept so both tables line up
hg19_an <- read.table(
	"./data/auxiliary/human.mrnas.an",
	col.names = c("gene", "cds_st", "cds_end", "length", "uorf_st", "u_orf_end"),
	sep = "\t"
)[, 1:4]
rep_an <- read.table(
	"./data/auxiliary/human.rep.an",
	col.names = c("gene", "cds_st", "cds_end", "length"),
	sep = "\t"
)
all_an <- rbind(hg19_an, rep_an)
# Attach the annotation to the counts, then keep only the count columns and
# the "length" column, with the gene ids as row names
data_all_an <- merge(data_all, all_an, by.x = "row.names", by.y = "gene")
rownames(data_all_an) <- data_all_an$Row.names
data_all_an$cds_st <- NULL
data_all_an$cds_end <- NULL
data_all_an$Row.names <- NULL

# Convert counts to TPM: divide each row by its length in kb, then divide
# each column by its column sum / 1e6. Columns 1-9 are used as count columns.
rpk_mat <- sweep(data_all_an[, 1:9], 1, data_all_an$length / 1000, "/")
scaling_factor <- colSums(rpk_mat) / 1e6
# note: this replaces the TPM table that was loaded from file earlier
data_tpm <- sweep(rpk_mat, 2, scaling_factor, "/")

# Assign RNA class to different rows by row-name prefix: "NM" -> mrna,
# "RN" -> rrna, everything else -> repetitive
data_tpm$class <- NA
data_tpm[grep("^NM", rownames(data_tpm)), "class"] <- "mrna"
data_tpm[grep("^RN", rownames(data_tpm)), "class"] <- "rrna"
data_tpm$class[is.na(data_tpm$class)] <- "repetitive"

# Sum TPM per class, transpose to one row per sample and derive the genotype
# from the sample name (text before the first "_")
data_agg_class <- aggregate(. ~ class, data_tpm, sum)
rownames(data_agg_class) <- data_agg_class$class
data_agg_class$class <- NULL
data_agg_class <- data.frame(t(data_agg_class))
data_agg_class$genotype <- gsub("_.*", "", rownames(data_agg_class))

# Save to file
write.csv(data_agg_class, "./results/mda.mb.231.dsrnaseq.rna.classes.mutants.csv")

# Mean per genotype, shown as grouped bars on a log-scaled y axis
mean_tpm <- aggregate(. ~ genotype, data_agg_class, mean)
rownames(mean_tpm) <- mean_tpm$genotype
mean_tpm$genotype <- NULL
barplot(t(mean_tpm), beside = TRUE, log = "y")

