library('rtracklayer')
library('GenomicRanges')
library(Rsamtools)
library(bamsignals)
library(randomForest)
library(doParallel)
library(reshape2)
library(ggplot2)
library(data.table)
library(caret)
library(openxlsx)
library(ROCR)
gene_path='/path/to/gencode.v30.primary_assembly.annotation.gtf'
rRNA_path='download from http://sourceforge.net/projects/rseqc/files/BED/Human_Homo_sapiens/GRCh38_rRNA.bed.gz/download' # encode table
blacklist_path='/path/to/blacklist.bed'

###
OCR_summit_path='/path/to/OCR_peak_summit'
epi_peak_paths=list(OCR='/path/to/OCR_peak',
                    H3K27ac='/path/to/OCR_peak')
CAGE_bam_path='/path/to/CAGE.bam'
bamlist=list(OCR='/path/to/OCR.bam',
             H3K4me3='/path/to/H3K4me3.bam',
             H3K27ac='/path/to/H3K27ac.bam',
             H3K27me3='/path/to/H3K27me3.bam',
             RNASeq='/path/to/RNASeq.bam')

fantom_usage_path='https://fantom.gsc.riken.jp/5/datafiles/reprocessed/hg38_latest/extra/enhancer/F5.hg38.enhancers.expression.usage.matrix.gz'
fantom_info_paths=list('https://fantom.gsc.riken.jp/5/datafiles/reprocessed/hg38_latest/basic/human.cell_line.LQhCAGE/00_human.cell_line.LQhCAGE.hg38.assay_sdrf.txt',
                      'https://fantom.gsc.riken.jp/5/datafiles/reprocessed/hg38_latest/basic/human.cell_line.hCAGE/00_human.cell_line.hCAGE.hg38.assay_sdrf.txt',
                        'https://fantom.gsc.riken.jp/5/datafiles/reprocessed/hg38_latest/basic/human.fractionation.hCAGE/00_human.fractionation.hCAGE.hg38.assay_sdrf.txt',
                        'https://fantom.gsc.riken.jp/5/datafiles/reprocessed/hg38_latest/basic/human.primary_cell.LQhCAGE/00_human.primary_cell.LQhCAGE.hg38.assay_sdrf.txt',
                      'https://fantom.gsc.riken.jp/5/datafiles/reprocessed/hg38_latest/basic/human.primary_cell.hCAGE/00_human.primary_cell.hCAGE.hg38.assay_sdrf.txt',
                      'https://fantom.gsc.riken.jp/5/datafiles/reprocessed/hg38_latest/basic/human.timecourse.LQhCAGE/00_human.timecourse.LQhCAGE.hg38.assay_sdrf.txt',
                      'https://fantom.gsc.riken.jp/5/datafiles/reprocessed/hg38_latest/basic/human.timecourse.hCAGE/00_human.timecourse.hCAGE.hg38.assay_sdrf.txt',
                      'https://fantom.gsc.riken.jp/5/datafiles/reprocessed/hg38_latest/basic/human.tissue.hCAGE/00_human.tissue.hCAGE.hg38.assay_sdrf.txt')

tes <- function(x, upstream=0, downstream=1000, ...){
  #similar to granges promoters function, get the gene downstream region
  on_plus <- which(strand(x) == "+")
  on_plus_TSS <- end(x)[on_plus]
  start(x)[on_plus] <- on_plus_TSS - upstream
  end(x)[on_plus] <- on_plus_TSS + downstream - 1L
  on_minus <- which(strand(x) == "-")
  on_minus_TSS <- start(x)[on_minus]
  end(x)[on_minus] <- on_minus_TSS + upstream
  start(x)[on_minus] <- on_minus_TSS - downstream + 1L
  return(x)
}
bam_5p_profile <- function(bampath,mylist,marker){
  tot=system(paste0('samtools idxstats ',bampath),intern = T)
  tot=sum(as.numeric(sapply(strsplit(tot,'\\t'),'[',3)))
  denum=tot/2e6
  ss=marker%in%c('CAGE','RNA')
  mylist$assay=marker
  if(marker%in%c('H3K4me3','H3K27ac','H3K27me3')){
    sigs <- bamProfile(bampath, mylist, ss=ss,binsize=10,shift=75,paired.end="filter", verbose=FALSE)
  }
  if(marker%in%c('ATAC')){
    sigs <- bamProfile(bampath, mylist, ss=ss,binsize=10,shift=100, paired.end="filter",verbose=FALSE)
  }
  if(marker%in%c('RNA','CAGE')){
    sigs <- bamProfile(bampath, mylist, ss=ss,binsize=10,paired.end="filter", verbose=FALSE)
    res=rbind(data.frame(do.call(rbind,lapply(sigs@signals,function(x)x[1,]))/denum,strand='pos'),
              data.frame(do.call(rbind,lapply(sigs@signals,function(x)x[2,]))/denum,strand='neg'))
  }else{
    res=data.frame(do.call(rbind,sigs@signals)/denum,strand='both')
  }
  res
}
get_feature <- function(bampath,mylist,marker){
  list0=GenomicRanges::shift(resize(mylist,width=400,fix='center'),-300)
  list1=resize(mylist,width=200,fix='center')
  list2=GenomicRanges::shift(resize(mylist,width=400,fix='center'),300)
  
  ss=marker%in%c('CAGE','RNA')
  if(marker%in%c('H3K4me3','H3K27ac','H3K27me3')){
    sigs=cbind(bamCount(bampath, list0, ss=ss,shift=75,paired.end="filter", verbose=FALSE),
               bamCount(bampath, list1, ss=ss,shift=75,paired.end="filter", verbose=FALSE),
               bamCount(bampath, list2, ss=ss,shift=75,paired.end="filter", verbose=FALSE))
    colnames(sigs)=paste0(marker,c('pre','center','down'))
    
  }
  if(marker%in%c('ATAC')){
    sigs=cbind(bamCount(bampath, list0, ss=ss,shift=100,paired.end="filter", verbose=FALSE),
               bamCount(bampath, list1, ss=ss,shift=100,paired.end="filter", verbose=FALSE),
               bamCount(bampath, list2, ss=ss,shift=100,paired.end="filter", verbose=FALSE))
    colnames(sigs)=paste0(marker,c('pre','center','down'))
  }
  if(marker%in%c('RNA','CAGE')){
    sigs=t(rbind(bamCount(bampath, list0, ss=ss,paired.end="filter", verbose=FALSE),
                 bamCount(bampath, list1, ss=ss,paired.end="filter", verbose=FALSE),
                 bamCount(bampath, list2, ss=ss,paired.end="filter", verbose=FALSE)))
    
    colnames(sigs)=paste0(marker,c('sense','antisense'),
                          rep(c('pre','center','down'),each=2))
    
  }
  sigs
}
df2gr <- function(x)GRanges(Rle(x[,1]),IRanges(start=x[,2],end=x[,3]))
fantom_usage <- as.data.frame(fread(fantom_usage_path),rownames = 1)
row.names(fantom_usage) <- fantom_usage$V1;fantom_usage$V1=NULL
z=lapply(fantom_info_paths,read.table,as.is=T,sep='\t',quote='',comment.char="",header=T)
fantom_info=do.call(rbind,lapply(z,function(x)x[,c('Comment..sample_name.','Library.Name')]))
    
keyword <- c('brain','neuro','cortex','strocyt','cerebellum')
sel <- sapply(keyword,function(x)grep(x,fantom_info[,1]))
sel <- sort(unique(do.call('c',sel)))
fantom_info = fantom_info[sel,]
colnames(fantom_info)=c('anno','ID')
rownames(fantom_info)=fantom_info$ID
fantom_info$group=gsub(' ','_',gsub('differentiation','diff',gsub(',.*','',gsub(' - .*','',gsub(' cell line.*','',fantom_info$anno)))))
fantom_usage= fantom_usage[,fantom_info$ID]
fantom_eRNA=rownames(fantom_usage)
fantom_eRNA <- lapply(fantom_eRNA,function(x){y=unlist(strsplit(x,':'));c(y[1],unlist(strsplit(y[2],"-")))})
fantom_eRNA <- do.call(rbind,fantom_eRNA)
fantom_eRNA <- makeGRangesFromDataFrame(data.frame(chr=fantom_eRNA[,1],start=as.numeric(fantom_eRNA[,2]),
                                                   end=as.numeric(fantom_eRNA[,3])))
z=tapply(fantom_info$ID,fantom_info$group,c)
#z$all_brain=unlist(z)
fantom_brain=lapply(z,function(x)fantom_eRNA[rowSums(fantom_usage[,x,drop=F])>0])
fantom_allbrain=fantom_eRNA[rowSums(fantom_usage[,unlist(z),drop=F])>0]
gene=import.gff2(gene_path)
transcript=gene[gene$type=='gene']
pr=promoters(transcript,upstream=5000,downstream=0)
pas=tes(transcript,upstream=0,downstream=5000)
exon=gene[gene$type=='exon']
exon_extend=reduce(c(exon,promoters(exon,upstream = 1000,downstream = 1),
                     tes(exon,upstream = 1,downstream = 1000)),ignore.strand=T)
rRNA=import.bed(rRNA_path)
blacklist=df2gr(read.table(blacklist_path,sep='\t'))
filter=reduce(c(exon_extend,rRNA,blacklist),ignore.strand=T)


#######profiling around the fantom5 eRNA
cell='neuron'
summit=read.table(OCR_summit_path)
summit=GRanges(Rle(summit[,1]),IRanges(start=summit[,2],end=summit[,3]))

#negative set; OCRs with no CAGE tags 
OCR=df2gr(read.table(epi_peak_paths[['OCR']],as.is=T,sep='\t',quote=""))
OCR=OCR[!overlapsAny(OCR,filter)]
H3K27ac=df2gr(read.table(epi_peak_paths[['H3K27ac']],as.is=T,sep='\t',quote=""))
pos=resize(fantom_allbrain[overlapsAny(fantom_allbrain,OCR) &
                             !overlapsAny(fantom_allbrain,filter)],
           width=500,fix='center')
pos$set='pos'
summit=summit[overlapsAny(summit,OCR)]
test=resize(summit,width=500,fix='center')
testa=resize(test,width=250,fix='start')
testb=resize(test,width=250,fix='end')
cage_taga=bamCount(CAGE_bam_path,testa,verbose=FALSE,ss=T)
cage_tagb=bamCount(CAGE_bam_path,testb,verbose=FALSE,ss=T)
ind=which(cage_taga[2,]>0 & cage_tagb[1,]>0 & !overlapsAny(test,pos))
pos2=test[ind][sort(sample(length(test[ind]),length(pos)))]

pos2$set='pos'
cage_tags=bamCount(CAGE_bam_path,test,verbose=FALSE)
ind=which(cage_tags==0 & !overlapsAny(test,resize(fantom_eRNA,width=4000+width(fantom_eRNA),fix='center')))

neg=test[ind][sort(sample(length(test[ind]),length(pos)*2))]
neg$set='neg'
train=sort(c(pos,pos2,neg))

train$anno='intergenic'
train$anno[overlapsAny(train,pr)]='upstream'
train$anno[overlapsAny(train,pas)]='downstream'
train$anno[overlapsAny(train,gene,ignore.strand=T)]='intron'

test$anno='intergenic'
test$anno[overlapsAny(test,pr)]='upstream'
test$anno[overlapsAny(test,pas)]='downstream'
test$anno[overlapsAny(test,gene,ignore.strand=T)]='intron'

sel1=which(cage_taga[2,]>0 & cage_tagb[1,]>0 & !test%in%train)
sel2=sample(which(cage_taga[2,]==0 & cage_tagb[1,]==0 & !test%in%train),length(sel1))


allfeature=data.frame(do.call(cbind,mcmapply(function(x,y)scales(get_feature(x,c(train,test),y)),
                                             bamlist,names(bamlist),SIMPLIFY = F)))
train_feature=allfeature[1:length(train),]
test_feature=allfeature[-c(1:length(train)),]
validation=test_feature[c(sel1,sel2),]
train_feature$anno=factor(train$anno)
train_feature$set=factor(train$set)

test_feature$anno=factor(test$anno)

set.seed(1)
control <- trainControl(method="repeatedcv", number=10,repeats=2,search = 'random')
rf_cv <- train(set~., data=train_feature, method="rf", tuneLength=5,
               metric="Accuracy", trControl=control)
print(rf_cv)
rf <- randomForest(set ~ ., data=train_feature,importance=T,mtry=rf_cv$bestTune[[1]])
test$set = predict(rf, newdata=test_feature)

eRNA=reduce(sort(c(train[train$set=='pos'],
                   test[test$set=='pos'][!overlapsAny(test[test$set=='pos'],pos)])))
table(overlapsAny(eRNA,H3K27ac))
table(overlapsAny(eRNA,gene,ignore.strand=T))

pred1=predict(rf,validation,type = "prob")

perf = prediction(pred1[,2], rep(c(1,0),each=length(sel1)))
# 1. Area under curve
auc = performance(perf, "auc")
auc
names(eRNA)=paste0(cell,'_',as.vector(seqnames(eRNA)),'_',start(eRNA),'_',end(eRNA))

pdf(paste0('importance_RF_',cell,'.pdf'),width=10,height=5)
varImpPlot(rf)
dev.off()
pred1=predict(rf,type = "prob")
perf = prediction(pred1[,2], rep(c(1,0),each=length(sel1)))
# 1. Area under curve
auc = performance(perf, "auc")
auc
# 2. True Positive and Negative Rate
pred2 = performance(perf, "tpr","fpr")

pdf(paste0('ROC_RF_',cell,'.pdf'),width=5,height=5)
plot(pred2,main=paste0("AUC = ", round(auc@y.values[[1]],3)),col=2,lwd=2)
abline(a=0,b=1,lwd=2,lty=2,col="gray")
dev.off()
save(train_feature,test_feature,train,test,OCR,H3K27ac,rf_cv,rf,file=paste0('RF_',cell,'_predict.Rdata'))
