# Script to process and analyze NIJK04 NFkB CiBER-seq data

# Note: Need to run dual SFFV script and retrieve final processed counts table in order to run this script

library(Biostrings)
library(DESeq2)
library(ggplot2)
# dplyr is attached last so that its verbs (select, rename, filter, ...) are not
# masked by same-named generics from the Bioconductor packages above.
library(dplyr)

# Download counts tables into current directory from GEO (accession GSE276055).
# The served files are gzip-compressed (".txt.gz" in the URL) but are saved with
# a ".txt" extension. mode = "wb" forces a binary write so the compressed bytes
# are not altered by text-mode newline translation (notably on Windows; the URL
# ends in "%2Egz", not ".gz", so extension-based detection cannot be relied on).
# NOTE(review): read.delim() is expected to detect the gzip compression when
# these files are read back in - confirm on your platform.
download.file(url = "https://www.ncbi.nlm.nih.gov/geo/download/?acc=GSE276055&format=file&file=GSE276055%5FNIJK04%5FiRFP%5Fbccounts%5Ffinal%2Etxt%2Egz",
              destfile = "./NIJK04_iRFP_bccounts_final.txt", mode = "wb")
download.file(url = "https://www.ncbi.nlm.nih.gov/geo/download/?acc=GSE276055&format=file&file=GSE276055%5FNIJK04%5FiRFP%5Fbccounts%5Freseq%2Etxt%2Egz",
              destfile = "./NIJK04_iRFP_bccounts_reseq.txt", mode = "wb")
download.file(url = "https://www.ncbi.nlm.nih.gov/geo/download/?acc=GSE276055&format=file&file=GSE276055%5FNIJK04%5FmCherry%5Fbccounts%5Ffinal%2Etxt%2Egz",
              destfile = "./NIJK04_mCherry_bccounts_final.txt", mode = "wb")
download.file(url = "https://www.ncbi.nlm.nih.gov/geo/download/?acc=GSE276055&format=file&file=GSE276055%5FNIJK04%5FmCherry%5Fbccounts%5Freseq%5Ffinal%2Etxt%2Egz",
              destfile = "./NIJK04_mCherry_bccounts_reseq_final.txt", mode = "wb")

#######################################################
### STEP 1: PROCESSING DATA TO MAKE NIJK04_COUNTS_FINAL
#######################################################
### iRFP counts table ###
# Load the iRFP barcode counts: one barcode column followed by nine count
# columns (three replicates x 0h/2h/5h).
iRfpCounts <- read.delim("./NIJK04_iRFP_bccounts_final.txt", header = TRUE, stringsAsFactors = FALSE)
names(iRfpCounts) <- c("iRFP_barcode",
                       "iRFP1_0h", "iRFP1_2h", "iRFP1_5h",
                       "iRFP2_0h", "iRFP2_2h", "iRFP2_5h",
                       "iRFP3_0h", "iRFP3_2h", "iRFP3_5h")

# Total reads per barcode across all nine samples (missing counts ignored)
iRfpCounts[["sum"]] <- rowSums(iRfpCounts[, 2:10], na.rm = TRUE)

# Keep only barcodes with at least 100 reads in total
irfp_enough_reads <- iRfpCounts[["sum"]] >= 100
iRfpCountsFiltered <- iRfpCounts[irfp_enough_reads, ]

View(iRfpCounts)
View(iRfpCountsFiltered)

### mCherry counts table ###
# Note: mCherry barcodes need to be converted to the reverse complement.
# Read from the current directory, which is where the download step above
# writes this file (the previous "~/NIJK04/" path did not match the download).
mCherryCounts <- read.delim("./NIJK04_mCherry_bccounts_final.txt", header = TRUE, stringsAsFactors = FALSE)
colnames(mCherryCounts) <- c('mCherry_barcode', 'mCherry1_0h', 'mCherry1_2h', 'mCherry1_5h',
                             'mCherry2_0h', 'mCherry2_2h', 'mCherry2_5h',
                             'mCherry3_0h', 'mCherry3_2h', 'mCherry3_5h')
mCherryCounts$sum <- rowSums(mCherryCounts[ , c(2:10)], na.rm = TRUE)   # New column with the sum of all counts
# Remove rows where sum < 80.
# NOTE(review): the iRFP table is filtered at 100 and the old comment here also
# said 100; the code has always used 80, so 80 is kept - confirm which is intended.
mCherryCountsFiltered <- mCherryCounts[mCherryCounts$sum >= 80, ]
View(mCherryCounts)
View(mCherryCountsFiltered)

# Write reverse comp of barcode to match to look up table.
# DNAStringSet reverse-complements the whole column in one vectorised call
# instead of building one DNAString per barcode, and yields character(0)
# (not an empty list) when no barcodes pass the filter.
mCherryCountsFiltered$RevComp <- as.character(
  reverseComplement(DNAStringSet(mCherryCountsFiltered$mCherry_barcode))
)

# Barcode-to-guide lookup table built from the NIJL009 PacBio sequencing
final_lookup <- read.csv("../../pLJK06_v4_NIJL009_final_lookup_v2.csv",
                         stringsAsFactors = FALSE)
View(final_lookup)

# Full outer join of the lookup with the filtered iRFP counts (the 'sum'
# helper column is left out), matching lookup bc1 to the iRFP barcode.
irfp_merge_cols <- c("iRFP_barcode",
                     "iRFP1_0h", "iRFP1_2h", "iRFP1_5h",
                     "iRFP2_0h", "iRFP2_2h", "iRFP2_5h",
                     "iRFP3_0h", "iRFP3_2h", "iRFP3_5h")
lookup_iRFP <- merge(final_lookup, iRfpCountsFiltered[, irfp_merge_cols],
                     by.x = "bc1_neighborhood", by.y = "iRFP_barcode",
                     all = TRUE)
View(lookup_iRFP)

# Full outer join of that result with the filtered mCherry counts, matching
# lookup bc2 to the reverse-complemented mCherry barcode.
mcherry_merge_cols <- c("RevComp",
                        "mCherry1_0h", "mCherry1_2h", "mCherry1_5h",
                        "mCherry2_0h", "mCherry2_2h", "mCherry2_5h",
                        "mCherry3_0h", "mCherry3_2h", "mCherry3_5h")
lookup_iRFP_mCherry <- merge(lookup_iRFP, mCherryCountsFiltered[, mcherry_merge_cols],
                             by.x = "bc2_neighborhood", by.y = "RevComp",
                             all = TRUE)
View(lookup_iRFP_mCherry)

# Remove rows where bc1 or bc2 is NA, meaning it wasn't seen in the PacBio sequencing run and we don't know
# which guides bc1 and bc2 link to
counts_final <- lookup_iRFP_mCherry %>% filter(!is.na(bc1_neighborhood) & !is.na(bc2_neighborhood))
counts_final[is.na(counts_final)] <- 0  # Change all NA values to 0
# Column names are assigned by position, so this relies on the column order
# produced by the two merges above (bc2 key first, then bc1, then lookup columns).
colnames(counts_final) <- c('bc2_neighborhood', 'bc1_neighborhood', 'sgRNA1', 'sgRNA2',
                            'count', 'distinct_count_bc1', 'distinct_count_bc2', 
                            'distinct_count_bc1_true', 'distinct_count_bc2_true', 
                            'bcs_perguide',
                            'iRFP1_0h', 'iRFP1_2h', 'iRFP1_5h', 'iRFP2_0h', 'iRFP2_2h', 'iRFP2_5h',
                            'iRFP3_0h', 'iRFP3_2h', 'iRFP3_5h', 'mCherry1_0h', 'mCherry1_2h', 'mCherry1_5h',
                            'mCherry2_0h', 'mCherry2_2h', 'mCherry2_5h', 'mCherry3_0h', 'mCherry3_2h', 'mCherry3_5h')
View(counts_final)    # Same length as final_lookup

# Remove rows where avg count < 12 to get rid of noise.
# The average is taken over all 18 samples (9 iRFP + 9 mCherry).
sample_count_cols <- c('iRFP1_0h', 'iRFP1_2h', 'iRFP1_5h', 'iRFP2_0h', 'iRFP2_2h', 'iRFP2_5h',
                       'iRFP3_0h', 'iRFP3_2h', 'iRFP3_5h', 'mCherry1_0h', 'mCherry1_2h', 'mCherry1_5h',
                       'mCherry2_0h', 'mCherry2_2h', 'mCherry2_5h', 'mCherry3_0h', 'mCherry3_2h', 'mCherry3_5h')
NIJK04_counts_final <- counts_final
NIJK04_counts_final$avg_count_experiment <- rowSums(NIJK04_counts_final[, sample_count_cols]) / 18
NIJK04_counts_final <- NIJK04_counts_final[NIJK04_counts_final$avg_count_experiment >= 12, ]
# View only after the object exists (it was previously called before the
# assignment, which fails in a fresh session).
View(NIJK04_counts_final)

######################################################################
### STEP 1B: PROCESSING RE-SEQUENCING DATA TO MAKE NIJK04_COUNTS_FINAL 
######################################################################
# Note: resequenced data is to be added on for more coverage

### iRFP counts table ###
# Re-sequenced iRFP barcode counts: one barcode column followed by six count
# columns (2h and 5h for each of three replicates).
reiRfpCounts <- read.delim("./NIJK04_iRFP_bccounts_reseq.txt", header = TRUE, stringsAsFactors = FALSE)
head(reiRfpCounts)
names(reiRfpCounts) <- c("iRFP_barcode",
                         "2h_iRFP1", "5h_iRFP1",
                         "2h_iRFP2", "5h_iRFP2",
                         "2h_iRFP3", "5h_iRFP3")

# Total reads per barcode across the six re-sequenced samples
reiRfpCounts[["sum"]] <- rowSums(reiRfpCounts[, 2:7], na.rm = TRUE)

# Keep barcodes with at least 24 reads; the cutoff is low because these counts
# are added on top of the original dataset.
reirfp_enough_reads <- reiRfpCounts[["sum"]] >= 24
reiRfpCountsFiltered <- reiRfpCounts[reirfp_enough_reads, ]

View(reiRfpCounts)
View(reiRfpCountsFiltered)

### mCherry counts table ###
# Note: mCherry barcodes need to be converted to the reverse complement.
# The file name matches the destfile used by the download step at the top of
# this script ("..._reseq_final.txt"); the previous "..._reseq.txt" name is
# never written by that step.
remCherryCounts <- read.delim("./NIJK04_mCherry_bccounts_reseq_final.txt", header = TRUE, stringsAsFactors = FALSE)
colnames(remCherryCounts) <- c('mCherry_barcode', '2h_mCherry1', '5h_mCherry1',
                               '2h_mCherry2', '5h_mCherry2',
                               '2h_mCherry3', '5h_mCherry3')
remCherryCounts$sum <- rowSums(remCherryCounts[ , c(2:7)], na.rm = TRUE)   # New column with the sum of all counts
remCherryCountsFiltered <- remCherryCounts[remCherryCounts$sum >= 24, ]    # Remove rows where sum < 24
View(remCherryCounts)
View(remCherryCountsFiltered)

# Write reverse comp of barcode to match to look up table.
# Vectorised over the whole column via DNAStringSet rather than one DNAString
# per barcode; also returns character(0) instead of list() for empty input.
remCherryCountsFiltered$RevComp <- as.character(
  reverseComplement(DNAStringSet(remCherryCountsFiltered$mCherry_barcode))
)
View(counts_final)

# Full outer join of the filtered original counts with the re-sequenced iRFP
# counts, matching bc1 to the iRFP barcode (the 'sum' helper column is left out).
reirfp_merge_cols <- c("iRFP_barcode",
                       "2h_iRFP1", "5h_iRFP1",
                       "2h_iRFP2", "5h_iRFP2",
                       "2h_iRFP3", "5h_iRFP3")
reiRfp_counts_final <- merge(NIJK04_counts_final, reiRfpCountsFiltered[, reirfp_merge_cols],
                             by.x = "bc1_neighborhood", by.y = "iRFP_barcode",
                             all = TRUE)
View(reiRfp_counts_final)

# Full outer join of that result with the re-sequenced mCherry counts, matching
# bc2 to the reverse-complemented mCherry barcode.
remcherry_merge_cols <- c("RevComp",
                          "2h_mCherry1", "5h_mCherry1",
                          "2h_mCherry2", "5h_mCherry2",
                          "2h_mCherry3", "5h_mCherry3")
remCh_counts_final <- merge(reiRfp_counts_final, remCherryCountsFiltered[, remcherry_merge_cols],
                            by.x = "bc2_neighborhood", by.y = "RevComp",
                            all = TRUE)
View(remCh_counts_final)

# Now we've finished merging the re-sequenced barcode counts with the original barcode counts.

# Remove rows where bc1 or bc2 is NA, meaning it wasn't seen in the PacBio sequencing run and we don't know
# which guides bc1 and bc2 link to
counts_final2 <- remCh_counts_final %>% filter(!is.na(bc1_neighborhood) & !is.na(bc2_neighborhood))
counts_final2[is.na(counts_final2)] <- 0  # Change all NA values to 0

# Add each re-sequenced 2h/5h count to the matching original count.
# All NAs were replaced by 0 just above, so plain vectorised `+` gives the same
# totals as the former rowwise() sum(c(a, b)) while avoiding a per-row loop and
# not leaving a rowwise grouping attached to the result.
counts_final2_summed <- counts_final2 %>%
  mutate(sum_iRFP1_2h = iRFP1_2h + `2h_iRFP1`,
         sum_iRFP1_5h = iRFP1_5h + `5h_iRFP1`,
         sum_iRFP2_2h = iRFP2_2h + `2h_iRFP2`,
         sum_iRFP2_5h = iRFP2_5h + `5h_iRFP2`,
         sum_iRFP3_2h = iRFP3_2h + `2h_iRFP3`,
         sum_iRFP3_5h = iRFP3_5h + `5h_iRFP3`,
         sum_mCherry1_2h = mCherry1_2h + `2h_mCherry1`,
         sum_mCherry1_5h = mCherry1_5h + `5h_mCherry1`,
         sum_mCherry2_2h = mCherry2_2h + `2h_mCherry2`,
         sum_mCherry2_5h = mCherry2_5h + `5h_mCherry2`,
         sum_mCherry3_2h = mCherry3_2h + `2h_mCherry3`,
         sum_mCherry3_5h = mCherry3_5h + `5h_mCherry3`)

# Keep the lookup columns, the (unchanged) 0h counts and the combined 2h/5h
# counts; the per-run 2h/5h columns are dropped.
counts_final2_clean <- counts_final2_summed %>%
  select(bc2_neighborhood, bc1_neighborhood, sgRNA1, sgRNA2, count, distinct_count_bc1, distinct_count_bc2,
         distinct_count_bc1_true, distinct_count_bc2_true, bcs_perguide, iRFP1_0h, sum_iRFP1_2h, sum_iRFP1_5h,
         iRFP2_0h, sum_iRFP2_2h, sum_iRFP2_5h, iRFP3_0h, sum_iRFP3_2h, sum_iRFP3_5h, 
         mCherry1_0h, sum_mCherry1_2h, sum_mCherry1_5h, mCherry2_0h,
         sum_mCherry2_2h, sum_mCherry2_5h, mCherry3_0h, sum_mCherry3_2h, sum_mCherry3_5h)

# Average barcode count over the 18 sample columns selected above
counts_final2_clean_sum <- counts_final2_clean %>%
  mutate(avg_BC = rowSums(across(iRFP1_0h:sum_mCherry3_5h)) / 18) # find avg bc count

# Final NIJK04 barcode counts
NIJK04_counts_final <- counts_final2_clean_sum[counts_final2_clean_sum$avg_BC >= 12, ]    # Remove rows where avg count < 12
View(NIJK04_counts_final)

# Final NIJK04 gRNA counts
# Sum every barcode belonging to the same sgRNA1, for each of the 18 sample
# columns. The range iRFP1_0h:sum_mCherry3_5h relies on those 18 columns being
# contiguous in NIJK04_counts_final (they are selected in that order above).
# The "sum_" prefix is stripped afterwards so the output columns are named
# <reporter><rep>_<timepoint>, e.g. iRFP1_2h.
# (A preceding select() whose result was immediately overwritten was removed.)
NIJK04_countsbygRNA <- NIJK04_counts_final %>%
  group_by(sgRNA1) %>%
  summarize(across(iRFP1_0h:sum_mCherry3_5h, ~ sum(.x, na.rm = TRUE))) %>%
  rename_with(~ sub("^sum_", "", .x))
View(NIJK04_countsbygRNA)

# Draw a log2-vs-log2 scatter of two per-gRNA count columns and save it as a
# 5 x 4 inch, 300 dpi TIFF.
#   counts    data frame with one row per gRNA
#   x_col     name of the count column shown on the x axis
#   y_col     name of the count column shown on the y axis
#   x_lab     x-axis label
#   y_lab     y-axis label
#   out_file  path of the TIFF file to write
# Returns the ggplot object invisibly.
# The plot is print()ed explicitly: a ggplot object is only rendered when it is
# printed, which does not happen automatically when the script is run through
# source(), so the TIFF would otherwise be empty.
plot_count_comparison <- function(counts, x_col, y_col, x_lab, y_lab, out_file) {
  log_counts <- data.frame(x = log2(counts[[x_col]]), y = log2(counts[[y_col]]))
  scatter <- ggplot(log_counts, aes(x = x, y = y)) + geom_point(alpha = 0.1) +
    xlim(0, 15) + ylim(0, 15) + xlab(x_lab) + ylab(y_lab) +
    theme(text = element_text(size = 15), panel.grid.major = element_blank(), panel.grid.minor = element_blank(),
          panel.background = element_blank(), axis.line = element_line(colour = "black"))
  tiff(out_file, units = "in", width = 5, height = 4, res = 300)
  on.exit(dev.off(), add = TRUE)  # close the device even if rendering fails
  print(scatter)
  invisible(scatter)
}

# iRFP vs mCherry at 2h, one plot and one Spearman correlation per replicate
plot_count_comparison(NIJK04_countsbygRNA, "iRFP1_2h", "mCherry1_2h",
                      "log2(Rep1 iRFP 2h-TNFa counts)", "log2(Rep1 mCherry 2h-TNFa counts)",
                      "~/NIJK04/comparison1.tiff")
cor.test(NIJK04_countsbygRNA$iRFP1_2h, NIJK04_countsbygRNA$mCherry1_2h, method = "spearman")

plot_count_comparison(NIJK04_countsbygRNA, "iRFP2_2h", "mCherry2_2h",
                      "log2(Rep2 iRFP 2h-TNFa counts)", "log2(Rep2 mCherry 2h-TNFa counts)",
                      "~/NIJK04/comparison2.tiff")
cor.test(NIJK04_countsbygRNA$iRFP2_2h, NIJK04_countsbygRNA$mCherry2_2h, method = "spearman")

plot_count_comparison(NIJK04_countsbygRNA, "iRFP3_2h", "mCherry3_2h",
                      "log2(Rep3 iRFP 2h-TNFa counts)", "log2(Rep3 mCherry 2h-TNFa counts)",
                      "~/NIJK04/comparison3.tiff")
cor.test(NIJK04_countsbygRNA$iRFP3_2h, NIJK04_countsbygRNA$mCherry3_2h, method = "spearman")

# iRFP 2h vs iRFP 5h, one plot and one Spearman correlation per replicate
plot_count_comparison(NIJK04_countsbygRNA, "iRFP1_2h", "iRFP1_5h",
                      "log2(Rep1 iRFP 2h-TNFa counts)", "log2(Rep1 iRFP 5h-TNFa counts)",
                      "~/NIJK04/comparison4.tiff")
cor.test(NIJK04_countsbygRNA$iRFP1_2h, NIJK04_countsbygRNA$iRFP1_5h, method = "spearman")

plot_count_comparison(NIJK04_countsbygRNA, "iRFP2_2h", "iRFP2_5h",
                      "log2(Rep2 iRFP 2h-TNFa counts)", "log2(Rep2 iRFP 5h-TNFa counts)",
                      "~/NIJK04/comparison5.tiff")
cor.test(NIJK04_countsbygRNA$iRFP2_2h, NIJK04_countsbygRNA$iRFP2_5h, method = "spearman")

plot_count_comparison(NIJK04_countsbygRNA, "iRFP3_2h", "iRFP3_5h",
                      "log2(Rep3 iRFP 2h-TNFa counts)", "log2(Rep3 iRFP 5h-TNFa counts)",
                      "~/NIJK04/comparison6.tiff")
cor.test(NIJK04_countsbygRNA$iRFP3_2h, NIJK04_countsbygRNA$iRFP3_5h, method = "spearman")

#######################################################
### STEP 2: NORMALIZING NFKB COUNTS TO DUAL SFFV COUNTS
#######################################################

# Load the dual-SFFV counts produced by the NIJK03 script
NIJK03_SFFV_counts <- read.csv("./NIJK03_counts_fc.csv", header = TRUE)
View(NIJK03_SFFV_counts)

# Keep the barcode/guide identifiers and only the post-dox (CRISPRi induced)
# samples, tagging those count columns with an _SFFV suffix as they are selected.
NIJK03_SFFV_counts <- NIJK03_SFFV_counts %>%
  select(bc2_neighborhood, bc1_neighborhood, sgRNA1, sgRNA2,
         post_iRFP1_SFFV = post_iRFP1,
         post_iRFP2_SFFV = post_iRFP2,
         post_iRFP3_SFFV = post_iRFP3,
         post_mCherry1_SFFV = post_mCherry1,
         post_mCherry2_SFFV = post_mCherry2,
         post_mCherry3_SFFV = post_mCherry3)

# Merge key: "<bc1>-<bc2>-<sgRNA1>"
NIJK03_SFFV_counts[["bc_id"]] <- with(
  NIJK03_SFFV_counts,
  paste(bc1_neighborhood, bc2_neighborhood, sgRNA1, sep = "-")
)
# Number of repeated keys in the SFFV table
sum(duplicated(NIJK03_SFFV_counts[["bc_id"]]))

# NFKB reporter counts: identifiers plus the 18 sample columns (0h as measured,
# 2h/5h as the original + re-sequenced totals).
NIJK04_NFKB_counts <- select(NIJK04_counts_final, bc2_neighborhood, bc1_neighborhood,
                             sgRNA1, sgRNA2, iRFP1_0h, sum_iRFP1_2h, sum_iRFP1_5h, iRFP2_0h, sum_iRFP2_2h, sum_iRFP2_5h,
                             iRFP3_0h, sum_iRFP3_2h, sum_iRFP3_5h, mCherry1_0h, sum_mCherry1_2h, sum_mCherry1_5h,
                             mCherry2_0h, sum_mCherry2_2h, sum_mCherry2_5h, mCherry3_0h, sum_mCherry3_2h, sum_mCherry3_5h)
# Same "<bc1>-<bc2>-<sgRNA1>" key as used for the SFFV table
NIJK04_NFKB_counts$bc_id <- paste(NIJK04_NFKB_counts$bc1_neighborhood, NIJK04_NFKB_counts$bc2_neighborhood, 
                                  NIJK04_NFKB_counts$sgRNA1, sep="-")
# Check the key for duplicates in the table just built (this previously
# re-checked NIJK03_SFFV_counts, leaving the NFKB keys unverified).
sum(duplicated(NIJK04_NFKB_counts$bc_id))
View(NIJK04_NFKB_counts)

# Full outer join on bc_id, then drop every row containing an NA, which keeps
# only barcode pairs present in both experiments.
NIJK03_04_counts <- merge(x = NIJK03_SFFV_counts, y = NIJK04_NFKB_counts, 
                          by.x='bc_id', by.y='bc_id', all.x = TRUE, all.y = TRUE)
NIJK03_04_counts_clean <- na.omit(NIJK03_04_counts)
View(NIJK03_04_counts_clean)

# Sum barcodes by gRNA
# One row per sgRNA1.x with the six SFFV columns followed by the 18 NFKB
# columns. The two column ranges rely on the SFFV and NFKB count columns each
# being contiguous in the merged table (they are selected in that order when
# the two inputs are built). The "sum_" prefix is stripped so the NFKB columns
# are named <reporter><rep>_<timepoint>. na.omit() has already removed every
# NA, so na.rm = TRUE does not change the SFFV totals.
NIJK03_04_countsbygRNA <- NIJK03_04_counts_clean %>%
  group_by(sgRNA1.x) %>%
  summarize(across(c(post_iRFP1_SFFV:post_mCherry3_SFFV, iRFP1_0h:sum_mCherry3_5h),
                   ~ sum(.x, na.rm = TRUE))) %>%
  rename_with(~ sub("^sum_", "", .x))
View(NIJK03_04_countsbygRNA)

#########################################################################################
### STEP 3A: Deseq2 on NFKB 5h TNF-alpha induction timepoint normalized to dual SFFV data
#########################################################################################

# Count table for DESeq2: gRNA name first (used as the row identifier because
# tidy = TRUE below), then the six SFFV samples and the six NFKB 5h samples.
deseq2_v1 <- NIJK03_04_countsbygRNA %>%
  select(sgRNA1.x,
         post_iRFP1_SFFV, post_iRFP2_SFFV, post_iRFP3_SFFV,
         post_mCherry1_SFFV, post_mCherry2_SFFV, post_mCherry3_SFFV,
         iRFP1_5h, iRFP2_5h, iRFP3_5h,
         mCherry1_5h, mCherry2_5h, mCherry3_5h) %>%
  data.frame()
View(deseq2_v1)

# Sample sheet, one row per count column in the same order as above.
# SFFV samples carry replicate labels 1/2/3 and NFKB samples 1/4/5.
coldata_v1 <- data.frame(
  id = c("post_iRFP1_SFFV", "post_iRFP2_SFFV", "post_iRFP3_SFFV",
         "post_mCherry1_SFFV", "post_mCherry2_SFFV", "post_mCherry3_SFFV",
         "iRFP1_5h_NFKB", "iRFP2_5h_NFKB", "iRFP3_5h_NFKB",
         "mCherry1_5h_NFKB", "mCherry2_5h_NFKB", "mCherry3_5h_NFKB"),
  replicate = c(rep(c("1", "2", "3"), times = 2), rep(c("1", "4", "5"), times = 2)),
  barcode = rep(rep(c("iRFP", "mCherry"), each = 3), times = 2),
  promoter = rep(c("SFFV", "NFKB"), each = 6)
)
View(coldata_v1)

# Model: replicate effect plus barcode, promoter and their interaction
dds_v1 <- DESeqDataSetFromMatrix(countData = deseq2_v1,
                                 colData = coldata_v1,
                                 design = ~replicate + barcode * promoter,
                                 tidy = TRUE)

# Reference levels: iRFP for barcode, SFFV for promoter
dds_v1$barcode <- factor(dds_v1$barcode, levels = c("iRFP", "mCherry"))
dds_v1$promoter <- factor(dds_v1$promoter, levels = c("SFFV", "NFKB"))

# Fit the model and inspect coefficient names and dispersion estimates
dds_v1 <- DESeq(dds_v1)
resultsNames(dds_v1)
plotDispEsts(dds_v1)

# Extract the barcode:promoter interaction term (mCherry x NFKB)
res_v1 <- results(dds_v1, name = "barcodemCherry.promoterNFKB")
summary(res_v1)
res_v1_barcodemCherry_promoterNFKB <- tibble::rownames_to_column(as.data.frame(res_v1), "sgRNA")

# sgRNA_name is the part of the sgRNA identifier before the first underscore
res_v1_barcodemCherry_promoterNFKB$sgRNA_name <- sapply(
  strsplit(as.character(res_v1_barcodemCherry_promoterNFKB$sgRNA), "_"),
  "[", 1
)
View(res_v1_barcodemCherry_promoterNFKB)

# Clean plot by flooring padj at 1e-20 and capping log2FoldChange at +10
# (NA values stay NA).
# NOTE(review): only the positive fold-change side is capped; any point with
# log2FoldChange < -11 falls outside xlim(-11, 11) below and is dropped from
# the plot - confirm this is intended.
res_v1_barcodemCherry_promoterNFKBadj <- res_v1_barcodemCherry_promoterNFKB
res_v1_barcodemCherry_promoterNFKBadj$padj <- pmax(res_v1_barcodemCherry_promoterNFKBadj$padj, 1e-20)
res_v1_barcodemCherry_promoterNFKBadj$log2FoldChange <- pmin(res_v1_barcodemCherry_promoterNFKBadj$log2FoldChange, 10)
View(res_v1_barcodemCherry_promoterNFKBadj)

# Label column: gene name for significant, down-regulated members of the gene
# set, empty string otherwise.
# NOTE(review): go_0043122_reg_canonical_nfkb is not defined in the visible part
# of this script; it must already exist in the session - confirm where it is built.
res_v1_barcodemCherry_promoterNFKBadj <- res_v1_barcodemCherry_promoterNFKBadj %>% 
  mutate(sgRNA_label = ifelse(sgRNA_name %in% go_0043122_reg_canonical_nfkb &
                                log2FoldChange < -1 & padj <0.01, sgRNA_name, ""))

# Volcano plot
# Layers are drawn in order, so later layers sit on top: all guides (gray),
# significant hits, non-targeting controls (black), gene-set members, then the
# two highlighted genes GNG12 and NELFA.
volc_5h <- res_v1_barcodemCherry_promoterNFKBadj
volcano_5h <- ggplot(volc_5h, aes(x=log2FoldChange, y=-log10(padj))) +
  geom_point(na.rm = TRUE, size=2, colour = "gray", alpha = 0.3) +
  geom_point(data = subset(volc_5h, padj < 0.01 & (log2FoldChange > 1 | log2FoldChange < -1)),
             aes(x=log2FoldChange, y=-log10(padj)), size=2, colour = "#ad82a4", alpha = 0.3) +
  geom_point(data = subset(volc_5h, sgRNA_name == "non-targeting"),
             aes(x=log2FoldChange, y=-log10(padj)), size=2, colour = "black") + # Color points for non-targeting
  geom_point(data = subset(volc_5h, sgRNA_name %in% go_0043122_reg_canonical_nfkb),
             aes(x=log2FoldChange, y=-log10(padj)), size=2, colour = "#6368B0") +
  geom_point(data = subset(volc_5h, sgRNA_name %in% c("GNG12", "NELFA")),
             aes(x=log2FoldChange, y=-log10(padj)), size=2, colour = "#D5B60A") + 
  geom_hline(yintercept=2, linetype="dashed", color = "black", size=1) +
  geom_vline(xintercept=-1, linetype="dashed", color = "black", size=1) +
  geom_vline(xintercept=1, linetype="dashed", color = "black", size=1) +
  theme_bw() + xlim(-11, 11) +
  labs(x="log2FoldChange", y="-log10(adjusted p-value)") +
  theme(axis.text.x=element_text(hjust=1, size = 14), axis.text.y = element_text(size = 14),
        axis.title = element_text(size = 15),
        panel.border = element_blank(), panel.grid.major = element_blank(),
        panel.grid.minor = element_blank(), axis.line = element_line(colour = "black"))

# print() is required for the plot to be rendered into the TIFF device when the
# script is run via source(); a bare ggplot expression is not auto-printed there.
tiff("~/NIJK04/mCherry_vs_iRFP_5h.tiff", units="in", width=5, height=4, res=300)
print(volcano_5h)
dev.off()

# Row counts: all guides, gene-set members, significant hits, non-targeting controls
nrow(res_v1_barcodemCherry_promoterNFKB)
nrow(subset(res_v1_barcodemCherry_promoterNFKB, sgRNA_name %in% go_0043122_reg_canonical_nfkb))
nrow(subset(res_v1_barcodemCherry_promoterNFKB, padj < 0.01 & (log2FoldChange < -1 | log2FoldChange > 1)))
nrow(subset(res_v1_barcodemCherry_promoterNFKB, sgRNA_name == "non-targeting"))


#########################################################################################
### STEP 3B: Deseq2 on NFKB 2h TNF-alpha induction timepoint normalized to dual SFFV data
#########################################################################################

# Count table for DESeq2: gRNA name first (used as the row identifier because
# tidy = TRUE below), then the six SFFV samples and the six NFKB 2h samples.
deseq2_v2 <- NIJK03_04_countsbygRNA %>%
  select(sgRNA1.x,
         post_iRFP1_SFFV, post_iRFP2_SFFV, post_iRFP3_SFFV,
         post_mCherry1_SFFV, post_mCherry2_SFFV, post_mCherry3_SFFV,
         iRFP1_2h, iRFP2_2h, iRFP3_2h,
         mCherry1_2h, mCherry2_2h, mCherry3_2h) %>%
  data.frame()
View(deseq2_v2)

# Sample sheet, one row per count column in the same order as above.
# SFFV samples carry replicate labels 1/2/3 and NFKB samples 1/4/5.
coldata_v2 <- data.frame(
  id = c("post_iRFP1_SFFV", "post_iRFP2_SFFV", "post_iRFP3_SFFV",
         "post_mCherry1_SFFV", "post_mCherry2_SFFV", "post_mCherry3_SFFV",
         "iRFP1_2h_NFKB", "iRFP2_2h_NFKB", "iRFP3_2h_NFKB",
         "mCherry1_2h_NFKB", "mCherry2_2h_NFKB", "mCherry3_2h_NFKB"),
  replicate = c(rep(c("1", "2", "3"), times = 2), rep(c("1", "4", "5"), times = 2)),
  barcode = rep(rep(c("iRFP", "mCherry"), each = 3), times = 2),
  promoter = rep(c("SFFV", "NFKB"), each = 6)
)
View(coldata_v2)

# Model: replicate effect plus barcode, promoter and their interaction
dds_v2 <- DESeqDataSetFromMatrix(countData = deseq2_v2,
                                 colData = coldata_v2,
                                 design = ~replicate + barcode * promoter,
                                 tidy = TRUE)

# Reference levels: iRFP for barcode, SFFV for promoter
dds_v2$barcode <- factor(dds_v2$barcode, levels = c("iRFP", "mCherry"))
dds_v2$promoter <- factor(dds_v2$promoter, levels = c("SFFV", "NFKB"))

# Fit the model and inspect coefficient names and dispersion estimates
dds_v2 <- DESeq(dds_v2)
resultsNames(dds_v2)
plotDispEsts(dds_v2)

# Extract the barcode:promoter interaction term (mCherry x NFKB)
res_v2 <- results(dds_v2, name = "barcodemCherry.promoterNFKB")
summary(res_v2)
res_v2_barcodemCherry_promoterNFKB <- tibble::rownames_to_column(as.data.frame(res_v2), "sgRNA")

# sgRNA_name is the part of the sgRNA identifier before the first underscore
res_v2_barcodemCherry_promoterNFKB$sgRNA_name <- sapply(
  strsplit(as.character(res_v2_barcodemCherry_promoterNFKB$sgRNA), "_"),
  "[", 1
)

View(res_v2_barcodemCherry_promoterNFKB)

# Row counts: all guides, gene-set members, significant hits, non-targeting controls.
# NOTE(review): go_0043122_reg_canonical_nfkb is not defined in the visible part
# of this script; it must already exist in the session - confirm where it is built.
nrow(res_v2_barcodemCherry_promoterNFKB)
nrow(subset(res_v2_barcodemCherry_promoterNFKB, sgRNA_name %in% go_0043122_reg_canonical_nfkb))
nrow(subset(res_v2_barcodemCherry_promoterNFKB, padj < 0.01 & (log2FoldChange < -1 | log2FoldChange > 1)))
nrow(subset(res_v2_barcodemCherry_promoterNFKB, sgRNA_name == "non-targeting"))

# Clean plot by flooring padj at 1e-20 and capping log2FoldChange at +10
# (NA values stay NA).
# NOTE(review): only the positive fold-change side is capped; any point with
# log2FoldChange < -11 falls outside xlim(-11, 11) below and is dropped from
# the plot - confirm this is intended.
res_v2_barcodemCherry_promoterNFKBadj <- res_v2_barcodemCherry_promoterNFKB
res_v2_barcodemCherry_promoterNFKBadj$padj <- pmax(res_v2_barcodemCherry_promoterNFKBadj$padj, 1e-20)
res_v2_barcodemCherry_promoterNFKBadj$log2FoldChange <- pmin(res_v2_barcodemCherry_promoterNFKBadj$log2FoldChange, 10)

# Label column: gene name for significant, down-regulated members of the gene
# set, empty string otherwise.
res_v2_barcodemCherry_promoterNFKBadj <- res_v2_barcodemCherry_promoterNFKBadj %>% 
  mutate(sgRNA_label = ifelse(sgRNA_name %in% go_0043122_reg_canonical_nfkb &
                                log2FoldChange < -1 & padj <0.01, sgRNA_name, ""))
View(res_v2_barcodemCherry_promoterNFKBadj)

# Volcano plot, 2h treatment
# Layers are drawn in order, so later layers sit on top: all guides (gray),
# significant hits, non-targeting controls (black), gene-set members, then the
# two highlighted genes GNG12 and NELFA.
volc_2h <- res_v2_barcodemCherry_promoterNFKBadj
volcano_2h <- ggplot(volc_2h, aes(x=log2FoldChange, y=-log10(padj))) +
  geom_point(na.rm = TRUE, size=2, colour = "gray", alpha = 0.3) +
  geom_point(data = subset(volc_2h, padj < 0.01 & (log2FoldChange > 1 | log2FoldChange < -1)),
             aes(x=log2FoldChange, y=-log10(padj)), size=2, colour = "#ad82a4", alpha = 0.3) +
  geom_point(data = subset(volc_2h, sgRNA_name == "non-targeting"),
             aes(x=log2FoldChange, y=-log10(padj)), size=2, colour = "black") + # Color points for non-targeting
  geom_point(data = subset(volc_2h, sgRNA_name %in% go_0043122_reg_canonical_nfkb),
             aes(x=log2FoldChange, y=-log10(padj)), size=2, colour = "#6368B0") +
  geom_point(data = subset(volc_2h, sgRNA_name %in% c("GNG12", "NELFA")),
             aes(x=log2FoldChange, y=-log10(padj)), size=2, colour = "#D5B60A") + 
  geom_hline(yintercept=2, linetype="dashed", color = "black", size=1) +
  geom_vline(xintercept=-1, linetype="dashed", color = "black", size=1) +
  geom_vline(xintercept=1, linetype="dashed", color = "black", size=1) +
  theme_bw() + xlim(-11, 11) +
  labs(x="log2FoldChange", y="-log10(adjusted p-value)") +
  theme(axis.text.x=element_text(hjust=1, size = 14), axis.text.y = element_text(size = 14),
        axis.title = element_text(size = 15),
        panel.border = element_blank(), panel.grid.major = element_blank(),
        panel.grid.minor = element_blank(), axis.line = element_line(colour = "black"))

# print() is required for the plot to be rendered into the TIFF device when the
# script is run via source(); a bare ggplot expression is not auto-printed there.
tiff("~/NIJK04/mCherry_vs_iRFP_2h.tiff", units="in", width=5, height=4, res=300)
print(volcano_2h)
dev.off()

# Make GO list
# Guides with log2FoldChange <= -1.5 and padj <= 0.01, written as a headerless
# two-column TSV (gene name, log2FoldChange).
# which() drops rows where the condition is NA (for example an NA padj);
# indexing a data frame directly with a logical vector containing NA would
# instead insert all-NA rows, which would then be written to the file as "NA NA".
go1_hit_rows <- which(res_v2_barcodemCherry_promoterNFKB$log2FoldChange <= -1.5 &
                        res_v2_barcodemCherry_promoterNFKB$padj <= 0.01)
go1_res_v2_barcodemCherry_promoterNFKB <-
  res_v2_barcodemCherry_promoterNFKB[go1_hit_rows, c("sgRNA_name", "log2FoldChange")]
View(go1_res_v2_barcodemCherry_promoterNFKB)
write.table(go1_res_v2_barcodemCherry_promoterNFKB, "./DESeq_go1_res_v2_barcodemCherry_promoterNFKB.tsv",
            quote = FALSE, row.names=FALSE, col.names = FALSE, sep="\t")

###############################################################################################
### STEP 3C: Deseq2 on NFKB 5h vs 2h TNF-alpha induction timepoint normalized to dual SFFV data
###############################################################################################

# Count table for this comparison: the sgRNA identifier column followed by the
# SFFV, 2h and 5h count columns for both barcodes.
# select() is namespace-qualified because other packages (e.g. biomaRt, which
# this script attaches later) also export a select() that masks dplyr's and
# breaks this call when the script is re-run in the same session.
deseq2_v5 <- dplyr::select(NIJK03_04_countsbygRNA, sgRNA1.x,
                           post_iRFP1_SFFV, post_iRFP2_SFFV, post_iRFP3_SFFV,
                           post_mCherry1_SFFV, post_mCherry2_SFFV, post_mCherry3_SFFV,
                           iRFP1_2h, iRFP2_2h, iRFP3_2h, mCherry1_2h, mCherry2_2h, mCherry3_2h,
                           iRFP1_5h, iRFP2_5h, iRFP3_5h, mCherry1_5h, mCherry2_5h, mCherry3_5h)
deseq2_v5 <- data.frame(deseq2_v5)
if (interactive()) {
  View(deseq2_v5)
}

# Sample metadata, one row per count column, listed in the same order as the
# count columns selected above (SFFV, then 2h, then 5h; within each, three
# iRFP samples followed by three mCherry samples).
coldata_v5 <- data.frame(id = c("post_iRFP1_SFFV", "post_iRFP2_SFFV", "post_iRFP3_SFFV",
                                "post_mCherry1_SFFV", "post_mCherry2_SFFV", "post_mCherry3_SFFV",
                                "iRFP1_2h_NFKB", "iRFP2_2h_NFKB", "iRFP3_2h_NFKB",
                                "mCherry1_2h_NFKB", "mCherry2_2h_NFKB", "mCherry3_2h_NFKB",
                                "iRFP1_5h_NFKB", "iRFP2_5h_NFKB", "iRFP3_5h_NFKB",
                                "mCherry1_5h_NFKB", "mCherry2_5h_NFKB", "mCherry3_5h_NFKB"),
                         # Replicate labels: 1, 2, 3 for the SFFV samples and
                         # 1, 4, 5 for every NFKB group.
                         replicate = c(rep(c("1", "2", "3"), times = 2),
                                       rep(c("1", "4", "5"), times = 4)),
                         barcode = rep(c("iRFP", "mCherry"), each = 3, times = 3),
                         promoter = rep(c("SFFV", "NFKB_2h", "NFKB_5h"), each = 6))
if (interactive()) {
  View(coldata_v5)
}

# tidy = TRUE: the first column of deseq2_v5 (the sgRNA identifier) is taken as
# the row names and the remaining columns as counts.
dds_v5 <- DESeqDataSetFromMatrix(countData = deseq2_v5,
                                 colData = coldata_v5,
                                 design = ~replicate + barcode * promoter, tidy = TRUE)
# Design formula takes into account the factor variables replicate, barcode, and promoter
# and also accounts for the interaction between barcode and promoter
# (the barcodemCherry.promoterNFKB_* terms used in the contrast below).

# Set custom reference levels (iRFP barcode, SFFV promoter)
dds_v5$barcode <- factor(dds_v5$barcode, levels = c("iRFP", "mCherry"))
dds_v5$promoter <- factor(dds_v5$promoter, levels = c("SFFV", "NFKB_2h", "NFKB_5h"))

# Run DESeq
dds_v5 <- DESeq(dds_v5)
# Explicit print() so the coefficient names are shown under source() as well.
print(resultsNames(dds_v5))
plotDispEsts(dds_v5)

# contrast = list(numerator, denominator): the reported log2FoldChange is the
# mCherry:NFKB_2h interaction minus the mCherry:NFKB_5h interaction (2h vs 5h).
res_v5 <- results(dds_v5, contrast = list(c("barcodemCherry.promoterNFKB_2h"),
                                          c("barcodemCherry.promoterNFKB_5h")))
summary(res_v5)
res_v5_barcodemCherry_promoterNFKB_2hvs5h <- as.data.frame(res_v5)
res_v5_barcodemCherry_promoterNFKB_2hvs5h <- tibble::rownames_to_column(res_v5_barcodemCherry_promoterNFKB_2hvs5h, "sgRNA")

# sgRNA_name is the part of the sgRNA identifier before the first "_".
# vapply() always yields a character vector, unlike sapply(), whose return type
# depends on the input (e.g. a list for zero rows).
res_v5_barcodemCherry_promoterNFKB_2hvs5h$sgRNA_name <- vapply(
  strsplit(as.character(res_v5_barcodemCherry_promoterNFKB_2hvs5h$sgRNA), "_"),
  function(x) x[1],
  character(1)
)

if (interactive()) {
  View(res_v5_barcodemCherry_promoterNFKB_2hvs5h)
}

# Volcano plot of the 2h vs 5h contrast.
#   gray   : all sgRNAs
#   purple : padj < 0.01 and |log2FoldChange| > 1
#   black  : non-targeting sgRNAs
# Dashed lines mark padj = 0.01 (-log10 = 2) and log2FoldChange = -1 / 1.
# The plot is built before the device is opened, so an error while building it
# cannot leave a TIFF device open.
# NOTE(review): ggplot2 >= 3.4.0 prefers `linewidth` over `size` for line geoms;
# `size` is kept here so the script still runs on older ggplot2 installs.
volcano_2hvs5h <- ggplot(res_v5_barcodemCherry_promoterNFKB_2hvs5h, aes(x = log2FoldChange, y = -log10(padj))) +
  geom_point(na.rm = TRUE, size = 2, colour = "gray", alpha = 1) +
  geom_point(data = subset(res_v5_barcodemCherry_promoterNFKB_2hvs5h, padj < 0.01 & (log2FoldChange > 1 | log2FoldChange < -1)),
             aes(x = log2FoldChange, y = -log10(padj)), size = 2, colour = "#ad82a4", alpha = 0.3) +
  geom_point(data = subset(res_v5_barcodemCherry_promoterNFKB_2hvs5h, sgRNA_name == "non-targeting"),
             aes(x = log2FoldChange, y = -log10(padj)), size = 2, colour = "black") + # Color points for non-targeting
  geom_hline(yintercept = 2, linetype = "dashed", color = "black", size = 1) +
  geom_vline(xintercept = -1, linetype = "dashed", color = "black", size = 1) +
  geom_vline(xintercept = 1, linetype = "dashed", color = "black", size = 1) +
  theme_bw() + xlim(-11, 11) +
  labs(x = "log2FoldChange", y = "-log10(adjusted p-value)") +
  theme(axis.text.x = element_text(hjust = 1, size = 14), axis.text.y = element_text(size = 14),
        axis.title = element_text(size = 15),
        panel.border = element_blank(), panel.grid.major = element_blank(),
        panel.grid.minor = element_blank(), axis.line = element_line(colour = "black"))

tiff("~/NIJK04/mCherry_vs_iRFP_2hvs5h.tiff", units = "in", width = 5, height = 4, res = 300)
# A ggplot object is only drawn when printed. Top-level auto-printing does not
# happen under source(), which would leave an empty TIFF, so print explicitly.
print(volcano_2hvs5h)
dev.off()

####################################
### STEP 4: ATTACH GENE DESCRIPTIONS
####################################

# https://www.bioconductor.org/help/course-materials/2015/UseBioconductorFeb2015/A01.5_Annotation.html

library("biomaRt")

# Show the available marts (explicit print() so this also appears under source()).
print(listMarts())
ensembl <- useMart(biomart = "ENSEMBL_MART_ENSEMBL", dataset = "hsapiens_gene_ensembl")

# Browsing the available filters/attributes is only useful interactively;
# View() is skipped when the script is run via Rscript.
if (interactive()) {
  View(listFilters(ensembl))
}
# NOTE(review): myFilter is not used by any visible call in this section;
# kept in case it is referenced elsewhere.
myFilter <- "entrezgene_accession"
if (interactive()) {
  View(listAttributes(ensembl))
}

# No filters are passed, so this retrieves name, biotype and description for
# every gene in the dataset.
annots <- getBM(attributes = c("external_gene_name", "gene_biotype", "description"),
                mart = ensembl)
if (interactive()) {
  View(annots)
}

# Keep protein-coding genes only. %in% never yields NA, so a missing
# gene_biotype cannot introduce all-NA rows the way `==` indexing would.
wantedAnnots <- annots[annots$gene_biotype %in% "protein_coding", ]
if (interactive()) {
  View(wantedAnnots)
}

# Attach gene descriptions to previous DESeq2 results.
# Joined on the gene symbol (sgRNA_name in the results, external_gene_name in
# the annotations).
# - One annotation row is kept per gene symbol, so a symbol that appears more
#   than once in the annotation table cannot duplicate result rows.
# - all.x = TRUE only (left join): every result row is kept, including those
#   with no matching annotation (their annotation columns are NA). The previous
#   all.y = TRUE also appended a row of NA statistics for every protein-coding
#   gene that is absent from the results.
wantedAnnotsUnique <- wantedAnnots[!duplicated(wantedAnnots$external_gene_name), ]

res_v1_barcodemCherry_promoterNFKB_annots <- merge(x = res_v1_barcodemCherry_promoterNFKB, y = wantedAnnotsUnique,
                                                   by.x = "sgRNA_name", by.y = "external_gene_name", all.x = TRUE)
if (interactive()) {
  View(res_v1_barcodemCherry_promoterNFKB_annots)
}

res_v2_barcodemCherry_promoterNFKB_annots <- merge(x = res_v2_barcodemCherry_promoterNFKB, y = wantedAnnotsUnique,
                                                   by.x = "sgRNA_name", by.y = "external_gene_name", all.x = TRUE)
if (interactive()) {
  View(res_v2_barcodemCherry_promoterNFKB_annots)
}

