### GOAL: Build lookup table using data from NIJL009 that contains read_name, guide1, guide2, bc1, bc2
# Doing this with guides aligned using constant regions +/- 10 nt from protospacer sequence
# pLJK06_v4

## FIRST: Merge guides

## Read in guide name & read text files from guides called by bowtie2

# Read one headerless, tab-delimited guide-call file with two columns
# (read name, guide name) and return it as a data frame with columns
# `read` and `guide_col`.
#
# path      : path to the guide-call text file.
# guide_col : name to give the guide column (e.g. "sgRNA1").
#
# The last 5 characters of every read name are dropped; the two tables are
# later merged on `read`, so presumably this removes a per-file suffix
# (TODO confirm against the bowtie2 output).
read_guide_calls <- function(path, guide_col) {
  calls <- read.delim(path, header = FALSE, stringsAsFactors = FALSE)
  colnames(calls) <- c("read", guide_col)
  calls$read <- substr(calls$read, 1, nchar(calls$read) - 5)
  calls
}

guide1 <- read_guide_calls("./guide1_read.txt", "sgRNA1")
nrow(guide1)
head(guide1)

guide2 <- read_guide_calls("./guide2_read.txt", "sgRNA2")
nrow(guide2)
head(guide2)

## Merge guides by read name
# Inner join: only reads present in both guide files are kept.
guide_table <- merge(guide1, guide2, by = "read")
nrow(guide_table)
# View() needs an interactive session; skip it under Rscript / batch runs.
if (interactive()) View(guide_table)

# TRUE where the two guide calls for a read disagree.
guides_differ <- guide_table$sgRNA1 != guide_table$sgRNA2
nonmatching_guides <- guide_table[guides_differ, ]
nrow(nonmatching_guides) / nrow(guide_table) # 5% of guides are not matching, likely bowtie matching is wrong

# Same comparison, but only counting reads where both guides were called
# ('*' marks an uncalled guide).
table(guides_differ & guide_table$sgRNA1 != "*" & guide_table$sgRNA2 != "*")

### SECOND: Merge bc1 and bc2 with guides

# Read and organize read inserts good table
read_inserts <- read.delim("./pLJK06_v4-read-inserts-good.txt", header = FALSE)
colnames(read_inserts) <- c("read", "dir", "V3", "bc1", "V5", "bc2",
                            "V7", "sgRNA1_read", "V9", "sgRNA2_read")
# View() needs an interactive session; skip it under Rscript / batch runs.
if (interactive()) View(read_inserts)
nrow(read_inserts)

# Keep only the named columns. Base subsetting is used so this step does not
# depend on a package that the script never loads.
new_read_inserts <- read_inserts[, c("read", "dir", "bc1", "bc2",
                                     "sgRNA1_read", "sgRNA2_read")]
if (interactive()) View(new_read_inserts)

# Read in barcode neighborhoods from hash table
bc1_raw <- read.delim("./pLJK06_v4_bc1-raw-barcodes.txt", header = TRUE)
bc2_raw <- read.delim("./pLJK06_v4_bc2-raw-barcodes.txt", header = TRUE)

# Overwrite the file headers with consistent names; only the raw barcode and
# its neighborhood are used below.
colnames(bc1_raw) <- c("bc1", "bc1_neighborhood", "count", "total", "fraction")
head(bc1_raw)
colnames(bc2_raw) <- c("bc2", "bc2_neighborhood", "count", "total", "fraction")
head(bc2_raw)

# Merge the bc neighborhoods with read inserts good.
# all.x = TRUE keeps every read; a barcode that is absent from the raw-barcode
# table gets NA as its neighborhood.
reads_bc1 <- merge(x = new_read_inserts,
                   y = bc1_raw[, c("bc1", "bc1_neighborhood")],
                   by = "bc1", all.x = TRUE)

reads_bc1_bc2 <- merge(x = reads_bc1,
                       y = bc2_raw[, c("bc2", "bc2_neighborhood")],
                       by = "bc2", all.x = TRUE)

# Merge guide table with read inserts good.
# Raw lookup table, has duplicates if multiple reads on the same plasmid.
# Reads missing from guide_table get NA in sgRNA1 / sgRNA2 (all.x = TRUE).
raw_lookup_table_NIJL009 <- merge(x = reads_bc1_bc2, y = guide_table,
                                  by = "read", all.x = TRUE)
if (interactive()) View(raw_lookup_table_NIJL009)

## THIRD: Clean lookup table

# The cleaning steps below use dplyr verbs and the %>% pipe.
library(dplyr)

# Make new column that has the count of how many times a row appears in the
# data frame (how many times is the same plasmid read)
prelookup_table <- raw_lookup_table_NIJL009 %>%
  select(sgRNA1, sgRNA2, bc1_neighborhood, bc2_neighborhood)

# Count the occurrences of each distinct
# (sgRNA1, sgRNA2, bc1_neighborhood, bc2_neighborhood) row
df_counts <- prelookup_table %>%
  count(sgRNA1, sgRNA2, bc1_neighborhood, bc2_neighborhood, name = "count")

# View() needs an interactive session; skip it under Rscript / batch runs.
if (interactive()) View(df_counts)

# Join the counts with the original dataframe
lookup_table <- left_join(prelookup_table, df_counts,
                          by = names(prelookup_table))

# Figure out how many distinct bc2 there are for a single bc1,
# how many distinct bc1 there are for a single bc2.
# NOTE(review): the upstream merges use all.x = TRUE, so a neighborhood can be
# NA; NA is treated as one more group / distinct value here, as before.
lookup_table3 <- lookup_table %>%
  group_by(bc1_neighborhood) %>%
  mutate(distinct_count_bc1 = n_distinct(bc2_neighborhood)) %>%
  ungroup()
if (interactive()) View(lookup_table3)

lookup_table4 <- lookup_table3 %>%
  group_by(bc2_neighborhood) %>%
  mutate(distinct_count_bc2 = n_distinct(bc1_neighborhood)) %>%
  ungroup()
if (interactive()) View(lookup_table4)

# One row per distinct combination (reads of the same plasmid collapse).
lookup_table5 <- distinct(lookup_table4)

# CATEGORIES FOR FILTERING "SCRAMBLED BARCODES"
# Note: These are likely not true scrambles, but rather, sequencing errors
# that make it look as if bc1 maps to two different bc2, and vice versa.
# Rows where bc1 maps to multiple bc2s & count = 1 are likely sequencing
# errors. Get rid of them.
lookup_table6 <- lookup_table5 %>%
  filter(!(count == 1 & (distinct_count_bc1 > 1 | distinct_count_bc2 > 1)))

# Get rid of rows where gRNA names are not called.
# filter() also drops rows whose guide is NA (read absent from the guide
# table). Logical `[` subsetting would instead turn each of them into an
# all-NA row that survives every later step and ends up in the output CSV.
lookup_table7 <- lookup_table6 %>%
  filter(sgRNA1 != "*", sgRNA2 != "*")
if (interactive()) View(lookup_table7)

# Get rid of rows where sgRNA1 != sgRNA2
# (cleaned lookup table: elements where the two guides differ are removed)
lookup_table8 <- lookup_table7 %>%
  filter(sgRNA1 == sgRNA2)
if (interactive()) View(lookup_table8)

# What are the true scrambled barcodes now?
lookup_table9 <- lookup_table8 %>%
  group_by(bc1_neighborhood) %>%
  mutate(distinct_count_bc1_true = n_distinct(bc2_neighborhood)) %>%
  ungroup()

lookup_table10 <- lookup_table9 %>%
  group_by(bc2_neighborhood) %>%
  mutate(distinct_count_bc2_true = n_distinct(bc1_neighborhood)) %>%
  ungroup()
if (interactive()) View(lookup_table10)

# Number of distinct (bc1, bc2) neighborhood pairs per guide. Passing both
# columns to n_distinct() counts pairs directly, so two different pairs can
# never collapse into the same pasted string.
final_lookup <- lookup_table10 %>%
  group_by(sgRNA1) %>%
  mutate(bcs_perguide = n_distinct(bc1_neighborhood, bc2_neighborhood)) %>%
  ungroup()
if (interactive()) View(final_lookup)

# Keep only combinations read at least twice whose bc1 and bc2 each pair with
# exactly one partner neighborhood.
final_lookupv2 <- final_lookup %>%
  filter(!(count < 2 |
             distinct_count_bc1_true > 1 |
             distinct_count_bc2_true > 1))

write.csv(final_lookupv2, "./pLJK06_v4_NIJL009_final_lookup_v2.csv",
          row.names = FALSE)