
#-------------------------------------------------
# Functions to filter on gaps (N) and mappability
#-------------------------------------------------

# Test whether a position falls inside a region, both bounds inclusive.
#
# region:   record whose 2nd element is the region start and whose 3rd
#           element is the region end (values are coerced with as.numeric,
#           so character rows produced by apply() are accepted)
# position: coordinate(s) to test, coerced with as.numeric
#
# Returns a logical: TRUE where start <= position <= end.
withinRegion <- function(region, position){
  pos <- as.numeric(position)
  lower <- as.numeric(region[2])
  upper <- as.numeric(region[3])
  pos >= lower & pos <= upper
}

# Test whether a region lies completely inside the interval [start, end].
#
# region: record whose 2nd element is the region start and whose 3rd
#         element is the region end (values are coerced with as.numeric)
# start:  lower bound of the enclosing interval (inclusive)
# end:    upper bound of the enclosing interval (inclusive)
#
# Returns a logical: TRUE when region start >= start and region end <= end.
containsRegion <- function(region, start, end){
  region_start <- as.numeric(region[2])
  region_end <- as.numeric(region[3])
  begins_inside <- region_start >= as.numeric(start)
  finishes_inside <- region_end <= as.numeric(end)
  begins_inside & finishes_inside
}

# Fraction of a chunk that is covered by a set of regions.
#
# chunk:   a single chunk (list or one-row data frame) with `start` and `end`
# regions: data frame with `start` and `end` columns, e.g. the gaps or the
#          low-mappability intervals of the chunk's chromosome; may have
#          zero rows
#
# Returns one number: the summed overlap between the chunk and every region,
# divided by the chunk length. Partial overlaps at either end of the chunk
# and regions lying completely inside the chunk are all counted, so a chunk
# that starts in one region and ends in another is no longer reported as
# fully covered. The result lies in [0, 1] as long as the regions do not
# overlap each other.
#
# Stops if the chunk start or the chunk end lies in more than one region
# (bounds inclusive), since that means the regions overlap.
getFraction <- function(chunk, regions){
  chunk_start <- as.numeric(chunk$start)
  chunk_end <- as.numeric(chunk$end)
  region_start <- as.numeric(regions[["start"]])
  region_end <- as.numeric(regions[["end"]])

  # vectorised over all regions; avoids apply() turning the rows into character
  start_lies_in_region <- chunk_start >= region_start & chunk_start <= region_end
  end_lies_in_region <- chunk_end >= region_start & chunk_end <= region_end

  if (sum(start_lies_in_region) > 1){ stop("Start matches multiple regions!") }
  if (sum(end_lies_in_region) > 1){ stop("End matches multiple regions!") }

  chunk_length <- chunk_end - chunk_start
  if (chunk_length == 0){
    # a zero-length chunk is either inside a region (1) or not (0);
    # handled separately to avoid 0 / 0
    return(as.numeric(any(start_lies_in_region)))
  }

  # overlap of the chunk with each region; regions that do not touch the
  # chunk yield a negative difference, which is clamped to 0
  overlap <- pmax(0, pmin(region_end, chunk_end) - pmax(region_start, chunk_start))
  sum(overlap) / chunk_length
}

# Keep only the chunks whose gap (N) fraction and low-mappability fraction
# are both strictly below the given thresholds.
#
# chunks:                data frame of chunks with columns `chr` (1..22, X, Y
#                        without "chr" prefix), `start` and `end`
# gaps:                  data frame of gap regions with columns `chr`
#                        ("chr"-prefixed), `start`, `end`
# lowMap:                data frame of low-mappability regions, same layout
#                        as `gaps`
# threshold_frac_N:      chunks with a gap fraction >= this value are dropped
# threshold_frac_lowMap: chunks with a low-mappability fraction >= this value
#                        are dropped
#
# Returns a list with
#   chunks:   the rows of `chunks` that pass both thresholds (a zero-row data
#             frame if none pass)
#   frac_N:   gap fraction of every processed chunk
#   frac_map: low-mappability fraction of every processed chunk
# frac_N and frac_map are ordered chromosome by chromosome (1..22, X, Y) and,
# within a chromosome, in the row order of `chunks`. Chunks on any other
# chromosome are ignored.
filter <- function(chunks, gaps, lowMap, threshold_frac_N, threshold_frac_lowMap){
  chromosomes <- c(1:22, "X", "Y")

  # one slot per chromosome, filled in the loop and flattened at the end,
  # instead of growing vectors / rbind-ing rows one at a time
  frac_N_by_chr <- vector("list", length(chromosomes))
  frac_map_by_chr <- vector("list", length(chromosomes))
  kept_rows_by_chr <- vector("list", length(chromosomes))

  for (i in seq_along(chromosomes)){
    chunk_rows <- which(chunks$chr == chromosomes[i])
    region_chr <- paste0("chr", chromosomes[i])
    gaps_this_chr <- gaps[gaps$chr == region_chr,]
    lowMap_this_chr <- lowMap[lowMap$chr == region_chr,]

    frac_N_this_chr <- numeric(length(chunk_rows))
    frac_map_this_chr <- numeric(length(chunk_rows))
    keep <- logical(length(chunk_rows))

    # seq_along() makes this a no-op for chromosomes without chunks
    # (1:nrow() would iterate over c(1, 0))
    for (j in seq_along(chunk_rows)){
      this_chunk <- chunks[chunk_rows[j],]
      f <- getFraction(this_chunk, gaps_this_chr)
      m <- getFraction(this_chunk, lowMap_this_chr)
      frac_N_this_chr[j] <- f
      frac_map_this_chr[j] <- m
      if (f < threshold_frac_N && m < threshold_frac_lowMap){
        keep[j] <- TRUE
      }
    }

    frac_N_by_chr[[i]] <- frac_N_this_chr
    frac_map_by_chr[[i]] <- frac_map_this_chr
    kept_rows_by_chr[[i]] <- chunk_rows[keep]
  }

  list(chunks = chunks[unlist(kept_rows_by_chr),],
       frac_N = unlist(frac_N_by_chr),
       frac_map = unlist(frac_map_by_chr))
}

#--------------------------------------------
# Run!
#--------------------------------------------

# Read chunks (previously generated)
# columns: chromosome, chunk index, start, end
chunks <- read.table("~/ownCloud - Madleina Caduff (unifr.ch)@drive.switch.ch/sexEstimation/lowQualityReference/chunks.txt")
names(chunks) <- c("chr", "chunk", "start", "end")

# Read gap file (contains regions with N's)
# downloaded from https://hgdownload.soe.ucsc.edu/goldenPath/hg38/database/gap.txt.gz
gaps <- read.table("~/ownCloud - Madleina Caduff (unifr.ch)@drive.switch.ch/sexEstimation/lowQualityReference/gap.txt")
# keep columns 2-4 (chromosome, start, end); selecting them positively avoids
# the descending 5:ncol(gaps) sequence if the file has fewer than 5 columns
gaps <- gaps[, 2:4]
names(gaps) <- c("chr", "start", "end")

# Read mappability files
# downloaded from https://github.com/Boyle-Lab/Blacklist/tree/master/lists/hg38-blacklist.v2.bed.gz
lowMap <- read.csv("~/ownCloud - Madleina Caduff (unifr.ch)@drive.switch.ch/sexEstimation/lowQualityReference/hg38-blacklist.v2.bed", sep = "\t", header = FALSE)
# keep only chromosome, start, end; any further BED columns are dropped
lowMap <- lowMap[, 1:3]
names(lowMap) <- c("chr", "start", "end")

# Run!
# a chunk is kept only if both fractions are strictly below these thresholds
threshold_frac_N <- 0.9
threshold_frac_lowMap <- 0.9
res <- filter(chunks, gaps, lowMap, threshold_frac_N, threshold_frac_lowMap)

#write.table(res$chunks, file = "~/ownCloud - Madleina Caduff (unifr.ch)@drive.switch.ch/sexEstimation/lowQualityReference/chunks_filteredNMap.txt", append = F, quote = F, row.names = F, col.names = F)



#--------------------------------------------
# Check against the zero-counts contigs
#--------------------------------------------
# Chunk identifiers "<chr>_<chunk>", in the row order of `chunks`
names <- paste0(chunks$chr, "_", chunks$chunk)

# res$frac_N / res$frac_map are ordered chromosome by chromosome (1..22, X, Y);
# indexing `names` with them is only meaningful if `chunks` is sorted the same
# way and contains no other chromosomes. At least make sure the lengths agree
# so nothing is silently recycled.
stopifnot(length(res$frac_N) == nrow(chunks),
          length(res$frac_map) == nrow(chunks))

# Excluded = exact complement of the keep rule used in filter()
# (both fractions strictly below their threshold), so chunks sitting exactly
# on a threshold are reported as excluded as well.
excluded <- names[!(res$frac_N < threshold_frac_N & res$frac_map < threshold_frac_lowMap)]

# Read every field of the file as a plain character vector, independent of
# how the identifiers are laid out in rows/columns
zero_counts <- unlist(read.table("~/ownCloud - Madleina Caduff (unifr.ch)@drive.switch.ch/sexEstimation/lowQualityReference/zero_counts_chunks.txt", colClasses = "character"), use.names = FALSE)
length(zero_counts) # 517
length(excluded) # 636
zero_counts[!(zero_counts %in% excluded)] # get rid of all except X_548


