# Input files, all resolved relative to `basedir`.
basedir <- "."
goldfile <- file.path(basedir, "PoPS_UKBB_noncoding_validation_1348CSs.txt.gz")
traitfile <- file.path(basedir, "UKBB_94traits_release1.traits.efo_tagged")
allmethodsfile <- file.path(basedir, "UKB_AllMethods_GenePrioritization.txt.gz")

# Output paths and the distance threshold used to collect candidate genes.
#
# Non-interactive usage (positional arguments):
#   <outfile_step1> <outfile_step2_for_llm> <outfile_step2_labels> [window_size]
# `window_size` falls back to 500e3 when the fourth argument is omitted.
if (interactive()) {
  # An empty file name makes data.table::fwrite() print to the console, so
  # every output is routed there when the script is run interactively.
  outfile <- ""
  outfile_step1 <- outfile
  outfile_step2_for_llm <- outfile
  outfile_step2_labels <- outfile
  window_size <- 500e3
} else {
  args <- commandArgs(TRUE)
  if (length(args) < 3L) {
    stop(
      "Expected at least 3 arguments: <outfile_step1> ",
      "<outfile_step2_for_llm> <outfile_step2_labels> [window_size]",
      call. = FALSE
    )
  }
  outfile_step1 <- args[[1]]
  outfile_step2_for_llm <- args[[2]]
  outfile_step2_labels <- args[[3]]
  if (length(args) >= 4L) {
    # The conversion warning is suppressed because a failed parse is turned
    # into an explicit error right below.
    window_size <- suppressWarnings(as.numeric(args[[4]]))
    if (is.na(window_size)) {
      stop("window_size must be numeric, got: ", args[[4]], call. = FALSE)
    }
  } else {
    window_size <- 500e3
  }
}

# Format a collection of gene identifiers as a single string in which every
# identifier is wrapped in braces and entries are comma separated,
# e.g. c("A", "B") -> "{A},{B}". An empty input yields "{}".
gene_string_from_list <- function(gene_list) {
  paste0("{", paste(gene_list, collapse = "},{"), "}")
}

library(data.table)

# Load the gold-standard table and the all-methods gene prioritisation table.
gold_df <- fread(goldfile)
allres_df <- fread(allmethodsfile)

# Keep one row per distinct combination of the columns needed below and align
# the variant column name with the one used in `gold_df`.
select_cols <- c("trait", "region", "cs_id", "lead_variant", "distance_genebody", "ensgid", "gene") # nolint: line_length_linter.
allres_df_sub <- unique(allres_df[, ..select_cols])
setnames(allres_df_sub, "lead_variant", "variant")

# For every (trait, region, cs_id) group, collect the sorted unique gene
# symbols and Ensembl ids whose `distance_genebody` is within `window_size`
# and format them with gene_string_from_list(). The strings are built directly
# inside the grouped call so both columns are plain character vectors rather
# than list columns of length-one elements. Groups with no gene inside the
# window are kept and receive "{}".
gene_str_df <- allres_df_sub[, {
  # Evaluate the window filter once per group; rows with an NA distance give
  # NA entries, which sort() drops.
  in_window <- distance_genebody <= window_size
  list(
    symbol_gene_string = gene_string_from_list(sort(unique(gene[in_window]))),
    ensembl_gene_string = gene_string_from_list(sort(unique(ensgid[in_window]))) # nolint: line_length_linter.
  )
}, by = .(trait, region, cs_id)]


# Summarise every (locus_id, trait, region, cs_id) group of the gold table:
# the variant with the highest `pip` together with that `pip`, plus the unique
# Ensembl ids / symbols of the rows selected by `coding`.
# NOTE(review): `coding` is presumably a logical flag column - confirm.
maxpip <- gold_df[, {
  # Locate the top-pip row once and reuse the index for both columns.
  top <- which.max(pip)
  list(
    variant = variant[top],
    pip = pip[top],
    ensgid = unique(ensgid[coding]),
    gene = unique(gene[coding])
  )
}, by = .(locus_id, trait, region, cs_id)]

# Split `variant` on ":" a single time; the first field becomes `chromosome`
# and the second `position`. vapply() guarantees plain character / numeric
# columns instead of list columns (a missing field yields NA).
variant_fields <- strsplit(maxpip[["variant"]], ":", fixed = TRUE)
maxpip[, `:=`(
  position = as.numeric(vapply(variant_fields, `[`, character(1), 2L)),
  chromosome = vapply(variant_fields, `[`, character(1), 1L)
)]


# Inner-join the per-credible-set summary with the candidate gene strings on
# (trait, region, cs_id); groups missing from either side are dropped.
setkeyv(maxpip, c("trait", "region", "cs_id"))
setkeyv(gene_str_df, c("trait", "region", "cs_id"))
candidates <- maxpip[gene_str_df, nomatch = 0]
setkeyv(candidates, "trait")

# Trait annotations: only the trait id, its description and its EFO tag.
select_cols <- c("trait", "description", "efo")
trait_info <- fread(traitfile, header = TRUE, sep = "\t")[, ..select_cols]
setkeyv(trait_info, "trait")

# Inner-join on trait, then order the result by locus.
final_candidates <- candidates[trait_info, nomatch = 0]
setkeyv(final_candidates, "locus_id")

# Constant genome build label and a 1-based row id taken after ordering.
final_candidates[, `:=`(genome_reference = "hg19", row_number = .I)]

# Output naming: `gene` (symbol) -> `symbol`, `ensgid` -> `gene`,
# `trait` -> `phenotype`.
setnames(
  final_candidates,
  c("gene", "ensgid", "trait"),
  c("symbol", "gene", "phenotype")
)

# Step 1 output: the full candidate table without the two gene-string columns.
# Work on a copy so `final_candidates` keeps them for the step 2 outputs.
step1 <- copy(final_candidates)
set(step1, j = c("symbol_gene_string", "ensembl_gene_string"), value = NULL)
fwrite(step1, file = outfile_step1, sep = "\t", quote = TRUE)

# Step 2 outputs: one table with the row id, trait description and candidate
# gene strings, and a second table holding the label columns.
select_cols <- c("row_number", "description", "symbol_gene_string", "ensembl_gene_string") # nolint: line_length_linter.
label_cols <- c("symbol", "gene")

final_llm_df <- final_candidates[, select_cols, with = FALSE]
final_labels <- final_candidates[, label_cols, with = FALSE]

fwrite(final_llm_df, file = outfile_step2_for_llm, sep = "\t", quote = TRUE, row.names = FALSE) # nolint: line_length_linter.
fwrite(final_labels, file = outfile_step2_labels, sep = "\t", quote = TRUE, row.names = FALSE) # nolint: line_length_linter.
