library(fst)
library(data.table)
library(scales)

# Command-line interface: two trailing arguments are expected.
#   args[1]  path of the fst file to read
#   args[2]  path the processed table is written to at the end of the script
args <- commandArgs(trailingOnly = TRUE)

# Fail fast with a clear message. Without this check a missing argument
# becomes NA, and a missing output path would only be noticed by write_fst()
# after all the processing below has already run.
if (length(args) < 2) {
    stop("Expected two arguments: <input_path> <output_path>", call. = FALSE)
}

input_path <- args[1]
output_path <- args[2]

print("Start to load reg table")
# Read as a data.table: the rest of the script updates `df` with `:=`.
df <- read_fst(input_path, as.data.table = TRUE)

print("New variables")
# saw.doc: numeric indicator, 1 where iv.birank is strictly positive, else 0.
df[, saw.doc := as.numeric(iv.birank > 0)]
# c.age: clamp to the interval [0, 100]; values inside the interval are kept.
df[, c.age := ifelse(c.age < 0, 0, ifelse(c.age > 100, 100, c.age))]

print("Create logged and 99th percentile versions of some skewed variables")
# NOTE(review): this section only adds logged (".log") columns; no
# percentile-based columns are created here despite the message above.

#i) identify variables to transform
# Compute the third quartile and maximum of every numeric column directly,
# ignoring missing values. (Going through sapply(.SD, summary) breaks as soon
# as only some columns contain NA, because summary() then returns vectors of
# different lengths, and it also breaks on non-numeric columns.)
num.vars <- names(df)[vapply(df, is.numeric, logical(1))]
col.stat <- function(v, stat) {
    x <- df[[v]]
    x <- x[!is.na(x)]
    if (length(x) == 0) NA_real_ else as.numeric(stat(x))
}
distr <- data.frame(
    var = num.vars,
    `3rd Qu.` = vapply(num.vars, col.stat, numeric(1),
                       stat = function(x) quantile(x, 0.75, names = FALSE)),
    Max. = vapply(num.vars, col.stat, numeric(1), stat = max),
    check.names = FALSE,
    stringsAsFactors = FALSE
)
# A variable counts as skewed when its maximum exceeds five times its third
# quartile, unless the maximum is exactly 1.
probs <- (distr$`3rd Qu.` * 5 < distr$Max.) & distr$Max. != 1
# Columns whose statistics are undefined (e.g. all NA) are not selected.
probs[is.na(probs)] <- FALSE
skewed.vars <- distr$var[probs]
# Every column whose name contains "bgrm" is transformed as well.
skewed.vars <- unique(c(skewed.vars, grep("bgrm", names(df), value = TRUE)))

#ii) create logged versions of those variables
# paste0(character(0), ".log") would yield ".log", so handle the empty case
# explicitly instead of creating a bogus column name.
log.skewed.vars <- if (length(skewed.vars) > 0) {
    paste0(skewed.vars, ".log")
} else {
    character(0)
}
if (length(skewed.vars) > 0) {
    # log1p(x) is log(1 + x), computed accurately for small x as well.
    df[, (log.skewed.vars) := lapply(.SD, log1p), .SDcols = skewed.vars]
}

print("Create time_id")
# time.id starts at 1 for the smallest c.year.quarter and increases by 4 for
# every unit of c.year.quarter above that minimum.
df[, `:=`(time.id = (c.year.quarter - min(c.year.quarter)) * 4 + 1)]

print("Standardize vars")
#a) identify continuous variables to standardize: every column with more than
#   two distinct values (NA counts as a value), except the time index
n.distinct <- vapply(df, function(x) length(unique(x)), integer(1))
cont.vars <- names(df)[n.distinct > 2]
cont.vars <- setdiff(cont.vars, c("time.id"))

#b) treat pagerank variables differently
# Literal match on "iv.": as a regular expression the unescaped "." matched
# any character, so any name containing "iv" followed by one more character
# was treated as a rank variable.
rank.vars <- grep("iv.", cont.vars, fixed = TRUE, value = TRUE)

#c) standardize
# seq_along() keeps the loop from running over c(1, 0) when cont.vars is empty.
for (j in seq_along(cont.vars)) {
    this.var <- cont.vars[j]
    if (this.var %in% rank.vars) {
        # Assigning to a subset of rows keeps the existing column type, so an
        # integer column is converted first to avoid coercing the z-scores.
        if (is.integer(df[[this.var]])) {
            df[, (this.var) := lapply(.SD, as.numeric), .SDcols = this.var]
        }
        # Rank variables are standardized among rows with saw.doc == 1 only;
        # rows with saw.doc == 0 are set to 0.
        df[saw.doc == 1,
           (this.var) := lapply(.SD, function(x) as.numeric(scale(x))),
           .SDcols = this.var]
        df[saw.doc == 0, (this.var) := 0]
    } else {
        # scale() returns a one-column matrix carrying "scaled:center" and
        # "scaled:scale" attributes; as.numeric() stores a plain vector.
        df[, (this.var) := lapply(.SD, function(x) as.numeric(scale(x))),
           .SDcols = this.var]
    }
    # progress indicator
    print(j)
}

print("Order variable names")
# Final layout: the front variables first, then all remaining columns in
# sorted name order.
front.vars <- c("time.id")
sorted.names <- names(df)[order(names(df))]
df <- df[, c(front.vars, setdiff(sorted.names, front.vars)), with = FALSE]

print("Dump the file")
# Write the processed table to the path given as the second argument.
write_fst(x = df, path = output_path)