library(ggplot2)
library(dplyr)
library(mgcv)
library(scam)
library(caret)
library(readr)

# Import the scored (labeled) training data used to fit the calibration model.
#
# The file is expected to carry an unnamed leading column (presumably a row
# index written by the exporter -- confirm against the CSV). readr names such
# a column `X1` before version 2.0 and `...1` from 2.0 onward, so it is
# dropped under either name (case-sensitively). Hard-coding `X1` makes the
# whole pipeline fail on current readr because that column does not exist.
scored_dat <- read_csv("binned_labeled_samples_20191121.csv") %>%
  arrange(cargo) %>%
  select(-matches("^(X1|\\.\\.\\.1)$", ignore.case = FALSE)) %>%
  # with the index column gone, rows that are otherwise identical collapse
  distinct() %>%
  mutate(
    cargo = as.factor(cargo),
    # text version of the 0/1 score; used as the outcome for calibration()
    puff = ifelse(score == 1, "puff", "nonpuff")
  )


# Calibration model: a shape-constrained additive model (scam) for the actual
# probability that an event is a puff (score), given the cargo and the random
# forest output probability (adj_proba).
# scam behaves like gam but adds a monotonicity constraint on the smooth;
# here a separate smooth of adj_proba is fitted for each cargo (by = cargo)
# on the "mpi" basis.
cal_gam <- scam(
  score ~ cargo + s(adj_proba, by = cargo, bs = "mpi"),
  family = binomial(),
  data = scored_dat
)

# Data to apply the calibration model to: only the two predictors it uses.
# (Here this is the same data the model was fitted on.)
eval_data <- with(
  scored_dat,
  data.frame(cargo = cargo, adj_proba = adj_proba)
)

# Calibration-adjusted probability for each event, on the response scale,
# in the same row order as scored_dat.
cal_prob <- predict(cal_gam, newdata = eval_data, type = "response")

# Number of adjusted probabilities falling in each bin, per cargo.
# Bins are left-closed, [lo, hi), except the last one, which also includes 1.
# The first bin has no lower bound: it counts everything below 0.1.
count_in_bin <- function(p, lo, hi) {
  sum(p >= lo & p < hi)
}

scored_dat %>%
  mutate(cal_prob = cal_prob) %>%
  group_by(cargo) %>%
  summarize(
    bin1 = sum(cal_prob < 0.1),
    bin2 = count_in_bin(cal_prob, 0.1, 0.2),
    bin3 = count_in_bin(cal_prob, 0.2, 0.3),
    bin4 = count_in_bin(cal_prob, 0.3, 0.4),
    bin5 = count_in_bin(cal_prob, 0.4, 0.5),
    bin6 = count_in_bin(cal_prob, 0.5, 0.6),
    bin7 = count_in_bin(cal_prob, 0.6, 0.7),
    bin8 = count_in_bin(cal_prob, 0.7, 0.8),
    bin9 = count_in_bin(cal_prob, 0.8, 0.9),
    bin10 = sum(cal_prob >= 0.9 & cal_prob <= 1)
  )

# Calibration-curve data (with pointwise confidence bands) for the adjusted
# probabilities of each cargo. Not needed for the sampling itself.

# Returns the `data` element of caret's calibration() result for one cargo:
# attaches cal_prob to scored_dat, keeps only that cargo's rows, and compares
# the "puff" label against cal_prob using cuts = 10.
calibration_points <- function(cargo_name) {
  cargo_rows <- scored_dat %>%
    mutate(cal_prob = cal_prob) %>%
    filter(cargo == !!cargo_name)

  cal_fit <- calibration(
    puff ~ cal_prob,
    class = "puff",
    cuts = 10,
    data = cargo_rows
  )
  cal_fit$data
}

cal_dat_b2 <- calibration_points("B2")

cal_dat_mor <- calibration_points("MOR")

cal_dat_tfr <- calibration_points("TfR")

# Stack the three per-cargo calibration tables (each tagged with its cargo)
# and draw one calibration curve per facet: points and line show Percent per
# bin, the orange lines are the lower/upper confidence limits, and the blue
# line is the y = x reference.
rbind(
  mutate(cal_dat_mor, cargo = "MOR"),
  mutate(cal_dat_b2, cargo = "B2"),
  mutate(cal_dat_tfr, cargo = "TfR")
) %>%
  # bins with missing confidence limits get the widest possible band
  mutate(
    Upper = replace(Upper, is.na(Upper), 100),
    Lower = replace(Lower, is.na(Lower), 0)
  ) %>%
  ggplot(aes(x = midpoint)) +
  geom_point(aes(y = Percent)) +
  geom_line(aes(y = Percent)) +
  geom_line(aes(y = Lower), color = "orange", alpha = 0.8) +
  geom_line(aes(y = Upper), color = "orange", alpha = 0.8) +
  geom_abline(
    intercept = 0,
    slope = 1,
    color = "blue",
    alpha = 0.4
  ) +
  # Rug of the individual adjusted probabilities, scaled by 100 to sit on the
  # same axis as midpoint. NOTE: scored_dat is not given a cal_prob column in
  # the code shown here, so cal_prob is taken from the global vector of the
  # same name; this relies on it being in the same row order as scored_dat.
  geom_rug(
    data = scored_dat,
    aes(x = 100 * cal_prob),
    sides = "b",
    alpha = 0.2
  ) +
  facet_wrap(~cargo, nrow = 2)


# Sampling plan:
# 1) Make eval_data from full training data, rather than scored_dat
# 2) Use cal_gam (trained on scored_dat) to predict on eval_data
# 3) cal_prob is then the calibration-adjusted probabilities for every event
#    in the training data
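# 4) divide cal_prob values into bins [0, 0.1], (0.1, 0.2], etc. within each cargo
#    (NOTE: the bin-count table above uses left-closed bins instead, i.e.
#    [0, 0.1), [0.1, 0.2), ..., [0.9, 1]; settle on one convention so the
#    existing counts and the new samples are binned the same way)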
# 5) sample without replacement in each bin, for each cargo. Aiming for the 
#    same numbers as before in each bin. We already have some observations in
#    most of the bins (see table above), so we just need to sample additional 
#    observations until we've got the numbers we want (or run out of things to sample)


