# packages
library(metaDigitise)
library(magrittr)
library(tibble)
library(dplyr)
library(lubridate)
library(dpeatdecomposition)
library(dm)
library(RMariaDB)

1 Preparations

Connect to database

# connect to database
con <-
  RMariaDB::dbConnect(
    drv = RMariaDB::MariaDB(),
    dbname = "dpeatdecomposition",
    default.file = "~/my.cnf"
  )

# get database as dm object
dm_dpeatdecomposition <-
  dpeatdecomposition::dp_get_dm(con, learn_keys = TRUE)

Get most current IDs

id_last <- 
  list(
    id_dataset = 
      dm_dpeatdecomposition %>%
      dm::pull_tbl(datasets) %>%
      dplyr::pull(id_dataset) %>%
      tail(1),
    id_sample = 
      dm_dpeatdecomposition %>%
      dm::pull_tbl(samples) %>%
      dplyr::pull(id_sample) %>%
      tail(1),
    id_measurement = 
      dm_dpeatdecomposition %>%
      dm::pull_tbl(data) %>%
      dplyr::pull(id_measurement) %>%
      tail(1)
    ) %>%
  purrr::map(function(.x) {
    if(length(.x) == 0) {
      0L
    } else {
      .x
    }
  })

Create directories

dir_name <- "d20"
dir_source <- "../raw_data/data/d20"
dir_target <- paste0("../derived_data/", id_last$id_dataset + 1L)

if(!dir.exists(dir_target)) {
  dir.create(dir_target)
}

2 Data wrangling

2.1 dataset

datasets <- 
  tibble::tibble(
    id_dataset = id_last$id_dataset + 1L
  )

2.2 citations_to_datasets

citations_to_datasets <- 
  dplyr::bind_rows(
    db_template_tables$citations_to_datasets,
    tibble::tibble(
      id_dataset = datasets$id_dataset,
      id_citation = c("Prevost.1997") 
    )
  )

2.3 samples

# mass remaining
samples2 <- 
  dplyr::bind_rows(
    readRDS(paste0(dir_source, "/raw/caldat/Prevost.1997-Fig7a"))$processed_data,
    readRDS(paste0(dir_source, "/raw/caldat/Prevost.1997-Fig7b"))$processed_data,
    readRDS(paste0(dir_source, "/raw/caldat/Prevost.1997-Fig7c"))$processed_data,
    readRDS(paste0(dir_source, "/raw/caldat/Prevost.1997-Fig7d"))$processed_data,
    readRDS(paste0(dir_source, "/raw/caldat/Prevost.1997-Fig7e"))$processed_data,
    readRDS(paste0(dir_source, "/raw/caldat/Prevost.1997-Fig7f"))$processed_data
  ) %>%
  dplyr::mutate(
    id =
      id %>%
      stringr::str_replace(pattern = "treatmentontrol", replacement = "treatmentcontrol"), #---note: correct id for some samples
    incubation_duration = 
      purrr::map_int(x, function(.x) {
        which.min(abs(.x - c(0:3))) - 1L
      }) %>%
      as.numeric(),
    mass_relative_mass = (100 - y)/100,
    mass_absolute = NA_real_,
    sample_depth_upper = 
      dplyr::case_when(
        stringr::str_detect(id, "depth10") ~ 10,
        stringr::str_detect(id, "depth30") ~ 30
      ),
    sample_depth_lower = sample_depth_upper,
    taxon_rank_value =
      dplyr::case_when(
        stringr::str_detect(id, "Sphagnum") ~ "Sphagnum",
        stringr::str_detect(id, "Betula") ~ "Betula papyrifera",
        stringr::str_detect(id, "cellulose") ~ NA_character_,
      ),
    taxon_rank_name =
      dplyr::case_when(
        stringr::str_detect(id, "Sphagnum") ~ "genus",
        stringr::str_detect(id, "Betula") ~ "species",
        stringr::str_detect(id, "cellulose") ~ NA_character_,
      ),
    sample_type = 
      dplyr::case_when(
        stringr::str_detect(id, "Sphagnum") ~ "peat",
        stringr::str_detect(id, "Betula") ~ "litter",
        stringr::str_detect(id, "cellulose") ~ "cellulose",
      ),
    sample_type2 = 
      dplyr::case_when(
        stringr::str_detect(id, "Sphagnum") ~ NA_character_,
        stringr::str_detect(id, "Betula") ~ "Betula papyrifera (non-peatland species) wood sticks of 0.6 cm diameter",
        stringr::str_detect(id, "cellulose") ~ "Whatman no. 1 filter papers, 9-cm in diameter",
      ),
    taxon_organ = 
      dplyr::case_when(
        stringr::str_detect(id, "Sphagnum") ~ "whole plant",
        stringr::str_detect(id, "Betula") ~ "branches",
        stringr::str_detect(id, "cellulose") ~ NA_character_,
      ),
    distance_to_ditch =
      id %>%
      stringr::str_extract(pattern = "distance\\d+$") %>%
      stringr::str_extract(pattern = "\\d+$") %>%
      as.numeric(),
    sample_treatment = 
      dplyr::case_when(
        stringr::str_detect(id, pattern = "control") ~ "control",
        TRUE ~ "drainage"
      ),
    sampling_date =
      as.Date("1991-10-15") + lubridate::years(incubation_duration),
    incubation_duration = 
      incubation_duration %>%
      lubridate::dyears() %>%
      lubridate::time_length(unit = "days"),
    sampling_year = lubridate::year(sampling_date),
    sampling_month = lubridate::month(sampling_date),
    sampling_longitude =
      "69°15'W" %>%
      sp::char2dms(chd = "°", chm = "'", chs = "''") %>%
      as.numeric(),
    sampling_latitude =
      "47°49'N" %>%
      sp::char2dms(chd = "°", chm = "'", chs = "''") %>%
      as.numeric(),
    is_incubated = TRUE,
    incubation_environment = "peat",
    id_dataset = id_last$id_dataset + 1,
    mesh_size_absolute = 1,
    experimental_design = 
      paste0(
        as.numeric(as.factor(sample_treatment)), "_",
        as.numeric(as.factor(ifelse(is.na(distance_to_ditch), "a", distance_to_ditch)))
      ),
    comment_samples = "Coordinates are only the approximate location of the study site, but not sampling points.",
    comment_samples =
      paste0(
        comment_samples,
        dplyr::case_when(
          stringr::str_detect(id, "control") ~ " treatment = 'control' means that these litter bags were buried in a location next to the drained areas of the peatland.",
          TRUE ~ ""
        )
      )
  ) %>%
  dplyr::select(-x, -y, -id, -group, -col, -pch, -y_variable, -x_variable)


# initial masses
samples1 <- 
  samples2 %>%
  dplyr::filter(!duplicated(paste0(experimental_design, "_", sample_type, "_", sample_depth_upper))) %>%
  dplyr::mutate(
    mass_relative_mass = 1.0,
    mass_remaining_error = 0.0,
    incubation_duration = 0.0,
    sampling_year = 1991,
    sampling_month = 10,
    sampling_date = as.Date("1991-10-15"),
    id_sample = seq_len(nrow(.)) + id_last$id_sample,
    id_sample_parent = id_sample,
    id_sample_origin = id_sample,
    id_sample_incubation_start = id_sample
  )

samples2 <- 
  samples2 %>%
  dplyr::mutate(
    id_sample = seq_len(nrow(.)) + max(samples1$id_sample),
    id_sample_origin = 
      dplyr::left_join(
        samples2 %>% dplyr::select(experimental_design, sample_type, sample_depth_upper),
        samples1 %>% dplyr::select(experimental_design, sample_type, sample_depth_upper, id_sample),
        by = c("experimental_design", "sample_type", "sample_depth_upper")
      ) %>%
      dplyr::pull(id_sample),
    id_sample_incubation_start = id_sample_origin,
    id_sample_parent = 
      purrr::map_int(seq_len(nrow(.)), function(i) {
        index <- paste0(experimental_design, "_", sample_type, "_", sample_depth_upper) == paste0(experimental_design, "_", sample_type, "_", sample_depth_upper)[[i]] & incubation_duration < incubation_duration[[i]]
        if(! any(index)) {
          id_sample_origin[[i]]
        } else {
          target_incubation_duration <- max(incubation_duration[index])
          index <- index & incubation_duration == target_incubation_duration
          id_sample[index]
        }
      })
  )


# water table depth and peat subsidence
samples3 <- 
  dplyr::bind_rows(
    readRDS(paste0(dir_source, "/raw/caldat/Prevost.1997-Fig2k"))$processed_data,
    readRDS(paste0(dir_source, "/raw/caldat/Prevost.1997-Fig2l"))$processed_data,
    readRDS(paste0(dir_source, "/raw/caldat/Prevost.1997-Fig2m"))$processed_data,
    readRDS(paste0(dir_source, "/raw/caldat/Prevost.1997-Fig2n"))$processed_data,
    readRDS(paste0(dir_source, "/raw/caldat/Prevost.1997-Fig2o"))$processed_data
  ) %>%
  dplyr::mutate(
    distance_to_ditch = c(rep(c(0, 10, 20), 4), rep(rep(c(0, 1.5, 5, 10, 15, 20), 4), 4)),
    sample_treatment = 
      dplyr::case_when(
        stringr::str_detect(id, pattern = "control") ~ "control",
        TRUE ~ "drainage"
      ),
    sampling_year = 
      id %>%
      stringr::str_extract(pattern = "year\\d+$") %>%
      stringr::str_remove(pattern = "^year") %>%
      as.numeric(),
    variable =
      dplyr::case_when(
        stringr::str_detect(id, "peat") ~ "peat_surface_depth", #---note: I think, based on Fig. 2, that this is the depth of the peat surface relative to the respective peat surface at the left point of each drainage transect (before drainage). This means that absolute peat heights are meaningless and therefore I record in the final version only peat height changes and water table depths
        stringr::str_detect(id, "min") ~ "minimum_water_table_depth",
        stringr::str_detect(id, "max") ~ "maximum_water_table_depth",
        stringr::str_detect(id, "mean") ~ "mean_water_table_depth"
      )
  ) %>%
  dplyr::arrange(sampling_year, distance_to_ditch) %>%
  dplyr::mutate(
    index = paste0(sampling_year, "_", distance_to_ditch),
    water_table_depth = 
      purrr::map(unique(index), function(.x) {
        c(NA_real_, y[index == .x][-1] - y[index == .x][[1]])
      }) %>% 
      unlist(),
    index = paste0(variable, "_", distance_to_ditch)
  ) %>%
  dplyr::arrange(index) %>%
  dplyr::mutate(
    peat_subsidence_relative_to_1990 =
      purrr::map(unique(index), function(.x) {
        if(stringr::str_detect(.x, pattern = "peat_surface")) {
          c(0, y[index == .x][-1] - y[index == .x][[1]])
        } else {
          rep(NA_real_, sum(index == .x))
        }
      }) %>% 
      unlist(),
    sampling_longitude =
      "69°15'W" %>%
      sp::char2dms(chd = "°", chm = "'", chs = "''") %>%
      as.numeric(),
    sampling_latitude =
      "47°49'N" %>%
      sp::char2dms(chd = "°", chm = "'", chs = "''") %>%
      as.numeric(),
    sample_type = "peat",
    sample_depth_upper = 0,
    sample_depth_lower = 0,
    id_dataset = datasets$id_dataset[[1]],
    id_sample = seq_len(nrow(.)) + max(samples2$id_sample),
    id_sample_origin = id_sample,
    id_sample_parent = id_sample,
    is_incubated = FALSE,
    incubation_duration = 0.0,
    comment_samples = "Coordinates are only the approximate location of the study site, but not sampling points.",
    comment_samples =
      paste0(
        comment_samples,
        dplyr::case_when(
          stringr::str_detect(id, "peat") ~ " Peat subsidence values are relative to the height of the peat at the same location in 1990.",
          !stringr::str_detect(id, "peat") ~ " Water table depths are summary statistics (minimum, mean, or maximum) of multiple measurements in the sampling year."
        )
      ),
    comment_samples =
      paste0(
        comment_samples,
        dplyr::case_when(
          stringr::str_detect(id, "control") & !stringr::str_detect(id, "peat") ~ " treatment = 'control' means that this are the same locations in the peatland, but before drainage.",
          TRUE ~ ""
        )
      )
  ) %>%
  dplyr::select(-x, -y, -id, -group, -col, -pch, -y_variable, -x_variable, -index) %>%
  dplyr::filter((sample_treatment == "drainage" & distance_to_ditch %in% na.omit(samples2$distance_to_ditch)) | (sample_treatment == "control" & distance_to_ditch == 0)) %>%
  dplyr::mutate(
    distance_to_ditch = 
      dplyr::case_when(
        sample_treatment == "drainage" ~ distance_to_ditch,
        TRUE ~ NA_real_
      )
  ) %>%
  dplyr::left_join(
    samples2 %>%
      dplyr::filter(!duplicated(experimental_design)) %>%
      dplyr::select(sample_treatment, distance_to_ditch, experimental_design),
    by = c("sample_treatment", "distance_to_ditch")
  ) %>%
  dplyr::filter(variable == "mean_water_table_depth")


## combine
samples <- 
  dplyr::bind_rows(
    db_template_tables$samples,
    samples1 %>%
      dplyr::mutate(
        type = "samples1"
      ),
    samples2 %>%
      dplyr::mutate(
        type = "samples2"
      ),
    samples3 %>%
      dplyr::mutate(
        type = "samples3"
      )
  )

2.4 samples_to_samples

samples_to_samples <- 
  samples %>%
  dplyr::filter(! id_sample %in% id_sample_origin) %>%
  dplyr::mutate(
    transition_description =
      dplyr::case_when(
        type %in% c("samples2") ~ "wait",
        TRUE ~ NA_character_
      )
  ) %>%
  dplyr::select(id_sample_parent, id_sample, transition_description) %>%
  dplyr::rename(
    id_sample_child = "id_sample"
  )

2.5 data

d1 <- 
  dplyr::bind_rows(
    samples1, samples2  
  ) %>%
  tidyr::pivot_longer(
    cols = dplyr::all_of(c("mass_absolute", "mass_relative_mass", "mesh_size_absolute")),
    names_to = "attribute_name",
    values_to = "value"
  ) %>%
  dplyr::mutate(
    id_measurement = seq_len(nrow(.)) + id_last$id_measurement,
    id_measurement_numerator =
      purrr::map_int(seq_len(nrow(.)), function(i) {
        switch(
          attribute_name[[i]],
          "mass_relative_mass" = {
            id_measurement[id_sample == id_sample[[i]] & attribute_name == "mass_absolute"]
          },
          NA_integer_
        )
      }),
    id_measurement_denominator =
      purrr::map_int(seq_len(nrow(.)), function(i) {
        switch(
          attribute_name[[i]],
          "mass_relative_mass" = {
            id_measurement[id_sample == id_sample_parent[[i]] & attribute_name == "mass_absolute"]
          },
          NA_integer_
        )
      }),
    value_type = 
      dplyr::case_when(
        attribute_name == "mesh_size_absolute" ~ "point", 
        TRUE ~ "mean"
      ),
    error_type = 
      dplyr::case_when(
        attribute_name %in% c("mesh_size_absolute", "mass_absolute") ~ NA_character_, 
        TRUE ~ "se"
      ),
    sample_size = 
       dplyr::case_when(
        attribute_name %in% c("mesh_size_absolute") ~ NA_integer_, 
        TRUE ~ 5L
      )
  )


d3 <- 
  samples3 %>%
  tidyr::pivot_longer(
    cols = dplyr::all_of(c("water_table_depth")),
    names_to = "attribute_name",
    values_to = "value"
  ) %>%
  dplyr::mutate(
    id_measurement = seq_len(nrow(.)) + max(d1$id_measurement),
    value_type = "mean",
    sample_size = NA_integer_
  )

# combine
d <- 
  dplyr::bind_rows(
    db_template_tables$data,
    d1,
    d3
  ) %>%
  dplyr::select(dplyr::all_of(colnames(db_template_tables$data)))

2.6 experimental_design_format

experimental_design_format <- 
  tibble::tibble(
    id_dataset = datasets$id_dataset,
    file = paste0(id_last$id_dataset + 1L, "/experimental_design_format.csv"),
    experimental_design_description = "`sample_treatment`: A label for the drainage conditions. `distance_to_drainage_ditch`: The distance [m] to the next drainage ditch."
  )

# csv file to export
experimental_design_format2 <-
  samples %>%
  dplyr::filter(! is.na(experimental_design)) %>%
  dplyr::filter(! duplicated(experimental_design)) %>%
  dplyr::select(experimental_design, sample_treatment, distance_to_ditch)

# export
write.csv(experimental_design_format2, paste0(dir_target, "/experimental_design_format.csv"), row.names = FALSE)

3 Export to database

# list all tables
dm_insert_in <-
  list(
    datasets = 
      datasets %>% 
      dplyr::select(dplyr::all_of(colnames(dm_dpeatdecomposition$datasets))),
    samples = 
      samples %>% 
      dplyr::select(dplyr::all_of(colnames(dm_dpeatdecomposition$samples))),
    data = 
      d %>% 
      dplyr::select(dplyr::all_of(colnames(dm_dpeatdecomposition$data))),
    samples_to_samples = 
      samples_to_samples %>% 
      dplyr::select(dplyr::all_of(colnames(dm_dpeatdecomposition$samples_to_samples))),
    citations_to_datasets = 
      citations_to_datasets %>% 
      dplyr::select(dplyr::all_of(colnames(dm_dpeatdecomposition$citations_to_datasets))),
    experimental_design_format = 
      experimental_design_format %>% 
      dplyr::select(dplyr::all_of(colnames(dm_dpeatdecomposition$experimental_design_format)))
  )

# check whether all column names as present in table attributes
all_column_names <- 
  purrr::map(dm_insert_in, colnames) %>%
  unlist() %>%
  unique()

if(! all(all_column_names %in% (dm_dpeatdecomposition %>% dm::pull_tbl(attributes) %>% dplyr::pull(attribute_name)))) {
  cond <- purrr::map_lgl(all_column_names, function(.x) ! .x %in% (dm_dpeatdecomposition %>% dm::pull_tbl(attributes) %>% dplyr::pull(attribute_name)))
  RMariaDB::dbDisconnect(con)
  stop(paste0("New `attribute_name`s discovered: ", paste(all_column_names[cond], collapse = ", ")))
}

all_data_attributes <- unique(dm_insert_in$data$attribute_name)

if(! all(all_data_attributes %in% (dm_dpeatdecomposition %>% dm::pull_tbl(attributes) %>% dplyr::pull(attribute_name)))) {
  RMariaDB::dbDisconnect(con)
  cond <- purrr::map_lgl(all_data_attributes, function(.x) ! .x %in% (dm_dpeatdecomposition %>% dm::pull_tbl(attributes) %>% dplyr::pull(attribute_name)))
  stop(paste0("New `attribute_name`s discovered: ", paste(all_data_attributes[cond], collapse = ", ")))
}


# filter empty tables
dm_insert_in_check <-
  dm_insert_in[purrr::map_lgl(dm_insert_in, function(x) nrow(x) > 0)] %>%
  dm::as_dm() %>%
  dp_dm_add_keys(dm_dpeatdecomposition)

# copy into dm_pmird
for(i in seq_along(dm_insert_in)) {
  RMariaDB::dbAppendTable(con, name = names(dm_insert_in)[[i]], value = dm_insert_in[[i]])
}

RMariaDB::dbDisconnect(con)

4 Notes