# packages
library(metaDigitise)
library(magrittr)
library(tibble)
library(dplyr)
library(lubridate)
library(dpeatdecomposition)
library(dm)
library(RMariaDB)

1 Preparations

Connect to database

# connect to database
con <-
  RMariaDB::dbConnect(
    drv = RMariaDB::MariaDB(),
    dbname = "dpeatdecomposition",
    default.file = "~/my.cnf"
  )

# get database as dm object
dm_dpeatdecomposition <-
  dpeatdecomposition::dp_get_dm(con, learn_keys = TRUE)

Get most current IDs

id_last <- 
  list(
    id_dataset = 
      dm_dpeatdecomposition %>%
      dm::pull_tbl(datasets) %>%
      dplyr::pull(id_dataset) %>%
      tail(1),
    id_sample = 
      dm_dpeatdecomposition %>%
      dm::pull_tbl(samples) %>%
      dplyr::pull(id_sample) %>%
      tail(1),
    id_measurement = 
      dm_dpeatdecomposition %>%
      dm::pull_tbl(data) %>%
      dplyr::pull(id_measurement) %>%
      tail(1)
    ) %>%
  purrr::map(function(.x) {
    if(length(.x) == 0) {
      0L
    } else {
      .x
    }
  })

Create directories

dir_name <- "d19"
dir_source <- "../raw_data/data/d19"
dir_target <- paste0("../derived_data/", id_last$id_dataset + 1L)

if(!dir.exists(dir_target)) {
  dir.create(dir_target)
}

2 Data wrangling

2.1 dataset

datasets <- 
  tibble::tibble(
    id_dataset = id_last$id_dataset + 1L
  )

2.2 citations_to_datasets

citations_to_datasets <- 
  dplyr::bind_rows(
    db_template_tables$citations_to_datasets,
    tibble::tibble(
      id_dataset = datasets$id_dataset,
      id_citation = c("Szumigalski.1996") 
    )
  )

2.3 samples

## mass remaining
samples3 <- 
  dplyr::bind_rows(
    readRDS(paste0(dir_source, "/raw/caldat/Szumigalski.1996-Fig2a"))$processed_data,
    readRDS(paste0(dir_source, "/raw/caldat/Szumigalski.1996-Fig2b"))$processed_data,
    readRDS(paste0(dir_source, "/raw/caldat/Szumigalski.1996-Fig2c"))$processed_data,
    readRDS(paste0(dir_source, "/raw/caldat/Szumigalski.1996-Fig2d"))$processed_data,
    readRDS(paste0(dir_source, "/raw/caldat/Szumigalski.1996-Fig2e"))$processed_data,
    readRDS(paste0(dir_source, "/raw/caldat/Szumigalski.1996-Fig3"))$processed_data
  ) %>%
  dplyr::mutate(
    id_dataset = datasets$id_dataset[[1]],
    id = 
      dplyr::case_when( #---note: manually correct an erroneous label
        id == "siteopenrichfen_speciesDrepanocladus_vernicosus" ~ "siteopenrichfen_speciesDrepanocladus_vernicosus_year2",
        id == "siteopenrichfen_speciesCraex_lasiocarpa_year1" ~ "siteopenrichfen_speciesCarex_lasiocarpa_year1",
        TRUE ~ id
      ),
    site_label = 
      id %>%
      stringr::str_extract(pattern = "site[:alpha:]+") %>%
      stringr::str_remove(pattern = "^site"),
    site_name = site_label,
    taxon_rank_value = 
      id %>%
      stringr::str_extract(pattern = "species[:alpha:]+_*[:alpha:]*_year") %>%
      stringr::str_remove(pattern = "^species") %>%
      stringr::str_remove(pattern = "_year$") %>%
      stringr::str_replace_all(pattern = "_", replacement = " "),
    taxon_rank_value =
      dplyr::case_when(
        taxon_rank_value == "Carex" & ! stringr::str_detect(id, "sitesedgefen_speciesCarex") ~ "Carex spec.",
        taxon_rank_value == "Carex" & stringr::str_detect(id, "sitesedgefen_speciesCarex") ~ "Carex lasiocarpa",
        taxon_rank_value == "Tomenthypnum nitens" ~ "Tomentypnum nitens",
        TRUE ~ taxon_rank_value
      ),
    taxon_rank_name =
      dplyr::case_when(
        taxon_rank_value == "Carex spec." ~ "genus",
        TRUE ~ "species"
      ),
    incubation_duration =
      dplyr::case_when(
        stringr::str_detect(id, "year1") ~ 365,
        stringr::str_detect(id, "year2") ~ 365 * 2
      ),
    sampling_date =
      dplyr::case_when(
        stringr::str_detect(id, "year1") ~ "1991-10-01",
        stringr::str_detect(id, "year2") ~ "1992-10-01"
      ) %>%
      as.Date(),
    sampling_year = lubridate::year(sampling_date),
    sampling_month = lubridate::month(sampling_date),
    sampling_latitude =
      dplyr::case_when(
        site_label %in% c("bog", "poorfen") ~ "54°41'N",
        site_label %in% c("openrichfen", "woodedrichfen") ~ "54°28'N",
        site_label %in% c("sedgefen") ~ "54°28'N",
      ) %>%
      sp::char2dms(chd = "°", chm = "'", chs = "''") %>%
      as.numeric(),
    sampling_longitude =
      dplyr::case_when(
        site_label %in% c("bog", "poorfen") ~ "113°28'W",
        site_label %in% c("openrichfen", "woodedrichfen") ~ "113°17'W",
        site_label %in% c("sedgefen") ~ "113°20'W",
      ) %>%
      sp::char2dms(chd = "°", chm = "'", chs = "''") %>%
      as.numeric(),
    mass_relative_mass = mean/100,
    mass_absolute = NA_real_,
    mesh_size_absolute = 1,
    mass_remaining_error = error/100,
    mass_remaining_sample_size = 5, #---note: the text says sample sizes were either 5 or 10, but it is not clear what number applies. I use 5 as conservative estimate here.
    mass_remaining_error_type = "se",
    is_incubated = TRUE,
    incubation_environment = "peat",
    sample_treatment = "control",
    sample_type = "litter",
    sample_type2 = 
      dplyr::case_when(
        taxon_rank_value == "Betula pumila" ~ "Upper 15 cm of branches with leaves",
        TRUE ~ NA_character_
      ),
    taxon_organ =
      dplyr::case_when(
        taxon_rank_value %in% c("Carex spec.", "Betula pumila", "Carex lasiocarpa") ~ "aboveground",
        TRUE ~ "whole plant"
      ),
    experimental_design = 
      site_label %>%
      as.factor() %>%
      as.numeric() %>%
      as.character(),
    sample_depth_upper = 2,
    sample_depth_lower = 2,
    comments_samples =
      "Coordinates are only the approximate location of the study site, but not sampling points.  `sample_depth_upper`, `sample_depth_lower`: Approximate position based on text description."
  ) %>%
  dplyr::filter(
    ! stringr::str_detect(id, pattern = "sitesedgefen_speciesCarex_y") #---note: I assume that this is the same as the data for Carex lasiocarpa since there are data on C and N content only for C. lasiocarpa for this site.
  )


# initial masses
samples2 <- 
  samples3 %>%
  dplyr::filter(!duplicated(paste0(site_label, "_", taxon_rank_value))) %>%
  dplyr::mutate(
    mass_relative_mass = 1.0,
    mass_remaining_error = 0.0,
    incubation_duration = 0.0,
    sampling_year = 1990,
    sampling_month = 10,
    sampling_date = as.Date("1990-10-01")
  )

# litter collection
samples1 <- 
  samples2 %>%
  dplyr::mutate(
    site_label =
      dplyr::case_when(
        taxon_rank_value == "Carex lasiocarpa" ~ "sedgefen",
        TRUE ~ site_label 
      ),
    site_name = site_label,
    sampling_latitude =
      dplyr::case_when(
        site_label %in% c("bog", "poorfen") ~ "54°41'N",
        site_label %in% c("openrichfen", "woodedrichfen") ~ "54°28'N",
        site_label %in% c("sedgefen") ~ "54°28'N",
      ) %>%
      sp::char2dms(chd = "°", chm = "'", chs = "''") %>%
      as.numeric(),
    sampling_longitude =
      dplyr::case_when(
        site_label %in% c("bog", "poorfen") ~ "113°28'W",
        site_label %in% c("openrichfen", "woodedrichfen") ~ "113°17'W",
        site_label %in% c("sedgefen") ~ "113°20'W",
      ) %>%
      sp::char2dms(chd = "°", chm = "'", chs = "''") %>%
      as.numeric(),
    experimental_design = 
      site_name %>%
      as.factor() %>%
      as.numeric() %>%
      as.character(),
    sample_type = 
      dplyr::case_when(
        taxon_rank_value %in% c("Carex spec.", "Betula pumila", "Carex lasiocarpa") ~ "litter",
        TRUE ~ "vegetation"
      ),
    sample_depth_upper = 
      dplyr::case_when(
        taxon_rank_value %in% c("Carex spec.", "Betula pumila", "Carex lasiocarpa") ~ NA_real_,
        TRUE ~ 0
      ),
    sample_depth_lower = 
      dplyr::case_when(
        taxon_rank_value %in% c("Carex spec.", "Betula pumila", "Carex lasiocarpa") ~ NA_real_,
        TRUE ~ 5 #---note: assumed
      ),
    sampling_year = 1990,
    sampling_month = 9,
    sampling_day = 16,
    is_incubated = FALSE,
    incubation_duration = 0.0,
    incubation_environment = NA_character_,
    comments_samples = "Coordinates are only the approximate location of the study site, but not sampling points.  `sample_depth_upper`, `sample_depth_lower`: Assumed sampling depths."
  ) %>%
  dplyr::filter(!duplicated(paste0(site_label, "_", taxon_rank_value))) %>%
  dplyr::mutate(
    id_sample = seq_len(nrow(.)) + id_last$id_sample,
    id_sample_parent = id_sample,
    id_sample_origin = id_sample,
    id_sample_incubation_start = id_sample
  )

# add missing ids
samples2 <- 
  dplyr::bind_rows(
    samples2 %>%
      dplyr::mutate(
        type = "samples2"
      ), 
    samples3 %>%
      dplyr::mutate(
        type = "samples3"
      )
  ) %>%
  dplyr::select(-error)

samples2 <- 
  samples2 %>%
  dplyr::mutate(
    id_sample = seq_len(nrow(.)) + max(samples1$id_sample),
    id_sample_origin = 
      dplyr::left_join(
        samples2 %>% dplyr::select(site_label, taxon_rank_value),
        samples1 %>% dplyr::select(site_label, taxon_rank_value, id_sample),
        by = c("site_label", "taxon_rank_value")
      ) %>%
      dplyr::pull(id_sample),
    id_sample_origin =
      dplyr::case_when(
         taxon_rank_value == "Carex lasiocarpa" ~ rep(id_sample_origin[taxon_rank_value == "Carex lasiocarpa" & site_label == "sedgefen"][[1]], nrow(samples2)),
         TRUE ~ id_sample_origin
      ),
    id_sample_incubation_start =
      purrr::map_int(seq_len(nrow(.)), function(i) {
        index <- paste0(taxon_rank_value, "_", site_label) == paste0(taxon_rank_value, "_", site_label)[[i]] & experimental_design == experimental_design[[i]] & incubation_duration == 0
        id_sample[index]
      }),
    id_sample_parent = 
      purrr::map_int(seq_len(nrow(.)), function(i) {
        index <- paste0(taxon_rank_value, "_", site_label) == paste0(taxon_rank_value, "_", site_label)[[i]] & experimental_design == experimental_design[[i]] & incubation_duration < incubation_duration[[i]]
        if(! any(index)) {
          id_sample_origin[[i]]
        } else {
          target_incubation_duration <- max(incubation_duration[index])
          index <- index & incubation_duration == target_incubation_duration
          id_sample[index]
        }
      })
  )


# average annual water table depth
samples4 <- 
  readRDS(paste0(dir_source, "/raw/caldat/Szumigalski.1996-Fig4"))$processed_data %>%
  dplyr::select(id, x) %>%
  dplyr::rename(
    water_table_depth = "x",
    site_label = "id"
  ) %>%
  dplyr::left_join(
    samples1 %>%
      dplyr::filter(!duplicated(site_label)) %>%
      dplyr::select(dplyr::any_of(c("site_label", colnames(db_template_tables$samples)))) %>%
      dplyr::select(-comments_samples),
    by = c("site_label")
  ) %>%
  dplyr::mutate(
    site_name = site_label,
    water_table_depth = water_table_depth * (-1),
    id_sample = seq_len(nrow(.)) + max(samples2$id_sample),
    id_sample_origin = id_sample,
    id_sample_parent = id_sample,
    sampling_year = NA_real_,
    sampling_month = NA_real_,
    sampling_day = NA_real_,
    sample_type = "peat",
    sample_type2 = NA_character_,
    taxon_rank_name = NA_character_,
    taxon_rank_value = NA_character_,
    taxon_organ = NA_character_,
    is_incubated = FALSE,
    incubation_duration = 0.0,
    sample_depth_upper = 0,
    sample_depth_lower = 0,
    comments_samples = "Coordinates are only the approximate location of the study site, but not sampling points."
  )

# Tab. 1
d19_C_N <- 
  readODS::read_ods(paste0(dir_source, "/derived/Szumigalski.1996-Tab1.ods")) %>%
  dplyr::rename(
    C_to_N = "CN"
  ) %>%
  dplyr::mutate(
    N_error = N_error/100,
    N_relative_mass = N/100,
    C_relative_mass = NA_real_,
    C_absolute = NA_real_,
    N_absolute = NA_real_,
    comments_samples = "Coordinates are only the approximate location of the study site, but not sampling points. Only year and month of sample collection are known."
  ) %>%
  dplyr::select(site_label, taxon_rank_value, sampling_date, C_to_N, CN_error, N_relative_mass, N_error, C_relative_mass, C_absolute, N_absolute)

samples2 <- 
  dplyr::left_join(
    samples2,
    d19_C_N,
    by = c("site_label", "taxon_rank_value", "sampling_date")
  )


## combine
samples <- 
  dplyr::bind_rows(
    db_template_tables$samples,
    samples1 %>%
      dplyr::mutate(
        type = "samples1"
      ),
    samples2,
    samples4 %>%
      dplyr::mutate(
        type = "samples4"
      )
  )

2.4 samples_to_samples

samples_to_samples <- 
  samples %>%
  dplyr::filter(! id_sample %in% id_sample_origin) %>%
  dplyr::mutate(
    transition_description =
      dplyr::case_when(
        type %in% c("samples2") ~ "translocate",
        type %in% c("samples3") ~ "wait",
        TRUE ~ NA_character_
      )
  ) %>%
  dplyr::select(id_sample_parent, id_sample, transition_description) %>%
  dplyr::rename(
    id_sample_child = "id_sample"
  )

2.5 data

d2 <- 
  samples2 %>%
  tidyr::pivot_longer(
    cols = dplyr::all_of(c("mass_absolute", "mass_relative_mass", "mesh_size_absolute", "C_to_N", "N_relative_mass", "C_relative_mass", "C_absolute", "N_absolute")),
    names_to = "attribute_name",
    values_to = "value"
  ) %>%
  dplyr::mutate(
    id_measurement = seq_len(nrow(.)) + id_last$id_measurement,
    id_measurement_numerator =
      purrr::map_int(seq_len(nrow(.)), function(i) {
        switch(
          attribute_name[[i]],
          "C_to_N" = {
            id_measurement[id_sample == id_sample[[i]] & attribute_name == "C_relative_mass"]
          },
          "N_relative_mass" = {
            id_measurement[id_sample == id_sample[[i]] & attribute_name == "N_absolute"]
          }, 
          "C_relative_mass" = {
            id_measurement[id_sample == id_sample[[i]] & attribute_name == "C_absolute"]
          },
          "mass_relative_mass" = {
            id_measurement[id_sample == id_sample[[i]] & attribute_name == "mass_absolute"]
          },
          NA_integer_
        )
      }),
    id_measurement_denominator =
      purrr::map_int(seq_len(nrow(.)), function(i) {
        switch(
          attribute_name[[i]],
          "C_to_N" = {
            id_measurement[id_sample == id_sample[[i]] & attribute_name == "N_relative_mass"]
          },
          "N_relative_mass" = ,
          "C_relative_mass" = {
            id_measurement[id_sample == id_sample[[i]] & attribute_name == "mass_absolute"]
          },
          "mass_relative_mass" = {
            id_measurement[id_sample == id_sample_incubation_start[[i]] & attribute_name == "mass_absolute"]
          },
          NA_integer_
        )
      }),
    value_type = 
      dplyr::case_when(
        attribute_name == "mesh_size_absolute" ~ "point", 
        TRUE ~ "mean"
      ),
    error_type = 
      dplyr::case_when(
        attribute_name %in% c("mesh_size_absolute", "mass_absolute") ~ NA_character_, 
        TRUE ~ "se"
      ),
    sample_size = 
       dplyr::case_when(
        attribute_name %in% c("mesh_size_absolute", "C_absolute", "N_absolute", "C_relative_mass") ~ NA_integer_, 
        attribute_name %in% c("C_to_N", "N_relative_mass") ~ 3L,
        TRUE ~ 5L
      ),
    comments_measurements =
      dplyr::case_when(
        attribute_name %in% c("mesh_size_absolute", "C_absolute", "N_absolute", "C_relative_mass", "C_to_N", "N_relative_mass") ~ NA_character_, 
        TRUE ~ "The text says sample sizes were either 5 or 10, but it is not clear what number applies. I use 5 as conservative estimate here."
      )
  )


d2_error <- 
  samples2 %>%
  tidyr::pivot_longer(
    cols = dplyr::all_of(c("CN_error", "N_error", "mass_remaining_error")),
    names_to = "attribute_name",
    values_to = "error"
  ) %>%
  dplyr::mutate(
    attribute_name = 
      dplyr::case_when(
        attribute_name == "mass_remaining_error" ~ "mass_relative_mass",
        attribute_name == "N_error" ~ "N_relative_mass",
        attribute_name == "CN_error" ~ "C_to_N"
      )
  ) %>%
  dplyr::select(id_sample, attribute_name, error)

d2 <- 
  d2 %>%
  dplyr::mutate(
    error =
      dplyr::left_join(d2, d2_error, by = c("id_sample", "attribute_name")) %>%
      dplyr::pull(error)
  )



d4 <- 
  samples4 %>%
  tidyr::pivot_longer(
    cols = dplyr::all_of(c("water_table_depth")),
    names_to = "attribute_name",
    values_to = "value"
  ) %>%
  dplyr::mutate(
    id_measurement = seq_len(nrow(.)) + max(d2$id_measurement),
    value_type = "mean",
    sample_size = NA_integer_,
    comments_measurements = "Water table depths were measured across the ice free time. The text does not mention the exact interval over which the water table depth was measured. It also does not mention whether the water table depth was measured at the same location where litter bags were incubated."
  )

# combine
d <- 
  dplyr::bind_rows(
    db_template_tables$data,
    d2,
    d4
  ) %>%
  dplyr::select(dplyr::all_of(colnames(db_template_tables$data)))

2.6 experimental_design_format

experimental_design_format <- 
  tibble::tibble(
    id_dataset = datasets$id_dataset,
    file = paste0(id_last$id_dataset + 1L, "/experimental_design_format.csv"),
    experimental_design_description = "`site_name`: Name of the site."
  )

# csv file to export
experimental_design_format2 <- 
  samples %>%
  dplyr::filter(! is.na(experimental_design)) %>%
  dplyr::filter(! duplicated(experimental_design)) %>%
  dplyr::select(experimental_design, site_name)

# export
write.csv(experimental_design_format2, paste0(dir_target, "/experimental_design_format.csv"), row.names = FALSE)

3 Export to database

# list all tables
dm_insert_in <-
  list(
    datasets = 
      datasets %>% 
      dplyr::select(dplyr::all_of(colnames(dm_dpeatdecomposition$datasets))),
    samples = 
      samples %>% 
      dplyr::select(dplyr::all_of(colnames(dm_dpeatdecomposition$samples))),
    data = 
      d %>% 
      dplyr::select(dplyr::all_of(colnames(dm_dpeatdecomposition$data))),
    samples_to_samples = 
      samples_to_samples %>% 
      dplyr::select(dplyr::all_of(colnames(dm_dpeatdecomposition$samples_to_samples))),
    citations_to_datasets = 
      citations_to_datasets %>% 
      dplyr::select(dplyr::all_of(colnames(dm_dpeatdecomposition$citations_to_datasets))),
    experimental_design_format = 
      experimental_design_format %>% 
      dplyr::select(dplyr::all_of(colnames(dm_dpeatdecomposition$experimental_design_format)))
  )

# check whether all column names as present in table attributes
all_column_names <- 
  purrr::map(dm_insert_in, colnames) %>%
  unlist() %>%
  unique()

if(! all(all_column_names %in% (dm_dpeatdecomposition %>% dm::pull_tbl(attributes) %>% dplyr::pull(attribute_name)))) {
  cond <- purrr::map_lgl(all_column_names, function(.x) ! .x %in% (dm_dpeatdecomposition %>% dm::pull_tbl(attributes) %>% dplyr::pull(attribute_name)))
  RMariaDB::dbDisconnect(con)
  stop(paste0("New `attribute_name`s discovered: ", paste(all_column_names[cond], collapse = ", ")))
}

all_data_attributes <- unique(dm_insert_in$data$attribute_name)

if(! all(all_data_attributes %in% (dm_dpeatdecomposition %>% dm::pull_tbl(attributes) %>% dplyr::pull(attribute_name)))) {
  RMariaDB::dbDisconnect(con)
  cond <- purrr::map_lgl(all_data_attributes, function(.x) ! .x %in% (dm_dpeatdecomposition %>% dm::pull_tbl(attributes) %>% dplyr::pull(attribute_name)))
  stop(paste0("New `attribute_name`s discovered: ", paste(all_data_attributes[cond], collapse = ", ")))
}


# filter empty tables
dm_insert_in_check <-
  dm_insert_in[purrr::map_lgl(dm_insert_in, function(x) nrow(x) > 0)] %>%
  dm::as_dm() %>%
  dp_dm_add_keys(dm_dpeatdecomposition)

# copy into dm_pmird
for(i in seq_along(dm_insert_in)) {
  RMariaDB::dbAppendTable(con, name = names(dm_insert_in)[[i]], value = dm_insert_in[[i]])
}

RMariaDB::dbDisconnect(con)

4 Notes