# packages
library(metaDigitise)
library(magrittr)
library(tibble)
library(dplyr)
library(lubridate)
library(dpeatdecomposition)
library(dm)
library(RMariaDB)

1 Preparations

Connect to database

# connect to database
con <-
  RMariaDB::dbConnect(
    drv = RMariaDB::MariaDB(),
    dbname = "dpeatdecomposition",
    default.file = "~/my.cnf"
  )

# get database as dm object
dm_dpeatdecomposition <-
  dpeatdecomposition::dp_get_dm(con, learn_keys = TRUE)

Get most current IDs

id_last <- 
  list(
    id_dataset = 
      dm_dpeatdecomposition %>%
      dm::pull_tbl(datasets) %>%
      dplyr::pull(id_dataset) %>%
      tail(1),
    id_sample = 
      dm_dpeatdecomposition %>%
      dm::pull_tbl(samples) %>%
      dplyr::pull(id_sample) %>%
      tail(1),
    id_measurement = 
      dm_dpeatdecomposition %>%
      dm::pull_tbl(data) %>%
      dplyr::pull(id_measurement) %>%
      tail(1)
    ) %>%
  purrr::map(function(.x) {
    if(length(.x) == 0) {
      0L
    } else {
      .x
    }
  })

Create directories

dir_name <- "d13"
dir_source <- "../raw_data/data/d13"
dir_target <- paste0("../derived_data/", id_last$id_dataset + 1L)

if(!dir.exists(dir_target)) {
  dir.create(dir_target)
}

2 Data wrangling

2.1 dataset

datasets <- 
  tibble::tibble(
    id_dataset = id_last$id_dataset + 1L
  )

2.2 citations_to_datasets

citations_to_datasets <- 
  dplyr::bind_rows(
    db_template_tables$citations_to_datasets,
    tibble::tibble(
      id_dataset = datasets$id_dataset,
      id_citation = c("Bartsch.1985") 
    )
  )

2.3 samples

## mass remaining

# extracted data
samples2 <- 
  tibble::tibble(
    plot_name = paste0(dir_source, "/raw/caldat/Bartsch.1985-Fig1", letters[1:11]),
    taxon_rank_name = c(rep("species", 8), rep("genus", 2), NA_character_),
    taxon_rank_value = c("Carex limosa", "Carex rostrata", "Carex aquatilis", "Carex chordorrhiza", "Scirpus cespitosus", "Betula glandulosa", "Salix pedicellaris", "Sphagnum lindbergii", rep("Sphagnum", 2), NA_character_),
    taxon_organ = c(rep("leaves", 7), rep("whole plant", 3), NA_character_),
    sample_type = c(rep("litter", 10), "Borregaard cellulose"),
    sample_type2 = c(rep("litter", 8), "lawn sphagna", "hummock sphagna", "Borregaard cellulose"),
    sampling_longitude = 
      "66°42'E" %>%
      sp::char2dms(chd = "°", chm = "'", chs = "''") %>%
      as.numeric(),
    sampling_latitude = 
      "54°43'N" %>%
      sp::char2dms(chd = "°", chm = "'", chs = "''") %>%
      as.numeric(),
    mesh_size_absolute = 1.4,
    extracted_data =
      purrr::map(plot_name, function(.x) {
          readRDS(.x)$processed_data %>%
          dplyr::select(x, y, group) %>%
          dplyr::mutate(
            x = 
              dplyr::case_when(
                x <= 300 ~ as.Date("1982-06-01"),
                TRUE ~ as.Date("1982-08-01")
              ),
            y = y/100
          ) %>%
          dplyr::rename(
            sampling_date = "x",
            mass_relative_mass = "y",
            id_site = "group"
          )
      }),
    comments_samples = "No exact coordinates known, the coordinates are only the approximate location (approximately 10 km uncertainty). Litter bags were placed at the peat water surface in late August 1981."
  ) %>%
  tidyr::unnest(extracted_data) %>%
  dplyr::arrange(taxon_rank_value, sample_type2, id_site, sampling_date) %>%
  dplyr::mutate(
    id_site =
      dplyr::case_when(
        taxon_rank_value == "Carex chordorrhiza" ~ 2,
        taxon_rank_value == "Carex aquatilis" & id_site == 1L ~ 2,
        taxon_rank_value == "Carex aquatilis" & id_site == 2L ~ 4,
        taxon_rank_value == "Sphagnum lindbergii" & id_site == 2L ~ 3,
        TRUE ~ id_site
      ) %>%
      as.character(),
    id_dataset = datasets$id_dataset[[1]],
    experimental_design = id_site,
    sampling_year = lubridate::year(sampling_date),
    sampling_month = lubridate::month(sampling_date),
    samplig_day = NA_real_,
    sample_treatment = "control",
    incubation_environment = "peat",
    is_incubated = TRUE,
    incubation_duration = as.numeric(sampling_date - as.Date("1981-08-01")),
    water_table_depth = 0.0,
    mass_absolute = NA_real_
  ) %>%
  dplyr::arrange(incubation_duration, sample_type2, taxon_rank_value, experimental_design)

# samples at starting time
samples1 <- 
  samples2 %>%
  dplyr::filter(!duplicated(paste0(taxon_rank_value, "_", sample_type2, "_", experimental_design))) %>%
  dplyr::mutate(
    mass_relative_mass = 1.0,
    incubation_duration = 0.0,
    sampling_year = 1981,
    sampling_month = 8,
    id_sample = seq_len(nrow(.)) + id_last$id_sample,
    id_sample_parent = id_sample,
    id_sample_origin = id_sample,
    id_sample_incubation_start = id_sample
  )

# add missing ids
samples2 <- 
  samples2 %>%
  dplyr::mutate(
    id_sample = seq_len(nrow(.)) + max(samples1$id_sample),
    id_sample_origin = 
      dplyr::left_join(
        samples2 %>% dplyr::select(sample_type2, taxon_rank_value, experimental_design),
        samples1 %>% dplyr::select(sample_type2, taxon_rank_value, experimental_design, id_sample),
        by = c("sample_type2", "taxon_rank_value", "experimental_design")
      ) %>%
      dplyr::pull(id_sample),
    id_sample_incubation_start = id_sample_origin,
    id_sample_parent = 
      purrr::map_int(seq_len(nrow(.)), function(i) {
        index <- paste0(taxon_rank_value, "_", sample_type2) == paste0(taxon_rank_value, "_", sample_type2)[[i]] & experimental_design == experimental_design[[i]] & incubation_duration < incubation_duration[[i]]
        if(! any(index)) {
          id_sample_origin[[i]]
        } else {
          target_incubation_duration <- max(incubation_duration[index])
          index <- index & incubation_duration == target_incubation_duration
          id_sample[index]
        }
      })
  )

samples3 <- 
  samples2 %>%
  dplyr::filter(incubation_duration == 365)

samples2 <- 
  samples2 %>%
  dplyr::filter(incubation_duration == 304)

## add element contents
samples1 <- 
  dplyr::left_join(
    samples1,
    dplyr::bind_rows(
      readODS::read_ods(paste0(dir_source, "/derived/Bartsch.1985-Tab5.ods")) %>%
        dplyr::mutate(
          sampling_date = as.Date("1981-08-01"),
          sampling_year = lubridate::year(sampling_date),
          sampling_month = lubridate::month(sampling_date)
        )
    ) %>%
      dplyr::select(sample_type2, taxon_rank_value, taxon_organ, id_site, P, N, K, Ca, Mg, sampling_year, sampling_month) %>%
      dplyr::rename_with(.cols = dplyr::all_of(c("P", "N", "K", "Ca", "Mg")), .fn = function(.x) paste0(.x, "_relative_mass")) %>%
      dplyr::mutate(
        dplyr::across(
          dplyr::all_of(paste0(c("P", "N", "K", "Ca", "Mg"), "_relative_mass")),
          function(.x) .x/100
        ),
        id_site = as.character(id_site),
        P_absolute = NA_real_,
        N_absolute = NA_real_,
        K_absolute = NA_real_,
        Ca_absolute = NA_real_,
        Mg_absolute = NA_real_,
        sample_type2 =
          dplyr::case_when(
            sample_type2 == "vegetation" ~ "litter",
            TRUE ~ sample_type2
          )
      ),
    by = c("sample_type2", "taxon_rank_value", "taxon_organ", "id_site", "sampling_year", "sampling_month")
  )
  
samples3 <- 
  dplyr::left_join(
    samples3,
    dplyr::bind_rows(
      readODS::read_ods(paste0(dir_source, "/derived/Bartsch.1985-Tab6.ods")) %>%
        dplyr::mutate(
          sampling_date = as.Date("1982-08-01"),
          sampling_year = lubridate::year(sampling_date),
          sampling_month = lubridate::month(sampling_date)
        )
    ) %>%
      dplyr::select(sample_type2, taxon_rank_value, taxon_organ, id_site, P, N, K, Ca, Mg, P_error, N_error, K_error, Ca_error, Mg_error, sampling_year, sampling_month) %>%
      dplyr::rename_with(.cols = dplyr::all_of(c("P", "N", "K", "Ca", "Mg")), .fn = function(.x) paste0(.x, "_relative_mass")) %>%
      dplyr::mutate(
        dplyr::across(
          dplyr::all_of(paste0(c("P", "N", "K", "Ca", "Mg"), "_relative_mass")),
          function(.x) .x/100
        ),
        dplyr::across(
          dplyr::all_of(paste0(c("P", "N", "K", "Ca", "Mg"), "_error")),
          function(.x) .x/100
        ),
        id_site = as.character(id_site),
        P_absolute = NA_real_,
        N_absolute = NA_real_,
        K_absolute = NA_real_,
        Ca_absolute = NA_real_,
        Mg_absolute = NA_real_,
        sample_type2 =
          dplyr::case_when(
            sample_type2 == "vegetation" ~ "litter",
            TRUE ~ sample_type2
          )
      ),
    by = c("sample_type2", "taxon_rank_value", "taxon_organ", "id_site", "sampling_year", "sampling_month")
  )


samples4 <- 
  dplyr::bind_rows(
    tibble::tibble(
      value_type = "min",
      pH = c(4.6, 5.2, 5.4, 7.0)
    ),
    tibble::tibble(
      value_type = "max",
      pH = c(5.5, 7.4, 7.5, 7.5)
    )
  ) %>%
  dplyr::mutate(
    id_dataset = datasets$id_dataset[[1]],
    id_sample = seq_len(nrow(.)) + max(samples3$id_sample),
    id_sample_origin = id_sample,
    id_sample_parent = id_sample,
    id_site = as.character(rep(1:4, 2)),
    experimental_design = as.character(id_site),
    sampling_longitude = 
      "66°42'E" %>%
      sp::char2dms(chd = "°", chm = "'", chs = "''") %>%
      as.numeric(),
    sampling_latitude = 
      "54°43'E" %>%
      sp::char2dms(chd = "°", chm = "'", chs = "''") %>%
      as.numeric(),
    sample_type = "water",
    sample_type2 = "pore_water",
    treatment = "control",
    incubation_duration = 0,
    is_incubated = FALSE,
    comments_samples = "No exact coordinates known, the coordinates are only the approximate location (approximately 10 km uncertainty). No sampling depth or date known."
  )

## combine
samples <- 
  dplyr::bind_rows(
    db_template_tables$samples,
    samples1 %>%
      dplyr::mutate(
        type = "samples1"
      ),
    samples2 %>%
      dplyr::mutate(
        type = "samples2"
      ),
    samples3 %>%
      dplyr::mutate(
        type = "samples3"
      ),
    samples4 %>%
      dplyr::mutate(
        type = "samples4"
      )
  )

2.4 samples_to_samples

samples_to_samples <- 
  samples %>%
  dplyr::filter(! id_sample %in% id_sample_origin) %>%
  dplyr::mutate(
    transition_description =
      dplyr::case_when(
        type %in% c("samples2", "samples3") ~ "wait",
        TRUE ~ NA_character_
      )
  ) %>%
  dplyr::select(id_sample_parent, id_sample, transition_description) %>%
  dplyr::rename(
    id_sample_child = "id_sample"
  )

2.5 data

d1 <- 
  samples1 %>%
  tidyr::pivot_longer(
    cols = dplyr::all_of(c("mass_absolute", "mass_relative_mass", "mesh_size_absolute", "water_table_depth", "P_relative_mass", "N_relative_mass", "K_relative_mass", "Ca_relative_mass", "Mg_relative_mass", "P_absolute", "N_absolute", "K_absolute", "Ca_absolute", "Mg_absolute")),
    names_to = "attribute_name",
    values_to = "value"
  ) %>%
  dplyr::mutate(
    id_measurement = seq_len(nrow(.)) + id_last$id_measurement,
    id_measurement_numerator =
      purrr::map_int(seq_len(nrow(.)), function(i) {
        switch(
          attribute_name[[i]],
          "mass_relative_mass" = {
            id_measurement[id_sample == id_sample[[i]] & attribute_name == "mass_absolute"]
          },
          "P_relative_mass" =,
          "N_relative_mass" =,  
          "K_relative_mass" =,
          "Ca_relative_mass" =,
          "Mg_relative_mass" = {
            id_measurement[id_sample == id_sample[[i]] & attribute_name == paste0(stringr::str_remove(attribute_name[[i]], "_relative_mass$"), "_absolute")]
          },
          NA_integer_
        )
      }),
    id_measurement_denominator =
      purrr::map_int(seq_len(nrow(.)), function(i) {
        switch(
          attribute_name[[i]],
          "mass_relative_mass" =,
          "P_relative_mass" =,
          "N_relative_mass" =,  
          "K_relative_mass" =,
          "Ca_relative_mass" =,
          "Mg_relative_mass" = {
            id_measurement[id_sample == id_sample[[i]] & attribute_name == "mass_absolute"]
          },
          NA_integer_
        )
      }),
    value_type = 
      dplyr::case_when(
        attribute_name == "mass_relative_mass" ~ "mean", 
        TRUE ~ "point"
      ),
    sample_size = 
      dplyr::case_when(
        value_type == "mean" ~ 3L,
        TRUE ~ NA_integer_
      ),
    comments_measurement =
      dplyr::case_when(
        attribute_name == "water_table_depth" ~ "Approximate position of the water table depth at the time samples were inserted into the peat to start the decomposition experiment.",
        TRUE ~ NA_character_
      )
  )

d2 <- 
  samples2 %>%
  tidyr::pivot_longer(
    cols = dplyr::all_of(c("mass_absolute", "mass_relative_mass", "mesh_size_absolute")),
    names_to = "attribute_name",
    values_to = "value"
  ) %>%
  dplyr::mutate(
    id_measurement = seq_len(nrow(.)) + max(d1$id_measurement),
    id_measurement_numerator =
      purrr::map_int(seq_len(nrow(.)), function(i) {
        switch(
          attribute_name[[i]],
          "mass_relative_mass" = {
            id_measurement[id_sample == id_sample[[i]] & attribute_name == "mass_absolute"]
          },
          "P_relative_mass" =,
          "N_relative_mass" =,  
          "K_relative_mass" =,
          "Ca_relative_mass" =,
          "Mg_relative_mass" = {
            id_measurement[id_sample == id_sample[[i]] & attribute_name == paste0(stringr::str_remove(attribute_name[[i]], "_relative_mass$"), "_absolute")]
          },
          NA_integer_
        )
      }),
    id_measurement_denominator =
      purrr::map_int(seq_len(nrow(.)), function(i) {
        switch(
          attribute_name[[i]],
          "mass_relative_mass" = {
            d1$id_measurement[d1$id_sample == id_sample_origin[[i]] & d1$attribute_name == "mass_absolute"]
          },
          "P_relative_mass" =,
          "N_relative_mass" =,  
          "K_relative_mass" =,
          "Ca_relative_mass" =,
          "Mg_relative_mass" = {
            d1$id_measurement[d1$id_sample == id_sample_origin[[i]] & d1$attribute_name == paste0(stringr::str_remove(attribute_name[[i]], "_relative_mass$"), "_absolute")]
          },
          NA_integer_
        )
      }),
    value_type = 
      dplyr::case_when(
        attribute_name == "mesh_size_absolute" ~ "point", 
        TRUE ~ "mean"
      ),
    sample_size = 
      dplyr::case_when(
        value_type == "mean" ~ 3L,
        TRUE ~ NA_integer_
      ),
    comments_measurement =
      dplyr::case_when(
        attribute_name == "water_table_depth" ~ "Approximate position of the water table depth at the time samples were inserted into the peat to start the decomposition experiment.",
        TRUE ~ NA_character_
      )
  )

d3 <- 
  samples3 %>%
  tidyr::pivot_longer(
    cols = dplyr::all_of(c("mass_absolute", "mass_relative_mass", "mesh_size_absolute", "P_relative_mass", "N_relative_mass", "K_relative_mass", "Ca_relative_mass", "Mg_relative_mass", "P_absolute", "N_absolute", "K_absolute", "Ca_absolute", "Mg_absolute")),
    names_to = "attribute_name",
    values_to = "value"
  ) %>%
  dplyr::mutate(
    id_measurement = seq_len(nrow(.)) + max(d2$id_measurement),
    id_measurement_numerator =
      purrr::map_int(seq_len(nrow(.)), function(i) {
        switch(
          attribute_name[[i]],
          "mass_relative_mass" = {
            id_measurement[id_sample == id_sample[[i]] & attribute_name == "mass_absolute"]
          },
          "P_relative_mass" =,
          "N_relative_mass" =,  
          "K_relative_mass" =,
          "Ca_relative_mass" =,
          "Mg_relative_mass" = {
            id_measurement[id_sample == id_sample[[i]] & attribute_name == paste0(stringr::str_remove(attribute_name[[i]], "_relative_mass$"), "_absolute")]
          },
          NA_integer_
        )
      }),
    id_measurement_denominator =
      purrr::map_int(seq_len(nrow(.)), function(i) {
        switch(
          attribute_name[[i]],
          "mass_relative_mass" = {
            d1$id_measurement[d1$id_sample == id_sample_origin[[i]] & d1$attribute_name == "mass_absolute"]
          },
          "P_relative_mass" =,
          "N_relative_mass" =,  
          "K_relative_mass" =,
          "Ca_relative_mass" =,
          "Mg_relative_mass" = {
            d1$id_measurement[d1$id_sample == id_sample_origin[[i]] & d1$attribute_name == paste0(stringr::str_remove(attribute_name[[i]], "_relative_mass$"), "_absolute")]
          },
          NA_integer_
        )
      }),
    value_type = 
      dplyr::case_when(
        attribute_name == "mesh_size_absolute" ~ "point", 
        TRUE ~ "mean"
      ),
    sample_size = 
      dplyr::case_when(
        value_type == "mean" ~ 3L,
        TRUE ~ NA_integer_
      ),
    comments_measurement =
      dplyr::case_when(
        attribute_name == "water_table_depth" ~ "Approximate position of the water table depth at the time samples were inserted into the peat to start the decomposition experiment.",
        TRUE ~ NA_character_
      )
  )

d3_error <- 
  samples3 %>%
  tidyr::pivot_longer(
    cols = dplyr::all_of(paste0(c("P", "N", "K", "Ca", "Mg"), "_error")),
    names_to = "attribute_name",
    values_to = "error"
  ) %>%
  dplyr::mutate(
    attribute_name =
      attribute_name %>%
      stringr::str_remove(pattern = "_error$") %>%
      paste0(., "_relative_mass")
  ) %>%
  dplyr::select(id_sample, attribute_name, error)

d3 <- 
  d3 %>%
  dplyr::mutate(
    error =
      dplyr::left_join(d3, d3_error, by = c("id_sample", "attribute_name")) %>%
      dplyr::pull(error),
    error_type =
      dplyr::case_when(
        is.na(error) ~ NA_character_,
        TRUE ~ "sd"
      )
  )

d4 <- 
  samples4 %>%
  dplyr::rename(
    value = "pH"
  ) %>%
  dplyr::mutate(
    attribute_name = "pH",
    id_measurement = seq_len(nrow(.)) + max(d3$id_measurement),
    comments_measurement =
      dplyr::case_when(
        attribute_name == "water_table_depth" ~ "Time range and sample frequency over which the minimum/maximum was computed are unknown.",
        TRUE ~ NA_character_
      )
  )


# combine
d <- 
  dplyr::bind_rows(
    db_template_tables$data,
    d1,
    d2,
    d3,
    d4
  ) %>%
  dplyr::select(dplyr::all_of(colnames(db_template_tables$data)))

2.6 experimental_design_format

experimental_design_format <- 
  tibble::tibble(
    id_dataset = datasets$id_dataset,
    file = paste0(id_last$id_dataset + 1L, "/experimental_design_format.csv"),
    experimental_design_description = "`site_name`: Name of the site."
  )

# csv file to export
experimental_design_format2 <- 
  samples %>%
  dplyr::filter(! is.na(experimental_design)) %>%
  dplyr::filter(! duplicated(experimental_design)) %>%
  dplyr::select(experimental_design, site_name)

# export
write.csv(experimental_design_format2, paste0(dir_target, "/experimental_design_format.csv"), row.names = FALSE)

3 Export to database

# list all tables
dm_insert_in <-
  list(
    datasets = 
      datasets %>% 
      dplyr::select(dplyr::all_of(colnames(dm_dpeatdecomposition$datasets))),
    samples = 
      samples %>% 
      dplyr::select(dplyr::all_of(colnames(dm_dpeatdecomposition$samples))),
    data = 
      d %>% 
      dplyr::select(dplyr::all_of(colnames(dm_dpeatdecomposition$data))),
    samples_to_samples = 
      samples_to_samples %>% 
      dplyr::select(dplyr::all_of(colnames(dm_dpeatdecomposition$samples_to_samples))),
    citations_to_datasets = 
      citations_to_datasets %>% 
      dplyr::select(dplyr::all_of(colnames(dm_dpeatdecomposition$citations_to_datasets))),
    experimental_design_format = 
      experimental_design_format %>% 
      dplyr::select(dplyr::all_of(colnames(dm_dpeatdecomposition$experimental_design_format)))
  )

# check whether all column names as present in table attributes
all_column_names <- 
  purrr::map(dm_insert_in, colnames) %>%
  unlist() %>%
  unique()

if(! all(all_column_names %in% (dm_dpeatdecomposition %>% dm::pull_tbl(attributes) %>% dplyr::pull(attribute_name)))) {
  cond <- purrr::map_lgl(all_column_names, function(.x) ! .x %in% (dm_dpeatdecomposition %>% dm::pull_tbl(attributes) %>% dplyr::pull(attribute_name)))
  RMariaDB::dbDisconnect(con)
  stop(paste0("New `attribute_name`s discovered: ", paste(all_column_names[cond], collapse = ", ")))
}

all_data_attributes <- unique(dm_insert_in$data$attribute_name)

if(! all(all_data_attributes %in% (dm_dpeatdecomposition %>% dm::pull_tbl(attributes) %>% dplyr::pull(attribute_name)))) {
  RMariaDB::dbDisconnect(con)
  cond <- purrr::map_lgl(all_data_attributes, function(.x) ! .x %in% (dm_dpeatdecomposition %>% dm::pull_tbl(attributes) %>% dplyr::pull(attribute_name)))
  stop(paste0("New `attribute_name`s discovered: ", paste(all_data_attributes[cond], collapse = ", ")))
}


# filter empty tables
dm_insert_in_check <-
  dm_insert_in[purrr::map_lgl(dm_insert_in, function(x) nrow(x) > 0)] %>%
  dm::as_dm() %>%
  dp_dm_add_keys(dm_dpeatdecomposition)

# copy into dm_pmird
for(i in seq_along(dm_insert_in)) {
  RMariaDB::dbAppendTable(con, name = names(dm_insert_in)[[i]], value = dm_insert_in[[i]])
}

RMariaDB::dbDisconnect(con)

4 Notes