# packages
library(metaDigitise)
library(magrittr)
library(tibble)
library(dplyr)
library(lubridate)
library(dpeatdecomposition)
library(dm)
library(RMariaDB)

1 Preparations

Connect to database

# connect to database
con <-
  RMariaDB::dbConnect(
    drv = RMariaDB::MariaDB(),
    dbname = "dpeatdecomposition",
    default.file = "~/my.cnf"
  )

# get database as dm object
dm_dpeatdecomposition <-
  dpeatdecomposition::dp_get_dm(con, learn_keys = TRUE)

Get most current IDs

id_last <- 
  list(
    id_dataset = 
      dm_dpeatdecomposition %>%
      dm::pull_tbl(datasets) %>%
      dplyr::pull(id_dataset) %>%
      tail(1),
    id_sample = 
      dm_dpeatdecomposition %>%
      dm::pull_tbl(samples) %>%
      dplyr::pull(id_sample) %>%
      tail(1),
    id_measurement = 
      dm_dpeatdecomposition %>%
      dm::pull_tbl(data) %>%
      dplyr::pull(id_measurement) %>%
      tail(1)
    ) %>%
  purrr::map(function(.x) {
    if(length(.x) == 0) {
      0L
    } else {
      .x
    }
  })

Create directories

dir_name <- "d52"
dir_source <- "../raw_data/data/d52"
dir_target <- paste0("../derived_data/", id_last$id_dataset + 1L)

if(!dir.exists(dir_target)) {
  dir.create(dir_target)
}

2 Data wrangling

2.1 dataset

datasets <- 
  tibble::tibble(
    id_dataset = id_last$id_dataset + 1L
  )

2.2 citations_to_datasets

citations_to_datasets <- 
  dplyr::bind_rows(
    db_template_tables$citations_to_datasets,
    tibble::tibble(
      id_dataset = datasets$id_dataset,
      id_citation = c("Bengtsson.2018a", "Bengtsson.2018") 
    )
  )

2.3 samples

# litter chemstry
samples1 <- 
  read.csv(paste0(dir_source, "/raw/bengtsson_etal_2018_LQ.csv")) %>%
  dplyr::rename(
    sample_label = "sample",
    site_name = "site",
    sampling_latitude = "latitude",
    sampling_longitude = "longitude",
    taxon_rank_value = "species.name",
    site_type = "vegetation.type",
    sample_microhabitat = "microtopographical.position",
    holocellulose_mass_relative_mass = "HC_mg_g",
    sphagnan_mass_relative_mass = "sphagn_litter_mg_g",
    phenolics_tannic_acid_equivalents_mass_relative_mass = "phenolics_TA_mg_g",
    phenolics_PHBA_equivalents_mass_relative_mass = "phenolics_PHBA_mg_g",
    Klason_lignin_mass_relative_mass = "totalKL_mg_g",
    soluble_Klason_lignin_mass_relative_mass = "solubleKL_mg_g",
    cation_exchange_capacity_relative_mass = "CEC_meq_g",
    N_relative_mass = "N_mg_g",
    C_relative_mass = "C_mg_g",
    P_relative_mass = "PO4_mg_g" #---question: the readme does not explicitly say whether this is elemental P or phosphate content
  ) %>%
  dplyr::select(! dplyr::any_of(c("species.code", "author.citation", "section", "sphagn_HC_mg_g", "KL_mg_g", "CNratio", "abs_ratio_205_280", "solubleKL_perc_of_totalKL", "dev_100perc", "HWT2012", "losslab2b", "lossfield"))) %>%
  dplyr::mutate(
    dplyr::across(dplyr::any_of(c("holocellulose_mass_relative_mass", "sphagnan_mass_relative_mass", "phenolics_tannic_acid_equivalents_mass_relative_mass", "phenolics_PHBA_equivalents_mass_relative_mass", "N_relative_mass", "C_relative_mass", "P_relative_mass", "soluble_Klason_lignin_mass_relative_mass", "Klason_lignin_mass_relative_mass")), function(.x) {
      .x %>% 
        units::set_units("mg/g") %>% 
        units::set_units("g/g") %>%
        units::drop_units()
    }),
    dplyr::across(
      dplyr::ends_with("_relative_mass"),
      function(.x) NA_real_,
      .names = "{.col}_absolute"
    ),
    plot_label = {
      
      digit <- 
        sample_label %>%
        stringr::str_extract(pattern = "\\d+$") %>%
        as.numeric()
      
      digit <- ifelse(digit < 10, paste0("0", digit), as.character(digit))
      
      rest <- 
        sample_label %>%
        stringr::str_remove(pattern = "\\d+$")
      
      paste0(rest, digit)
      
    },
    taxon_rank_value = 
      taxon_rank_value %>%
      stringr::str_replace(pattern = "_", replacement = " "),
    taxon_rank_name = "species",
    shade = 
      dplyr::case_when(
        shade == "1open" ~ "open",
        shade == "2semi" ~ "semi-open",
        shade == "3dark" ~ "shaded"
      ),
    sampling_year = 2013,
    sampling_month = 10,
    sampling_day = NA_integer_,
    sample_treatment = "control",
    sample_type = "litter",
    taxon_organ = "shoots",
    sample_depth_upper = 1,
    sample_depth_lower = 
      dplyr::case_when(
        taxon_rank_value == "Sphagnum tenellum" ~ 3,
        TRUE ~ 4
      ),
    dplyr::across(dplyr::any_of(c("site_type", "microhabitat")), tolower),
    comments_samples = "Coordinates are only the approximate location of the study site, but not sampling points. The exact day of sampling is unknown. Each sample corresponds to one separate patch (plot, identified by `plot_label`) on the peatlands from which samples were taken. Samples were collected at the same patches as samples from @Bengtsson.2016, but are not the same samples. I assumed that the capitulum of moss shoots had a length of 1 cm.",
    comments_samples =
      dplyr::case_when(
        taxon_rank_value == "Sphagnum balticum" ~ paste0(comments_samples, " @Bengtsson.2018 mentions that for some replicates for S. balticum, shoots of a length of only 2 cm were collected."),
        TRUE ~ comments_samples
      ),
    id_dataset = datasets$id_dataset[[1]],
    is_incubated = FALSE,
    incubation_duration = 0.0,
    incubation_environment = "peat",
    id_sample = seq_len(nrow(.)) + id_last$id_sample,
    id_sample_origin = id_sample,
    id_sample_parent = id_sample,
    id_sample_incubation_start = NA_integer_,
    experimental_design =
      paste0(
        as.numeric(as.factor(sample_treatment)), "_",
        as.numeric(as.factor(site_name)), "_",
        as.numeric(as.factor(plot_label))
      ),
    mass_absolute = NA_real_
  ) %>%
  dplyr::rename_with(
    .fn = function(.x) {
      stringr::str_replace(.x, "_relative_mass_absolute$", "_absolute")
    }
  )


## combine
samples <- 
  dplyr::bind_rows(
    db_template_tables$samples,
    samples1 %>%
      dplyr::mutate(
        type = "samples1"
      )
  )

2.4 samples_to_samples

samples_to_samples <- 
  samples %>%
  dplyr::filter(! id_sample %in% id_sample_origin) %>%
  dplyr::mutate(
    transition_description =
      dplyr::case_when(
        type %in% c("samples2") ~ "translocate",
        type %in% c("samples3") ~ "wait",
        TRUE ~ NA_character_
      )
  ) %>%
  dplyr::select(id_sample_parent, id_sample, transition_description) %>%
  dplyr::rename(
    id_sample_child = "id_sample"
  )

2.5 data

d1 <- 
  samples1 %>%
  tidyr::pivot_longer(
    cols = dplyr::any_of(c("mass_absolute", "mass_relative_mass", "mesh_size_absolute", paste0(PeriodicTable:::periodicTable$symb, "_relative_mass"), paste0(PeriodicTable:::periodicTable$symb, "_absolute"), 
                           paste0(c("soluble_Klason_lignin_mass", "Klason_lignin_mass", "holocellulose_mass", "sphagnan_mass", "phenolics_tannic_acid_equivalents_mass", "phenolics_PHBA_equivalents_mass", "cation_exchange_capacity", "soluble_Klason_lignin_mass"), "_relative_mass"),
                           paste0(c("soluble_Klason_lignin_mass", "Klason_lignin_mass", "holocellulose_mass", "sphagnan_mass", "phenolics_tannic_acid_equivalents_mass", "phenolics_PHBA_equivalents_mass", "cation_exchange_capacity", "soluble_Klason_lignin_mass"), "_absolute"))),
    names_to = "attribute_name",
    values_to = "value"
  ) %>%
  dplyr::mutate(
    id_measurement = seq_len(nrow(.)) + id_last$id_measurement,
    id_measurement_numerator =
      purrr::map_int(seq_len(nrow(.)), function(i) {
        if(attribute_name[[i]] == "mass_relative_mass") {
          id_measurement[id_sample == id_sample[[i]] & attribute_name == "mass_absolute"]
        } else if (stringr::str_detect(attribute_name[[i]], pattern = "_relative_mass$")) {
          id_measurement[id_sample == id_sample[[i]] & attribute_name == stringr::str_replace(attribute_name[[i]], "_relative_mass2?", "_absolute")]
        } else {
          NA_integer_
        }
      }),
    id_measurement_denominator =
      purrr::map_int(seq_len(nrow(.)), function(i) {
        if(attribute_name[[i]] == "mass_relative_mass") {
          id_measurement[id_sample == id_sample_incubation_start[[i]] & attribute_name == "mass_absolute"]
        } else if (stringr::str_detect(attribute_name[[i]], pattern = "_relative_mass$")) {
          id_measurement[id_sample == id_sample[[i]] & attribute_name == "mass_absolute"]
        } else {
          NA_integer_
        }
      }),
    value_type = "point",
    sample_size = 1L
  )


# combine
d <- 
  dplyr::bind_rows(
    db_template_tables$data,
    d1
  ) %>%
  dplyr::select(dplyr::all_of(colnames(db_template_tables$data)))

2.6 experimental_design_format

experimental_design_format <- 
  tibble::tibble(
    id_dataset = datasets$id_dataset,
    file = paste0(id_last$id_dataset + 1L, "/experimental_design_format.csv"),
    experimental_design_description = "`site_name`: Character value: Site where the samples grew (and were collected). `plot_label`: Character value. Plot label where the samples grew (and from which samples were collected). `site_type`: Character type describing the peatland type. `shade`: Character value describing how much the floow is shaded by trees/shrubs."
  )

# csv file to export
experimental_design_format2 <- 
  samples %>%
  dplyr::filter(! is.na(experimental_design)) %>%
  dplyr::filter(! duplicated(experimental_design)) %>%
  dplyr::select(experimental_design, sample_treatment, site_name, plot_label, site_type, shade)

# export
write.csv(experimental_design_format2, paste0(dir_target, "/experimental_design_format.csv"), row.names = FALSE)

3 Export to database

# list all tables
dm_insert_in <-
  list(
    datasets = 
      datasets %>% 
      dplyr::select(dplyr::all_of(colnames(dm_dpeatdecomposition$datasets))),
    samples = 
      samples %>% 
      dplyr::select(dplyr::all_of(colnames(dm_dpeatdecomposition$samples))),
    data = 
      d %>% 
      dplyr::select(dplyr::all_of(colnames(dm_dpeatdecomposition$data))),
    samples_to_samples = 
      samples_to_samples %>% 
      dplyr::select(dplyr::all_of(colnames(dm_dpeatdecomposition$samples_to_samples))),
    citations_to_datasets = 
      citations_to_datasets %>% 
      dplyr::select(dplyr::all_of(colnames(dm_dpeatdecomposition$citations_to_datasets))),
    experimental_design_format = 
      experimental_design_format %>% 
      dplyr::select(dplyr::all_of(colnames(dm_dpeatdecomposition$experimental_design_format)))
  )

# check whether all column names as present in table attributes
all_column_names <- 
  purrr::map(dm_insert_in, colnames) %>%
  unlist() %>%
  unique()

if(! all(all_column_names %in% (dm_dpeatdecomposition %>% dm::pull_tbl(attributes) %>% dplyr::pull(attribute_name)))) {
  cond <- purrr::map_lgl(all_column_names, function(.x) ! .x %in% (dm_dpeatdecomposition %>% dm::pull_tbl(attributes) %>% dplyr::pull(attribute_name)))
  RMariaDB::dbDisconnect(con)
  stop(paste0("New `attribute_name`s discovered: ", paste(all_column_names[cond], collapse = ", ")))
}

all_data_attributes <- unique(dm_insert_in$data$attribute_name)

if(! all(all_data_attributes %in% (dm_dpeatdecomposition %>% dm::pull_tbl(attributes) %>% dplyr::pull(attribute_name)))) {
  cond <- purrr::map_lgl(all_data_attributes, function(.x) ! .x %in% (dm_dpeatdecomposition %>% dm::pull_tbl(attributes) %>% dplyr::pull(attribute_name)))
  stop(paste0("New `attribute_name`s discovered: ", paste(all_data_attributes[cond], collapse = ", ")))
  RMariaDB::dbDisconnect(con)
}


# filter empty tables
dm_insert_in_check <-
  dm_insert_in[purrr::map_lgl(dm_insert_in, function(x) nrow(x) > 0)] %>%
  dm::as_dm() %>%
  dp_dm_add_keys(dm_dpeatdecomposition)

# copy into dm_pmird
for(i in seq_along(dm_insert_in)) {
  RMariaDB::dbAppendTable(con, name = names(dm_insert_in)[[i]], value = dm_insert_in[[i]])
}

RMariaDB::dbDisconnect(con)

4 Notes