# packages
library(metaDigitise)
library(magrittr)
library(tibble)
library(dplyr)
library(lubridate)
library(dpeatdecomposition)
library(dm)
library(RMariaDB)
Connect to database
# connect to database
con <-
RMariaDB::dbConnect(
drv = RMariaDB::MariaDB(),
dbname = "dpeatdecomposition",
default.file = "~/my.cnf"
)
# get database as dm object
dm_dpeatdecomposition <-
dpeatdecomposition::dp_get_dm(con, learn_keys = TRUE)
Get most current IDs
id_last <-
list(
id_dataset =
dm_dpeatdecomposition %>%
dm::pull_tbl(datasets) %>%
dplyr::pull(id_dataset) %>%
tail(1),
id_sample =
dm_dpeatdecomposition %>%
dm::pull_tbl(samples) %>%
dplyr::pull(id_sample) %>%
tail(1),
id_measurement =
dm_dpeatdecomposition %>%
dm::pull_tbl(data) %>%
dplyr::pull(id_measurement) %>%
tail(1)
) %>%
purrr::map(function(.x) {
if(length(.x) == 0) {
0L
} else {
.x
}
})
Create directories
dir_name <- "d19"
dir_source <- "../raw_data/data/d19"
dir_target <- paste0("../derived_data/", id_last$id_dataset + 1L)
if(!dir.exists(dir_target)) {
dir.create(dir_target)
}
datasets <-
tibble::tibble(
id_dataset = id_last$id_dataset + 1L
)
citations_to_datasets <-
dplyr::bind_rows(
db_template_tables$citations_to_datasets,
tibble::tibble(
id_dataset = datasets$id_dataset,
id_citation = c("Szumigalski.1996")
)
)
## mass remaining
samples3 <-
dplyr::bind_rows(
readRDS(paste0(dir_source, "/raw/caldat/Szumigalski.1996-Fig2a"))$processed_data,
readRDS(paste0(dir_source, "/raw/caldat/Szumigalski.1996-Fig2b"))$processed_data,
readRDS(paste0(dir_source, "/raw/caldat/Szumigalski.1996-Fig2c"))$processed_data,
readRDS(paste0(dir_source, "/raw/caldat/Szumigalski.1996-Fig2d"))$processed_data,
readRDS(paste0(dir_source, "/raw/caldat/Szumigalski.1996-Fig2e"))$processed_data,
readRDS(paste0(dir_source, "/raw/caldat/Szumigalski.1996-Fig3"))$processed_data
) %>%
dplyr::mutate(
id_dataset = datasets$id_dataset[[1]],
id =
dplyr::case_when( #---note: manually correct an erroneous label
id == "siteopenrichfen_speciesDrepanocladus_vernicosus" ~ "siteopenrichfen_speciesDrepanocladus_vernicosus_year2",
id == "siteopenrichfen_speciesCraex_lasiocarpa_year1" ~ "siteopenrichfen_speciesCarex_lasiocarpa_year1",
TRUE ~ id
),
site_label =
id %>%
stringr::str_extract(pattern = "site[:alpha:]+") %>%
stringr::str_remove(pattern = "^site"),
site_name = site_label,
taxon_rank_value =
id %>%
stringr::str_extract(pattern = "species[:alpha:]+_*[:alpha:]*_year") %>%
stringr::str_remove(pattern = "^species") %>%
stringr::str_remove(pattern = "_year$") %>%
stringr::str_replace_all(pattern = "_", replacement = " "),
taxon_rank_value =
dplyr::case_when(
taxon_rank_value == "Carex" & ! stringr::str_detect(id, "sitesedgefen_speciesCarex") ~ "Carex spec.",
taxon_rank_value == "Carex" & stringr::str_detect(id, "sitesedgefen_speciesCarex") ~ "Carex lasiocarpa",
taxon_rank_value == "Tomenthypnum nitens" ~ "Tomentypnum nitens",
TRUE ~ taxon_rank_value
),
taxon_rank_name =
dplyr::case_when(
taxon_rank_value == "Carex spec." ~ "genus",
TRUE ~ "species"
),
incubation_duration =
dplyr::case_when(
stringr::str_detect(id, "year1") ~ 365,
stringr::str_detect(id, "year2") ~ 365 * 2
),
sampling_date =
dplyr::case_when(
stringr::str_detect(id, "year1") ~ "1991-10-01",
stringr::str_detect(id, "year2") ~ "1992-10-01"
) %>%
as.Date(),
sampling_year = lubridate::year(sampling_date),
sampling_month = lubridate::month(sampling_date),
sampling_latitude =
dplyr::case_when(
site_label %in% c("bog", "poorfen") ~ "54°41'N",
site_label %in% c("openrichfen", "woodedrichfen") ~ "54°28'N",
site_label %in% c("sedgefen") ~ "54°28'N",
) %>%
sp::char2dms(chd = "°", chm = "'", chs = "''") %>%
as.numeric(),
sampling_longitude =
dplyr::case_when(
site_label %in% c("bog", "poorfen") ~ "113°28'W",
site_label %in% c("openrichfen", "woodedrichfen") ~ "113°17'W",
site_label %in% c("sedgefen") ~ "113°20'W",
) %>%
sp::char2dms(chd = "°", chm = "'", chs = "''") %>%
as.numeric(),
mass_relative_mass = mean/100,
mass_absolute = NA_real_,
mesh_size_absolute = 1,
mass_remaining_error = error/100,
mass_remaining_sample_size = 5, #---note: the text says sample sizes were either 5 or 10, but it is not clear what number applies. I use 5 as conservative estimate here.
mass_remaining_error_type = "se",
is_incubated = TRUE,
incubation_environment = "peat",
sample_treatment = "control",
sample_type = "litter",
sample_type2 =
dplyr::case_when(
taxon_rank_value == "Betula pumila" ~ "Upper 15 cm of branches with leaves",
TRUE ~ NA_character_
),
taxon_organ =
dplyr::case_when(
taxon_rank_value %in% c("Carex spec.", "Betula pumila", "Carex lasiocarpa") ~ "aboveground",
TRUE ~ "whole plant"
),
experimental_design =
site_label %>%
as.factor() %>%
as.numeric() %>%
as.character(),
sample_depth_upper = 2,
sample_depth_lower = 2,
comments_samples =
"Coordinates are only the approximate location of the study site, but not sampling points. `sample_depth_upper`, `sample_depth_lower`: Approximate position based on text description."
) %>%
dplyr::filter(
! stringr::str_detect(id, pattern = "sitesedgefen_speciesCarex_y") #---note: I assume that this is the same as the data for Carex lasiocarpa since there are data on C and N content only for C. lasiocarpa for this site.
)
# initial masses
samples2 <-
samples3 %>%
dplyr::filter(!duplicated(paste0(site_label, "_", taxon_rank_value))) %>%
dplyr::mutate(
mass_relative_mass = 1.0,
mass_remaining_error = 0.0,
incubation_duration = 0.0,
sampling_year = 1990,
sampling_month = 10,
sampling_date = as.Date("1990-10-01")
)
# litter collection
samples1 <-
samples2 %>%
dplyr::mutate(
site_label =
dplyr::case_when(
taxon_rank_value == "Carex lasiocarpa" ~ "sedgefen",
TRUE ~ site_label
),
site_name = site_label,
sampling_latitude =
dplyr::case_when(
site_label %in% c("bog", "poorfen") ~ "54°41'N",
site_label %in% c("openrichfen", "woodedrichfen") ~ "54°28'N",
site_label %in% c("sedgefen") ~ "54°28'N",
) %>%
sp::char2dms(chd = "°", chm = "'", chs = "''") %>%
as.numeric(),
sampling_longitude =
dplyr::case_when(
site_label %in% c("bog", "poorfen") ~ "113°28'W",
site_label %in% c("openrichfen", "woodedrichfen") ~ "113°17'W",
site_label %in% c("sedgefen") ~ "113°20'W",
) %>%
sp::char2dms(chd = "°", chm = "'", chs = "''") %>%
as.numeric(),
experimental_design =
site_name %>%
as.factor() %>%
as.numeric() %>%
as.character(),
sample_type =
dplyr::case_when(
taxon_rank_value %in% c("Carex spec.", "Betula pumila", "Carex lasiocarpa") ~ "litter",
TRUE ~ "vegetation"
),
sample_depth_upper =
dplyr::case_when(
taxon_rank_value %in% c("Carex spec.", "Betula pumila", "Carex lasiocarpa") ~ NA_real_,
TRUE ~ 0
),
sample_depth_lower =
dplyr::case_when(
taxon_rank_value %in% c("Carex spec.", "Betula pumila", "Carex lasiocarpa") ~ NA_real_,
TRUE ~ 5 #---note: assumed
),
sampling_year = 1990,
sampling_month = 9,
sampling_day = 16,
is_incubated = FALSE,
incubation_duration = 0.0,
incubation_environment = NA_character_,
comments_samples = "Coordinates are only the approximate location of the study site, but not sampling points. `sample_depth_upper`, `sample_depth_lower`: Assumed sampling depths."
) %>%
dplyr::filter(!duplicated(paste0(site_label, "_", taxon_rank_value))) %>%
dplyr::mutate(
id_sample = seq_len(nrow(.)) + id_last$id_sample,
id_sample_parent = id_sample,
id_sample_origin = id_sample,
id_sample_incubation_start = id_sample
)
# add missing ids
samples2 <-
dplyr::bind_rows(
samples2 %>%
dplyr::mutate(
type = "samples2"
),
samples3 %>%
dplyr::mutate(
type = "samples3"
)
) %>%
dplyr::select(-error)
samples2 <-
samples2 %>%
dplyr::mutate(
id_sample = seq_len(nrow(.)) + max(samples1$id_sample),
id_sample_origin =
dplyr::left_join(
samples2 %>% dplyr::select(site_label, taxon_rank_value),
samples1 %>% dplyr::select(site_label, taxon_rank_value, id_sample),
by = c("site_label", "taxon_rank_value")
) %>%
dplyr::pull(id_sample),
id_sample_origin =
dplyr::case_when(
taxon_rank_value == "Carex lasiocarpa" ~ rep(id_sample_origin[taxon_rank_value == "Carex lasiocarpa" & site_label == "sedgefen"][[1]], nrow(samples2)),
TRUE ~ id_sample_origin
),
id_sample_incubation_start =
purrr::map_int(seq_len(nrow(.)), function(i) {
index <- paste0(taxon_rank_value, "_", site_label) == paste0(taxon_rank_value, "_", site_label)[[i]] & experimental_design == experimental_design[[i]] & incubation_duration == 0
id_sample[index]
}),
id_sample_parent =
purrr::map_int(seq_len(nrow(.)), function(i) {
index <- paste0(taxon_rank_value, "_", site_label) == paste0(taxon_rank_value, "_", site_label)[[i]] & experimental_design == experimental_design[[i]] & incubation_duration < incubation_duration[[i]]
if(! any(index)) {
id_sample_origin[[i]]
} else {
target_incubation_duration <- max(incubation_duration[index])
index <- index & incubation_duration == target_incubation_duration
id_sample[index]
}
})
)
# average annual water table depth
samples4 <-
readRDS(paste0(dir_source, "/raw/caldat/Szumigalski.1996-Fig4"))$processed_data %>%
dplyr::select(id, x) %>%
dplyr::rename(
water_table_depth = "x",
site_label = "id"
) %>%
dplyr::left_join(
samples1 %>%
dplyr::filter(!duplicated(site_label)) %>%
dplyr::select(dplyr::any_of(c("site_label", colnames(db_template_tables$samples)))) %>%
dplyr::select(-comments_samples),
by = c("site_label")
) %>%
dplyr::mutate(
site_name = site_label,
water_table_depth = water_table_depth * (-1),
id_sample = seq_len(nrow(.)) + max(samples2$id_sample),
id_sample_origin = id_sample,
id_sample_parent = id_sample,
sampling_year = NA_real_,
sampling_month = NA_real_,
sampling_day = NA_real_,
sample_type = "peat",
sample_type2 = NA_character_,
taxon_rank_name = NA_character_,
taxon_rank_value = NA_character_,
taxon_organ = NA_character_,
is_incubated = FALSE,
incubation_duration = 0.0,
sample_depth_upper = 0,
sample_depth_lower = 0,
comments_samples = "Coordinates are only the approximate location of the study site, but not sampling points."
)
# Tab. 1
d19_C_N <-
readODS::read_ods(paste0(dir_source, "/derived/Szumigalski.1996-Tab1.ods")) %>%
dplyr::rename(
C_to_N = "CN"
) %>%
dplyr::mutate(
N_error = N_error/100,
N_relative_mass = N/100,
C_relative_mass = NA_real_,
C_absolute = NA_real_,
N_absolute = NA_real_,
comments_samples = "Coordinates are only the approximate location of the study site, but not sampling points. Only year and month of sample collection are known."
) %>%
dplyr::select(site_label, taxon_rank_value, sampling_date, C_to_N, CN_error, N_relative_mass, N_error, C_relative_mass, C_absolute, N_absolute)
samples2 <-
dplyr::left_join(
samples2,
d19_C_N,
by = c("site_label", "taxon_rank_value", "sampling_date")
)
## combine
samples <-
dplyr::bind_rows(
db_template_tables$samples,
samples1 %>%
dplyr::mutate(
type = "samples1"
),
samples2,
samples4 %>%
dplyr::mutate(
type = "samples4"
)
)
samples_to_samples <-
samples %>%
dplyr::filter(! id_sample %in% id_sample_origin) %>%
dplyr::mutate(
transition_description =
dplyr::case_when(
type %in% c("samples2") ~ "translocate",
type %in% c("samples3") ~ "wait",
TRUE ~ NA_character_
)
) %>%
dplyr::select(id_sample_parent, id_sample, transition_description) %>%
dplyr::rename(
id_sample_child = "id_sample"
)
d2 <-
samples2 %>%
tidyr::pivot_longer(
cols = dplyr::all_of(c("mass_absolute", "mass_relative_mass", "mesh_size_absolute", "C_to_N", "N_relative_mass", "C_relative_mass", "C_absolute", "N_absolute")),
names_to = "attribute_name",
values_to = "value"
) %>%
dplyr::mutate(
id_measurement = seq_len(nrow(.)) + id_last$id_measurement,
id_measurement_numerator =
purrr::map_int(seq_len(nrow(.)), function(i) {
switch(
attribute_name[[i]],
"C_to_N" = {
id_measurement[id_sample == id_sample[[i]] & attribute_name == "C_relative_mass"]
},
"N_relative_mass" = {
id_measurement[id_sample == id_sample[[i]] & attribute_name == "N_absolute"]
},
"C_relative_mass" = {
id_measurement[id_sample == id_sample[[i]] & attribute_name == "C_absolute"]
},
"mass_relative_mass" = {
id_measurement[id_sample == id_sample[[i]] & attribute_name == "mass_absolute"]
},
NA_integer_
)
}),
id_measurement_denominator =
purrr::map_int(seq_len(nrow(.)), function(i) {
switch(
attribute_name[[i]],
"C_to_N" = {
id_measurement[id_sample == id_sample[[i]] & attribute_name == "N_relative_mass"]
},
"N_relative_mass" = ,
"C_relative_mass" = {
id_measurement[id_sample == id_sample[[i]] & attribute_name == "mass_absolute"]
},
"mass_relative_mass" = {
id_measurement[id_sample == id_sample_incubation_start[[i]] & attribute_name == "mass_absolute"]
},
NA_integer_
)
}),
value_type =
dplyr::case_when(
attribute_name == "mesh_size_absolute" ~ "point",
TRUE ~ "mean"
),
error_type =
dplyr::case_when(
attribute_name %in% c("mesh_size_absolute", "mass_absolute") ~ NA_character_,
TRUE ~ "se"
),
sample_size =
dplyr::case_when(
attribute_name %in% c("mesh_size_absolute", "C_absolute", "N_absolute", "C_relative_mass") ~ NA_integer_,
attribute_name %in% c("C_to_N", "N_relative_mass") ~ 3L,
TRUE ~ 5L
),
comments_measurements =
dplyr::case_when(
attribute_name %in% c("mesh_size_absolute", "C_absolute", "N_absolute", "C_relative_mass", "C_to_N", "N_relative_mass") ~ NA_character_,
TRUE ~ "The text says sample sizes were either 5 or 10, but it is not clear what number applies. I use 5 as conservative estimate here."
)
)
d2_error <-
samples2 %>%
tidyr::pivot_longer(
cols = dplyr::all_of(c("CN_error", "N_error", "mass_remaining_error")),
names_to = "attribute_name",
values_to = "error"
) %>%
dplyr::mutate(
attribute_name =
dplyr::case_when(
attribute_name == "mass_remaining_error" ~ "mass_relative_mass",
attribute_name == "N_error" ~ "N_relative_mass",
attribute_name == "CN_error" ~ "C_to_N"
)
) %>%
dplyr::select(id_sample, attribute_name, error)
d2 <-
d2 %>%
dplyr::mutate(
error =
dplyr::left_join(d2, d2_error, by = c("id_sample", "attribute_name")) %>%
dplyr::pull(error)
)
d4 <-
samples4 %>%
tidyr::pivot_longer(
cols = dplyr::all_of(c("water_table_depth")),
names_to = "attribute_name",
values_to = "value"
) %>%
dplyr::mutate(
id_measurement = seq_len(nrow(.)) + max(d2$id_measurement),
value_type = "mean",
sample_size = NA_integer_,
comments_measurements = "Water table depths were measured across the ice free time. The text does not mention the exact interval over which the water table depth was measured. It also does not mention whether the water table depth was measured at the same location where litter bags were incubated."
)
# combine
d <-
dplyr::bind_rows(
db_template_tables$data,
d2,
d4
) %>%
dplyr::select(dplyr::all_of(colnames(db_template_tables$data)))
experimental_design_format <-
tibble::tibble(
id_dataset = datasets$id_dataset,
file = paste0(id_last$id_dataset + 1L, "/experimental_design_format.csv"),
experimental_design_description = "`site_name`: Name of the site."
)
# csv file to export
experimental_design_format2 <-
samples %>%
dplyr::filter(! is.na(experimental_design)) %>%
dplyr::filter(! duplicated(experimental_design)) %>%
dplyr::select(experimental_design, site_name)
# export
write.csv(experimental_design_format2, paste0(dir_target, "/experimental_design_format.csv"), row.names = FALSE)
# list all tables
dm_insert_in <-
list(
datasets =
datasets %>%
dplyr::select(dplyr::all_of(colnames(dm_dpeatdecomposition$datasets))),
samples =
samples %>%
dplyr::select(dplyr::all_of(colnames(dm_dpeatdecomposition$samples))),
data =
d %>%
dplyr::select(dplyr::all_of(colnames(dm_dpeatdecomposition$data))),
samples_to_samples =
samples_to_samples %>%
dplyr::select(dplyr::all_of(colnames(dm_dpeatdecomposition$samples_to_samples))),
citations_to_datasets =
citations_to_datasets %>%
dplyr::select(dplyr::all_of(colnames(dm_dpeatdecomposition$citations_to_datasets))),
experimental_design_format =
experimental_design_format %>%
dplyr::select(dplyr::all_of(colnames(dm_dpeatdecomposition$experimental_design_format)))
)
# check whether all column names as present in table attributes
all_column_names <-
purrr::map(dm_insert_in, colnames) %>%
unlist() %>%
unique()
if(! all(all_column_names %in% (dm_dpeatdecomposition %>% dm::pull_tbl(attributes) %>% dplyr::pull(attribute_name)))) {
cond <- purrr::map_lgl(all_column_names, function(.x) ! .x %in% (dm_dpeatdecomposition %>% dm::pull_tbl(attributes) %>% dplyr::pull(attribute_name)))
RMariaDB::dbDisconnect(con)
stop(paste0("New `attribute_name`s discovered: ", paste(all_column_names[cond], collapse = ", ")))
}
all_data_attributes <- unique(dm_insert_in$data$attribute_name)
if(! all(all_data_attributes %in% (dm_dpeatdecomposition %>% dm::pull_tbl(attributes) %>% dplyr::pull(attribute_name)))) {
RMariaDB::dbDisconnect(con)
cond <- purrr::map_lgl(all_data_attributes, function(.x) ! .x %in% (dm_dpeatdecomposition %>% dm::pull_tbl(attributes) %>% dplyr::pull(attribute_name)))
stop(paste0("New `attribute_name`s discovered: ", paste(all_data_attributes[cond], collapse = ", ")))
}
# filter empty tables
dm_insert_in_check <-
dm_insert_in[purrr::map_lgl(dm_insert_in, function(x) nrow(x) > 0)] %>%
dm::as_dm() %>%
dp_dm_add_keys(dm_dpeatdecomposition)
# copy into dm_pmird
for(i in seq_along(dm_insert_in)) {
RMariaDB::dbAppendTable(con, name = names(dm_insert_in)[[i]], value = dm_insert_in[[i]])
}
RMariaDB::dbDisconnect(con)