# packages
library(metaDigitise)
library(magrittr)
library(tibble)
library(dplyr)
library(lubridate)
library(dpeatdecomposition)
library(dm)
library(RMariaDB)
Connect to database
# connect to database
con <-
RMariaDB::dbConnect(
drv = RMariaDB::MariaDB(),
dbname = "dpeatdecomposition",
default.file = "~/my.cnf"
)
# get database as dm object
dm_dpeatdecomposition <-
dpeatdecomposition::dp_get_dm(con, learn_keys = TRUE)
Get most current IDs
id_last <-
list(
id_dataset =
dm_dpeatdecomposition %>%
dm::pull_tbl(datasets) %>%
dplyr::pull(id_dataset) %>%
tail(1),
id_sample =
dm_dpeatdecomposition %>%
dm::pull_tbl(samples) %>%
dplyr::pull(id_sample) %>%
tail(1),
id_measurement =
dm_dpeatdecomposition %>%
dm::pull_tbl(data) %>%
dplyr::pull(id_measurement) %>%
tail(1)
) %>%
purrr::map(function(.x) {
if(length(.x) == 0) {
0L
} else {
.x
}
})
Create directories
dir_name <- "d59"
dir_source <- "../raw_data/data/d59"
dir_target <- paste0("../derived_data/", id_last$id_dataset + 1L)
if(!dir.exists(dir_target)) {
dir.create(dir_target)
}
datasets <-
tibble::tibble(
id_dataset = id_last$id_dataset + 1L
)
citations_to_datasets <-
dplyr::bind_rows(
db_template_tables$citations_to_datasets,
tibble::tibble(
id_dataset = datasets$id_dataset,
id_citation = c("Hagemann.2015", "Hagemann.2016")
)
)
# mass remaining
samples3 <-
read.csv(paste0(dir_source, "/raw/2007_290_data_decomposition.csv"), dec = ",", sep = "\t") %>%
dplyr::select(1:26) %>%
dplyr::slice(-c(1:24)) %>%
dplyr::filter(INCUB_METHOD == "Litterbag") %>%
dplyr::rename(
incubation_duration = "INCUB_DAY",
id_site = "SITE",
id_plot = "PLOT",
sample_label = "LABEL",
id_replicate = "REPL",
mass_relative_mass = "RESID_MASS_PERC",
C_relative_mass = "C_CONC",
S_relative_mass = "S_CONC",
N_relative_mass = "N_CONC",
P_relative_mass = "P_CONC",
K_relative_mass = "K_CONC",
Mg_relative_mass = "MG_CONC",
Ca_relative_mass = "CA_CONC",
sample_type2 = "LITTER_TYPE"
) %>%
dplyr::mutate(
id_dataset = datasets$id_dataset[[1]],
sample_type = "litter",
mass_absolute = NA_real_,
mass_relative_mass = mass_relative_mass/100,
sampling_date =
COLLECT_DATE %>%
as.Date(format = "%d.%m.%y"),
incubation_environment = "litterbag",
site_type =
paste0(tolower(STAND_TYPE), " boreal forest"),
experimental_design =
paste0(
as.numeric(as.factor(id_site)), "_",
as.numeric(as.factor(site_type)), "_",
as.numeric(as.factor(id_plot)), "_",
as.numeric(as.factor(id_replicate))
),
sampling_longitude = -65.21613047049127, #---note:by converting data from Hagemann.2009 here (http://rcn.montana.edu/Resources/Converter.aspx) to lon lat
sampling_latitude = 53.53519486917857,
comments_samples = "Coordinates are only the approximate location of the site, but not the exact sampling point. `sample_depth_upper`, `sample_depth_lower`: guessed from description.",
mesh_size_absolute = 1,
is_incubated = TRUE,
sampling_year = lubridate::year(sampling_date),
sampling_month = lubridate::month(sampling_date),
sampling_day = lubridate::day(sampling_date),
site_name = "Highboreal ForesteLake Melville Ecoregion",
sample_depth_lower = 5, #---note: guessed from description
sample_depth_upper = 5,
sample_type2 = tolower(sample_type2),
taxon_rank_value =
dplyr::case_when(
sample_type2 == "alder" ~ "Alnus rugosa", # species
sample_type2 == "feathermoss" ~ "Hypnales", # order
sample_type2 == "peatmoss" ~ "Sphagnum", # genus
sample_type2 == "lichen" ~ NA_character_
),
taxon_rank_name =
dplyr::case_when(
sample_type2 == "alder" ~ "species",
sample_type2 == "feathermoss" ~ "order",
sample_type2 == "peatmoss" ~ "genus",
sample_type2 == "lichen" ~ NA_character_
),
taxon_organ =
dplyr::case_when(
sample_type2 == "alder" ~ "leaves",
sample_type2 == "feathermoss" ~ "whole plant",
sample_type2 == "peatmoss" ~ "whole plant",
sample_type2 == "lichen" ~ "whole tallus"
),
sample_type2 =
dplyr::case_when(
sample_type2 == "alder" ~ "Alnus rugosa leaves",
sample_type2 == "feathermoss" ~ "Pleurozium, Ptilium whole plants",
sample_type2 == "peatmoss" ~ "Sphagnum capillifolium, Sphagnum russowii whole plants",
sample_type2 == "lichen" ~ "Nephroma arcticum (L.) Torss. and Peltigera aphthosa (L.) Willd. whole tallus"
),
sample_treatment = "control",
dplyr::across(
dplyr::ends_with("_relative_mass") & ! dplyr::ends_with("mass_relative_mass"),
function(.x) NA_real_,
.names = "{.col}_absolute"
)
) %>%
dplyr::rename_with(
.cols = dplyr::ends_with("_relative_mass_absolute"),
.fn = function(.x) {
stringr::str_replace(.x, "_relative_mass_absolute", "_absolute")
}
)
d59_litter_chemistry_1 <-
samples3 %>%
dplyr::filter(incubation_duration == 0.0) %>%
dplyr::group_by(taxon_rank_value, sample_type2) %>%
dplyr::summarize(
dplyr::across(
dplyr::ends_with("_relative_mass"),
sd, na.rm = TRUE, .names = "{.col}_error"
),
dplyr::across(
dplyr::ends_with("_relative_mass"),
function(.x) sum(!is.na(.x)),
.names = "{.col}_sample_size"
),
dplyr::across(
dplyr::ends_with("_relative_mass"),
mean, na.rm = TRUE
),
.groups = "drop"
) %>%
dplyr::mutate(
dplyr::across(
dplyr::ends_with("_relative_mass"),
function(.x) .x/100
)
)
samples3 <-
samples3 %>%
dplyr::filter(incubation_duration != 0.0)
# initial mass
samples2 <-
samples3 %>%
dplyr::filter(incubation_duration %in% c(42, 43)) %>%
dplyr::select(-sampling_date) %>%
dplyr::mutate(
sampling_date =
INSTALL_DATE %>%
as.Date(format = "%d.%m.%y"),
sampling_year = lubridate::year(sampling_date),
sampling_month = lubridate::month(sampling_date),
sampling_day = lubridate::day(sampling_date),
incubation_duration = 0,
mass_relative_mass = 1.0,
sample_label = NA_character_
) %>%
dplyr::select(! dplyr::any_of(setdiff(colnames(d59_litter_chemistry_1), c("taxon_rank_value", "sample_type2")))) %>%
dplyr::left_join(
d59_litter_chemistry_1,
by = c("taxon_rank_value", "sample_type2")
)
# litter collection
samples1 <-
samples2 %>%
dplyr::filter(!duplicated(paste0(taxon_rank_value, "_", sample_type2))) %>%
dplyr::mutate(
id_sample = seq_len(nrow(.)) + id_last$id_sample,
id_sample_origin = id_sample,
id_sample_parent = id_sample,
id_sample_incubation_start = NA_integer_,
is_incubated = FALSE,
sample_type = "vegetation", #---note: assumed based on description
site_type = "old-growth boreal forest",
id_site = NA_integer_,
id_plot = NA_integer_,
id_replicate = NA_integer_,
sampling_year = NA_real_,
sampling_month = NA_real_,
sampling_day = NA_real_,
incubation_environment = NA_character_,
experimental_design =
paste0(
as.numeric(as.factor(id_site)), "_",
as.numeric(as.factor(site_type)), "_",
as.numeric(as.factor(id_plot)), "_",
as.numeric(as.factor(id_replicate))
),
sample_depth_upper = 0, #---note: based on text description (I assume that live material is the upper two cm)
sample_depth_lower = 2
)
# add missing ids
samples2 <-
dplyr::bind_rows(
samples2 %>%
dplyr::mutate(
type = "samples2"
),
samples3 %>%
dplyr::mutate(
type = "samples3",
id_plot =
dplyr::case_when( #---note: correct one erroneous id_plot
sample_type2 == "Alnus rugosa leaves" & experimental_design == "4_3_2_1" & sample_label == "C4- 3A" ~ 3L,
sample_type2 == "Sphagnum capillifolium, Sphagnum russowii whole plants" & experimental_design == "4_3_3_1" & sample_label == "C4- 2P" ~ 2L,
TRUE ~ id_plot
),
experimental_design =
dplyr::case_when(
sample_type2 == "Alnus rugosa leaves" & experimental_design == "4_3_2_1" & sample_label == "C4- 3A" ~ "4_3_3_1",
sample_type2 == "Sphagnum capillifolium, Sphagnum russowii whole plants" & experimental_design == "4_3_3_1" & sample_label == "C4- 2P" ~ "4_3_2_1",
TRUE ~ experimental_design
)
)
)
samples2 <-
samples2 %>%
dplyr::mutate(
id_sample = seq_len(nrow(.)) + max(samples1$id_sample),
id_sample_origin =
dplyr::left_join(
samples2 %>%
dplyr::select(sample_type2, taxon_rank_value),
samples1 %>% dplyr::select(sample_type2, taxon_rank_value, id_sample),
by = c("sample_type2", "taxon_rank_value")
) %>%
dplyr::pull(id_sample),
id_sample_incubation_start =
purrr::map_int(seq_len(nrow(.)), function(i) {
index <- paste0(sample_type2, "_", taxon_rank_value, "_", site_name, "_", taxon_organ, "_", sample_depth_upper) == paste0(sample_type2, "_", taxon_rank_value, "_", site_name, "_", taxon_organ, "_", sample_depth_upper)[[i]] & experimental_design == experimental_design[[i]] & incubation_duration == 0.0
id_sample[index]
}),
id_sample_parent =
purrr::map_int(seq_len(nrow(.)), function(i) {
index <- paste0(sample_type2, "_", taxon_rank_value, "_", site_name, "_", taxon_organ, "_", sample_depth_upper) == paste0(sample_type2, "_", taxon_rank_value, "_", site_name, "_", taxon_organ, "_", sample_depth_upper)[[i]] & experimental_design == experimental_design[[i]] & incubation_duration < (incubation_duration[[i]] - 10)
if(! any(index)) {
id_sample_origin[[i]]
} else {
target_incubation_duration <- max(incubation_duration[index])
index <- index & incubation_duration == target_incubation_duration
id_sample[index]
}
})
)
## combine
samples <-
dplyr::bind_rows(
db_template_tables$samples,
samples1 %>%
dplyr::mutate(
type = "samples1"
),
samples2
)
samples_to_samples <-
samples %>%
dplyr::filter(! id_sample %in% id_sample_origin) %>%
dplyr::mutate(
transition_description =
dplyr::case_when(
type %in% c("samples2") ~ "translocate",
type %in% c("samples3") ~ "wait",
TRUE ~ NA_character_
)
) %>%
dplyr::select(id_sample_parent, id_sample, transition_description) %>%
dplyr::rename(
id_sample_child = "id_sample"
)
d2 <-
samples2 %>%
tidyr::pivot_longer(
cols = dplyr::any_of(c("mass_absolute", "mass_relative_mass", "mesh_size_absolute", paste0(PeriodicTable:::periodicTable$symb, "_relative_mass"), paste0(PeriodicTable:::periodicTable$symb, "_absolute"))),
names_to = "attribute_name",
values_to = "value"
) %>%
dplyr::mutate(
id_measurement = seq_len(nrow(.)) + id_last$id_measurement,
id_measurement_numerator =
purrr::map_int(seq_len(nrow(.)), function(i) {
if(attribute_name[[i]] == "mass_relative_mass") {
id_measurement[id_sample == id_sample[[i]] & attribute_name == "mass_absolute"]
} else if (stringr::str_detect(attribute_name[[i]], pattern = "_relative_mass$")) {
id_measurement[id_sample == id_sample[[i]] & attribute_name == stringr::str_replace(attribute_name[[i]], "_relative_mass2?", "_absolute")]
} else {
NA_integer_
}
}),
id_measurement_denominator =
purrr::map_int(seq_len(nrow(.)), function(i) {
if(attribute_name[[i]] == "mass_relative_mass") {
id_measurement[id_sample == id_sample_incubation_start[[i]] & attribute_name == "mass_absolute"]
} else if (stringr::str_detect(attribute_name[[i]], pattern = "_relative_mass$")) {
id_measurement[id_sample == id_sample[[i]] & attribute_name == "mass_absolute"]
} else {
NA_integer_
}
}),
value_type =
dplyr::case_when(
incubation_duration == 0 & ! attribute_name %in% c("mass_relative_mass", "mass_absolute") ~ "mean",
TRUE ~ "point"
),
error_type =
dplyr::case_when(
incubation_duration == 0.0 & value_type == "mean" ~ "sd",
TRUE ~ NA_character_
)
)
d2_sample_size <-
samples2 %>%
tidyr::pivot_longer(
cols = dplyr::ends_with("_sample_size"),
names_to = "attribute_name",
values_to = "sample_size"
) %>%
dplyr::mutate(
attribute_name =
attribute_name %>%
stringr::str_remove(pattern = "_sample_size$")
) %>%
dplyr::select(id_sample, attribute_name, sample_size)
d2_error <-
samples2 %>%
tidyr::pivot_longer(
cols = dplyr::ends_with(c("_error")),
names_to = "attribute_name",
values_to = "error"
) %>%
dplyr::mutate(
attribute_name =
attribute_name %>%
stringr::str_remove(pattern = "_error$")
) %>%
dplyr::select(id_sample, attribute_name, error)
d2 <-
d2 %>%
dplyr::mutate(
error =
dplyr::left_join(d2, d2_error, by = c("id_sample", "attribute_name")) %>%
dplyr::pull(error),
sample_size =
dplyr::left_join(d2, d2_sample_size, by = c("id_sample", "attribute_name")) %>%
dplyr::pull(sample_size)
)
# combine
d <-
dplyr::bind_rows(
db_template_tables$data,
d2
) %>%
dplyr::select(dplyr::all_of(colnames(db_template_tables$data)))
experimental_design_format <-
tibble::tibble(
id_dataset = datasets$id_dataset,
file = paste0(id_last$id_dataset + 1L, "/experimental_design_format.csv"),
experimental_design_description = "`id_site`: Integer value. One of six sites with unknown names and unknown exact location. `site_type`: Character value: 'old-growth boreal forest': not harvested since at least 146 years. 'recently harvested boreal forest': Harvested three years ago. `id_plot`: An identifier for the plot (see the article for details). `id_replicate`: An identifier for the replicate (see the article for details)."
)
# csv file to export
experimental_design_format2 <-
samples %>%
dplyr::filter(! is.na(experimental_design)) %>%
dplyr::filter(! duplicated(experimental_design)) %>%
dplyr::select(experimental_design, id_site, site_type, sample_treatment, id_plot, id_replicate)
# export
write.csv(experimental_design_format2, paste0(dir_target, "/experimental_design_format.csv"), row.names = FALSE)
# list all tables
dm_insert_in <-
list(
datasets =
datasets %>%
dplyr::select(dplyr::all_of(colnames(dm_dpeatdecomposition$datasets))),
samples =
samples %>%
dplyr::select(dplyr::all_of(colnames(dm_dpeatdecomposition$samples))),
data =
d %>%
dplyr::select(dplyr::all_of(colnames(dm_dpeatdecomposition$data))),
samples_to_samples =
samples_to_samples %>%
dplyr::select(dplyr::all_of(colnames(dm_dpeatdecomposition$samples_to_samples))),
citations_to_datasets =
citations_to_datasets %>%
dplyr::select(dplyr::all_of(colnames(dm_dpeatdecomposition$citations_to_datasets))),
experimental_design_format =
experimental_design_format %>%
dplyr::select(dplyr::all_of(colnames(dm_dpeatdecomposition$experimental_design_format)))
)
# check whether all column names as present in table attributes
all_column_names <-
purrr::map(dm_insert_in, colnames) %>%
unlist() %>%
unique()
if(! all(all_column_names %in% (dm_dpeatdecomposition %>% dm::pull_tbl(attributes) %>% dplyr::pull(attribute_name)))) {
cond <- purrr::map_lgl(all_column_names, function(.x) ! .x %in% (dm_dpeatdecomposition %>% dm::pull_tbl(attributes) %>% dplyr::pull(attribute_name)))
RMariaDB::dbDisconnect(con)
stop(paste0("New `attribute_name`s discovered: ", paste(all_column_names[cond], collapse = ", ")))
}
all_data_attributes <- unique(dm_insert_in$data$attribute_name)
if(! all(all_data_attributes %in% (dm_dpeatdecomposition %>% dm::pull_tbl(attributes) %>% dplyr::pull(attribute_name)))) {
cond <- purrr::map_lgl(all_data_attributes, function(.x) ! .x %in% (dm_dpeatdecomposition %>% dm::pull_tbl(attributes) %>% dplyr::pull(attribute_name)))
stop(paste0("New `attribute_name`s discovered: ", paste(all_data_attributes[cond], collapse = ", ")))
RMariaDB::dbDisconnect(con)
}
# filter empty tables
dm_insert_in_check <-
dm_insert_in[purrr::map_lgl(dm_insert_in, function(x) nrow(x) > 0)] %>%
dm::as_dm() %>%
dp_dm_add_keys(dm_dpeatdecomposition)
# copy into dm_pmird
for(i in seq_along(dm_insert_in)) {
RMariaDB::dbAppendTable(con, name = names(dm_insert_in)[[i]], value = dm_insert_in[[i]])
}
RMariaDB::dbDisconnect(con)