# packages
library(metaDigitise)
library(magrittr)
library(tibble)
library(dplyr)
library(lubridate)
library(dpeatdecomposition)
library(dm)
library(RMariaDB)
Connect to database
# connect to database
con <-
RMariaDB::dbConnect(
drv = RMariaDB::MariaDB(),
dbname = "dpeatdecomposition",
default.file = "~/my.cnf"
)
# get database as dm object
dm_dpeatdecomposition <-
dpeatdecomposition::dp_get_dm(con, learn_keys = TRUE)
Get most current IDs
id_last <-
list(
id_dataset =
dm_dpeatdecomposition %>%
dm::pull_tbl(datasets) %>%
dplyr::pull(id_dataset) %>%
tail(1),
id_sample =
dm_dpeatdecomposition %>%
dm::pull_tbl(samples) %>%
dplyr::pull(id_sample) %>%
tail(1),
id_measurement =
dm_dpeatdecomposition %>%
dm::pull_tbl(data) %>%
dplyr::pull(id_measurement) %>%
tail(1)
) %>%
purrr::map(function(.x) {
if(length(.x) == 0) {
0L
} else {
.x
}
})
Create directories
dir_name <- "d61"
dir_source <- "../raw_data/data/d61"
dir_target <- paste0("../derived_data/", id_last$id_dataset + 1L)
if(!dir.exists(dir_target)) {
dir.create(dir_target)
}
datasets <-
tibble::tibble(
id_dataset = id_last$id_dataset + 1L
)
citations_to_datasets <-
dplyr::bind_rows(
db_template_tables$citations_to_datasets,
tibble::tibble(
id_dataset = datasets$id_dataset,
id_citation = c("Piatkowski.2021a", "Piatkowski.2021")
)
)
# mass remaining
samples2 <-
read.table(paste0(dir_source, "/raw/mass_loss_data.tsv"), header = TRUE, sep = "\t", skip = 2L) %>%
setNames(nm = c("id_bag", "taxon_rank_value", "mass_absolute_0", "mass_absolute")) %>%
dplyr::mutate(
name_collector =
dplyr::case_when(
stringr::str_detect(taxon_rank_value, "\\(") ~ taxon_rank_value %>% stringr::str_extract(pattern = "\\(.+\\)$") %>% stringr::str_extract(pattern = "[A-Za-z]+,") %>% stringr::str_remove(pattern = ",$"),
TRUE ~ NA_character_
),
label_collection =
dplyr::case_when(
stringr::str_detect(taxon_rank_value, "\\(") ~ taxon_rank_value %>% stringr::str_extract(pattern = "\\(.+\\)$") %>% stringr::str_extract(pattern = "\\d+-\\d+\\)") %>% stringr::str_remove(pattern = "\\)$"),
TRUE ~ NA_character_
),
taxon_rank_value =
dplyr::case_when(
stringr::str_detect(taxon_rank_value, "\\(") ~ taxon_rank_value %>% stringr::str_extract(pattern = "^[A-Za-z]+ [A-Za-z]+"),
TRUE ~ taxon_rank_value
),
dplyr::across(
dplyr::starts_with("mass_absolute"),
as.numeric
),
mass_relative_mass = mass_absolute/mass_absolute_0,
id_replicate =
purrr::map_int(seq_len(nrow(.)), function(i) {
target <- paste0(taxon_rank_value, "_", label_collection)
which(which(target == target[[i]]) == i)
})
) %>%
tidyr::pivot_longer(
cols = dplyr::starts_with("mass_absolute"),
names_to = "variable",
values_to = "mass_absolute"
) %>%
dplyr::mutate(
id_dataset = datasets$id_dataset,
incubation_environment = "peat",
is_incubated = TRUE,
incubation_duration =
dplyr::case_when(
variable == "mass_absolute" ~ lubridate::dyears(2) %>% lubridate::time_length(unit = "days"),
TRUE ~ 0.0
),
mass_relative_mass =
dplyr::case_when(
incubation_duration == 0.0 ~ 1.0,
TRUE ~ mass_relative_mass
),
sampling_date =
dplyr::case_when(
variable == "mass_absolute" ~ "2019-05-01",
TRUE ~ "2017-05-01"
) %>%
as.Date(),
sampling_year = lubridate::year(sampling_date),
sampling_month = lubridate::month(sampling_date),
sampling_day = NA_real_,
sample_treatment = "transplanted",
mesh_size_absolute = 25/1000,
site_name = "McLean Bogs",
sampling_longitude = -76.2662,
sampling_latitude = 42.5488,
sample_depth_upper = 2, #---note: guessed based on text
sample_depth_lower = 2,
taxon_rank_name = "species",
taxon_organ = "whole plant",
sample_type = "litter",
type =
dplyr::case_when(
incubation_duration == 0.0 ~ "samples2",
TRUE ~ "samples3"
)
)
## Warning in mask$eval_all_mutate(quo): NAs introduced by coercion
## Warning in mask$eval_all_mutate(quo): NAs introduced by coercion
# litter collection ---note: there are two levels of litter collection here: For all species, there are samples from multiple sites. For some species, these were mixed, for others not.
samples1_1 <-
read.table(paste0(dir_source, "/raw/collection_information.tsv"), header = TRUE, sep = "\t", skip = 2L) %>%
dplyr::select(-c(5:6)) %>%
setNames(nm = c("id_bag_range", "taxon_rank_value", "name_collector", "label_collection", "site_name", "sampling_date", "coordinates")) %>%
dplyr::mutate(
sampling_date = as.Date(sampling_date, format = "%d-%b-%Y"),
label_collection =
ifelse(label_collection == "s.n.", NA_character_, label_collection),
name_collector =
name_collector %>%
stringr::str_remove("^[A-Z]+ "),
sampling_longitude =
coordinates %>%
stringr::str_remove("^\\d+\\.{1}\\d+, ") %>%
as.numeric(),
sampling_latitude =
coordinates %>%
stringr::str_extract("^\\d+\\.{1}\\d+") %>%
as.numeric(),
id_dataset = datasets$id_dataset[[1]],
id_sample = seq_len(nrow(.)) + id_last$id_sample,
id_sample_parent = id_sample,
id_sample_origin = id_sample,
incubation_environment = NA_character_,
is_incubated = FALSE,
incubation_duration = 0.0,
sampling_year = lubridate::year(sampling_date) + 2000,
sampling_month = lubridate::month(sampling_date),
sampling_day = lubridate::day(sampling_date),
sample_treatment = "control",
sample_depth_upper = 0,
sample_depth_lower = 5,
sample_type = "vegetation",
taxon_rank_name = "species",
taxon_organ = "whole plant"
)
# now_ define mixed samples
samples1_2 <-
samples2 %>%
dplyr::filter(incubation_duration == 0.0 & !duplicated(paste0(taxon_rank_value, "_", label_collection))) %>%
dplyr::mutate(
dplyr::across(
dplyr::all_of(c("site_name", "incubation_environment")),
function(.x) NA_character_
),
dplyr::across(
dplyr::all_of(c("sampling_longitude", "sampling_latitude", "sample_depth_upper", "sample_depth_lower", "sampling_year", "sampling_month", "sampling_day")),
function(.x) NA_real_
),
is_incubated = FALSE,
id_sample = seq_len(nrow(.)) + max(samples1_1$id_sample)
) %>%
dplyr::left_join(
samples1_1 %>%
dplyr::select(dplyr::all_of(c("taxon_rank_value", "label_collection", "id_sample_origin", "id_sample_parent"))) %>%
dplyr::filter(! is.na(label_collection)),
by = c("taxon_rank_value", "label_collection")
)
# experimental_design
samples2 <-
samples2 %>%
dplyr::mutate(
experimental_design =
paste0(
as.numeric(as.factor(site_name)), "_",
as.numeric(factor(label_collection, levels = unique(samples1_1$label_collection))),
as.numeric(as.factor(id_replicate))
)
)
samples1_1 <-
samples1_1 %>%
dplyr::mutate(
experimental_design =
paste0(
as.numeric(as.factor(site_name)), "_",
as.numeric(factor(label_collection, levels = unique(samples1_1$label_collection)))
)
)
samples1_2 <-
samples1_2 %>%
dplyr::mutate(
experimental_design =
paste0(
as.numeric(as.factor(site_name)), "_",
as.numeric(factor(label_collection, levels = unique(samples1_1$label_collection)))
)
)
# add missing ids
samples2 <-
samples2 %>%
dplyr::mutate(
id_sample = seq_len(nrow(.)) + max(samples1_2$id_sample),
id_sample_origin =
dplyr::left_join(
samples2 %>%
dplyr::select(site_name, label_collection, taxon_rank_value),
samples1_2 %>% dplyr::select(site_name, label_collection, taxon_rank_value, id_sample),
by = c("taxon_rank_value", "label_collection")
) %>%
dplyr::pull(id_sample),
id_sample_incubation_start =
purrr::map_int(seq_len(nrow(.)), function(i) {
index <- paste0(taxon_rank_value, "_", site_name, "_", taxon_organ) == paste0(taxon_rank_value, "_", site_name, "_", taxon_organ)[[i]] & experimental_design == experimental_design[[i]] & incubation_duration == 0.0
id_sample[index]
}),
id_sample_parent =
purrr::map_int(seq_len(nrow(.)), function(i) {
index <- paste0(taxon_rank_value, "_", site_name, "_", taxon_organ) == paste0(taxon_rank_value, "_", site_name, "_", taxon_organ)[[i]] & experimental_design == experimental_design[[i]] & incubation_duration < incubation_duration[[i]]
if(! any(index)) {
id_sample_origin[[i]]
} else {
target_incubation_duration <- max(incubation_duration[index])
index <- index & incubation_duration == target_incubation_duration
id_sample[index]
}
})
)
## combine
samples <-
dplyr::bind_rows(
db_template_tables$samples,
samples1_1 %>%
dplyr::mutate(
type = "samples1_1"
),
samples1_2 %>%
dplyr::mutate(
type = "samples1_2"
),
samples2
)
samples_to_samples <-
samples %>%
dplyr::filter(! id_sample %in% id_sample_origin) %>%
dplyr::mutate(
transition_description =
dplyr::case_when(
type %in% c("samples1_2") ~ "mix",
type %in% c("samples2") ~ "translocate",
type %in% c("samples3") ~ "wait",
TRUE ~ NA_character_
)
) %>%
dplyr::select(id_sample_parent, id_sample, transition_description) %>%
dplyr::rename(
id_sample_child = "id_sample"
)
# add samples which were mixed from other samples
samples_to_samples <-
dplyr::bind_rows(
samples_to_samples,
samples %>%
dplyr::filter(is.na(id_sample_origin)) %>%
dplyr::select(-id_sample_parent) %>%
dplyr::left_join(
samples1_1 %>%
dplyr::select(id_sample, taxon_rank_value) %>%
dplyr::rename(
id_sample_parent = "id_sample"
),
by = "taxon_rank_value"
) %>%
dplyr::mutate(
transition_description = "mix"
) %>%
dplyr::select(id_sample_parent, id_sample, transition_description) %>%
dplyr::rename(
id_sample_child = "id_sample"
)
)
d2 <-
samples2 %>%
tidyr::pivot_longer(
cols = dplyr::all_of(c("mass_absolute", "mass_relative_mass", "mesh_size_absolute")),
names_to = "attribute_name",
values_to = "value"
) %>%
dplyr::mutate(
id_measurement = seq_len(nrow(.)) + id_last$id_measurement,
id_measurement_numerator =
purrr::map_int(seq_len(nrow(.)), function(i) {
if(attribute_name[[i]] == "mass_relative_mass") {
id_measurement[id_sample == id_sample[[i]] & attribute_name == "mass_absolute"]
} else if (stringr::str_detect(attribute_name[[i]], pattern = "_relative_mass$")) {
id_measurement[id_sample == id_sample[[i]] & attribute_name == stringr::str_replace(attribute_name[[i]], "_relative_mass2?", "_absolute")]
} else {
NA_integer_
}
}),
id_measurement_denominator =
purrr::map_int(seq_len(nrow(.)), function(i) {
if(attribute_name[[i]] == "mass_relative_mass") {
id_measurement[id_sample == id_sample_incubation_start[[i]] & attribute_name == "mass_absolute"]
} else if (stringr::str_detect(attribute_name[[i]], pattern = "_relative_mass$")) {
id_measurement[id_sample == id_sample[[i]] & attribute_name == "mass_absolute"]
} else {
NA_integer_
}
}),
value_type = "point",
sample_size = 1L
)
# combine
d <-
dplyr::bind_rows(
db_template_tables$data,
d2
) %>%
dplyr::select(dplyr::all_of(colnames(db_template_tables$data)))
experimental_design_format <-
tibble::tibble(
id_dataset = datasets$id_dataset,
file = paste0(id_last$id_dataset + 1L, "/experimental_design_format.csv"),
experimental_design_description = "`site_name`: Character value. If `is_incubated = TRUE`: Name of the site where the samples where incubated. If `is_incubated = FALSE`: Name of the site where the samples grew. `label_collection`: Character value. Label for a litter sample collected at a specific site and plot If this is `NA` and `is_incubated = TRUE`, this means that the sample is a mixture of the samples for the same species grown at all sites (and with all `label_collection`). `id_replicate`: Integer value representing the litterbag number from the same `taxon_rank_value`, and `label_collection`."
)
# csv file to export
experimental_design_format2 <-
samples %>%
dplyr::filter(! is.na(experimental_design)) %>%
dplyr::filter(! duplicated(experimental_design)) %>%
dplyr::select(experimental_design, site_name, label_collection, id_replicate)
# export
write.csv(experimental_design_format2, paste0(dir_target, "/experimental_design_format.csv"), row.names = FALSE)
# list all tables
dm_insert_in <-
list(
datasets =
datasets %>%
dplyr::select(dplyr::all_of(colnames(dm_dpeatdecomposition$datasets))),
samples =
samples %>%
dplyr::select(dplyr::all_of(colnames(dm_dpeatdecomposition$samples))),
data =
d %>%
dplyr::select(dplyr::all_of(colnames(dm_dpeatdecomposition$data))),
samples_to_samples =
samples_to_samples %>%
dplyr::select(dplyr::all_of(colnames(dm_dpeatdecomposition$samples_to_samples))),
citations_to_datasets =
citations_to_datasets %>%
dplyr::select(dplyr::all_of(colnames(dm_dpeatdecomposition$citations_to_datasets))),
experimental_design_format =
experimental_design_format %>%
dplyr::select(dplyr::all_of(colnames(dm_dpeatdecomposition$experimental_design_format)))
)
# check whether all column names as present in table attributes
all_column_names <-
purrr::map(dm_insert_in, colnames) %>%
unlist() %>%
unique()
if(! all(all_column_names %in% (dm_dpeatdecomposition %>% dm::pull_tbl(attributes) %>% dplyr::pull(attribute_name)))) {
cond <- purrr::map_lgl(all_column_names, function(.x) ! .x %in% (dm_dpeatdecomposition %>% dm::pull_tbl(attributes) %>% dplyr::pull(attribute_name)))
RMariaDB::dbDisconnect(con)
stop(paste0("New `attribute_name`s discovered: ", paste(all_column_names[cond], collapse = ", ")))
}
all_data_attributes <- unique(dm_insert_in$data$attribute_name)
if(! all(all_data_attributes %in% (dm_dpeatdecomposition %>% dm::pull_tbl(attributes) %>% dplyr::pull(attribute_name)))) {
RMariaDB::dbDisconnect(con)
cond <- purrr::map_lgl(all_data_attributes, function(.x) ! .x %in% (dm_dpeatdecomposition %>% dm::pull_tbl(attributes) %>% dplyr::pull(attribute_name)))
stop(paste0("New `attribute_name`s discovered: ", paste(all_data_attributes[cond], collapse = ", ")))
}
# filter empty tables
dm_insert_in_check <-
dm_insert_in[purrr::map_lgl(dm_insert_in, function(x) nrow(x) > 0)] %>%
dm::as_dm() %>%
dp_dm_add_keys(dm_dpeatdecomposition)
# copy into dm_pmird
for(i in seq_along(dm_insert_in)) {
RMariaDB::dbAppendTable(con, name = names(dm_insert_in)[[i]], value = dm_insert_in[[i]])
}
RMariaDB::dbDisconnect(con)