This file serves as supplementary methods to the publication Michaelis et al. (2024): “Leveraging zebrafish embryo phenotypic observations to advance data-driven analyses in toxicology”. It contains all code needed to conduct the analysis as outlined in the paper as well as to generate the figures.
Authors: Paul Michaelis, Nils Klüver*, Silke Aulhorn, Hannes Bohring, Jan Bumberger, Kristina Haase, Tobias Kuhnert, Eberhard Küster, Janet Krüger, Till Luckenbach, Riccardo Massei, Lukas Nerlich, Sven Petruschke, Thomas Schnicke, Anton Schnurpel, Stefan Scholz, Nicole Schweiger, Daniel Sielaff, Wibke Busch*
* corresponding authors (nils.kluever@ufz.de, wibke.busch@ufz.de)
Data is read into the environment. This includes:
# data from .csv downloads
data_path = file.path('data', 'intob_data')
effect_list_info = read_csv(file.path(data_path, 'effect_list_info.csv'), show_col_types = F)
# a table of all effects, ordered roughly by category. To be used in plots to order effects.
col_ordering = read_tsv('data/data_for_analysis/ordering_effects.tsv', show_col_types = F)
d = list(
metadata = read_csv(file.path(data_path, 'metadata.csv'), show_col_types = F),
obs_phenotypes = read_csv(file.path(data_path, 'obs_phenotypes.csv'), show_col_types = F),
phys_chem_prop = read_csv(file.path(data_path, 'phys_chem_prop.csv'), show_col_types = F)
)
# only keep experiments that have observations
exp_obs = unique(d$obs_phenotypes$experiment_id)
d = lapply(d, function(x) {x %>% filter(experiment_id %in% exp_obs)})
# getting the plate layout by making a graphql request
token_file = file.path('api', 'token')
token = readChar(token_file, file.info(token_file)$size-1)
# data for plate layout
request_plate_layout = 'query {
experiments (date_lte:"2024-03-07T23:59:59+02:00", itemsPerPage:1000) {
collection {
_id
date
containers {
collection {
id
name
height
width
cells {
collection {
x
y
marker {
name
concentration
unit
substance {
name
}
}
}
}
}
}
}
}
}'
raw_data = query_intob(request_plate_layout, token = token)
saveRDS(raw_data, 'data/graphql/plate_layout_raw_data.rds')
raw_data_plates = readRDS(file.path('data', 'graphql', 'plate_layout_raw_data.rds'))
plate_layout = raw_data_plates$data$experiments$collection %>%
unnest(containers) %>% unnest(collection) %>%
rename(container_name = name) %>%
unnest(cells) %>% cast_df('collection') %>% unnest(collection) %>%
unnest(marker) %>%
rename(marker_name = name) %>%
unnest(substance) %>%
rename(
experiment_id = `_id`,
container_id = id,
substance_name = name
) %>%
add_column(container_type = .$height * .$width)
write_csv(plate_layout, 'data/data_for_publication/data/intob_data/plate_layout.csv')
# adding a column that contains the number of wells that are filled in a plate. If this number is different from container_type, the plate has not been filled completely
plate_layout = plate_layout %>%
group_by(experiment_id, container_id) %>%
mutate(plate_fill = n()) %>%
ungroup()
plate_layout = plate_layout %>% filter(experiment_id %in% d$metadata$experiment_id)
plate_layout$empty_wells = plate_layout$container_type - plate_layout$plate_fill
d$plate_layout = plate_layout
rm(raw_data_plates, plate_layout)
The observation table gets a new effect column
d$obs_phenotypes$effect = d$obs_phenotypes$detailed_effect
na_effects = is.na(d$obs_phenotypes$effect)
d$obs_phenotypes$effect[na_effects] = d$obs_phenotypes$parent_effect[na_effects]
d$obs_phenotypes = d$obs_phenotypes %>%
group_by(
experiment_id,
container,
age_embryo_exposure_start_hpf,
age_embryo_observation_hpf
) %>%
# mutate(container_type_from_data = max(position_x) * max(position_y)) %>%
mutate(n_effects_in_plate = n_distinct(position_x, position_y))
d$obs_phenotypes = d$obs_phenotypes %>%
mutate(container = as.character(container)) %>%
left_join(
d$plate_layout[, c(
'experiment_id',
'x',
'y',
'container_name',
'container_type',
'plate_fill',
'empty_wells'
)],
by = c(
'experiment_id' = 'experiment_id',
'position_x' = 'x',
'position_y' = 'y',
'container' = 'container_name'
)
)
# some substance names are very long, which is difficult to deal with in
# figures. They are changed to shorter synonyms
names_to_change = list(
"N-(1,3-Dimethylbutyl)-N'-phenyl-p-phenylenediamine" = '6-PPDQ',
"Ethyl 3-aminobenzoate methanesulfonic acid salt" = 'Tricaine mesylate',
"Tris(1,3-dichloro-2-propyl) phosphate" = 'TDCPP'
)
for (n in names(names_to_change)) {
d$obs_phenotypes$test_substance[d$obs_phenotypes$test_substance == n] = names_to_change[[n]]
d$metadata$test_substance[d$metadata$test_substance == n] = names_to_change[[n]]
d$phys_chem_prop$`Compound Name`[d$phys_chem_prop$`Compound Name` == n] = names_to_change[[n]]
d$plate_layout$marker_name[d$plate_layout$marker_name == n] = names_to_change[[n]]
d$plate_layout$substance_name[d$plate_layout$substance_name == n] = names_to_change[[n]]
}
Sometimes, experimenters only record adverse effects but not normal embryos. In these cases some embryos have no entry in the obs_phenotypes table.
exp_incomplete = d$obs_phenotypes %>%
filter(n_effects_in_plate != plate_fill) %>%
group_by(experiment_id) %>%
filter(!'normal' %in% effect) %>%
pull(experiment_id) %>%
unique()
For these experiments, the normal effects are infered in the data. This is done by comparing the plate layout with the recorded effects, any embryo that exists in the plate but has no recorded effect is assumed to be normal.
infer_normal = function(exp_obs, exp_plate) {
exp_obs = exp_obs %>% ungroup()
exp_obs_for_diff = exp_obs %>%
dplyr::select(
position_x,
position_y,
container,
concentration,
test_substance,
age_embryo_observation_hpf
)
exp_plate = exp_plate %>%
rename(
position_x = x,
position_y = y,
container = container_name,
concentration = concentration,
test_substance = substance_name
) %>%
dplyr::select(position_x,
position_y,
container,
concentration,
test_substance)
infered = lapply(unique(exp_obs$age_embryo_observation_hpf), function(t) {
exp_obs_t_diff = exp_obs_for_diff %>% filter(age_embryo_observation_hpf == t)
exp_obs_t_diff$age_embryo_observation_hpf = NULL
# wells that have no recorded effects
no_effect = setdiff(exp_plate, exp_obs_t_diff)
# joining the recorded effects with wells that have not been recorded
infered = full_join(
exp_obs[exp_obs$age_embryo_observation_hpf == t,],
no_effect,
by = c(
'position_x',
'position_y',
'container',
'concentration',
'test_substance'
)
) %>%
# some values are repeated and can be filled from the exisiting data
fill(
experiment_id,
age_embryo_exposure_start_hpf,
age_embryo_observation_hpf,
observation_type,
unit,
container_type,
plate_fill,
.direction = 'down'
) %>%
# the columns looking at plate fill and how many effects were recorded per plate are recomputed
group_by(
experiment_id,
container,
age_embryo_exposure_start_hpf,
age_embryo_observation_hpf
) %>%
mutate(n_effects_in_plate = n_distinct(position_x, position_y)) %>%
mutate(empty_wells = container_type - plate_fill) %>%
ungroup()
# the infered normal effects are entered
infered$parent_effect[is.na(infered$parent_effect)] = 'normal'
infered$effect[is.na(infered$effect)] = 'normal'
infered
}) %>%
do.call(rbind, .)
}
infered_exps = lapply(exp_incomplete, function(id) {
exp_obs_original = d$obs_phenotypes %>% filter(experiment_id == id)
exp_plate = d$plate_layout %>% filter(experiment_id == id)
infer_normal(exp_obs_original, exp_plate)
}) %>% data.table::rbindlist()
d$obs_phenotypes = d$obs_phenotypes %>% filter(!experiment_id %in% exp_incomplete)
d$obs_phenotypes = rbind(d$obs_phenotypes, infered_exps)
A effects summary table is made from the observations. Effects within each observation are summarised into categories:
# exposure starts at 10 hpf or earlier are grouped together. The original exposure starts can still be found in the metadata table if needed
d$obs_phenotypes$age_embryo_exposure_start_hpf[d$obs_phenotypes$age_embryo_exposure_start_hpf <= 10] = 0
d$effects_summary = d$obs_phenotypes %>%
group_by(
experiment_id,
age_embryo_exposure_start_hpf,
age_embryo_observation_hpf,
position_x,
position_y,
container,
concentration,
unit,
test_substance,
container_type
) %>%
summarise(
lethal = 'coagulated' %in% effect,
normal = 'normal' %in% effect,
sublethal = !normal,
.groups = 'drop'
)
# zotero library
zotero_lib = fromJSON(file.path('data', 'zotero_phenotype_library', '20240903_Zotero_library.json'))
zotero_effects = read_tsv(file.path('data', 'data_for_analysis', 'zotero_effects.tsv'),
show_col_types = F)
process_zotero = function(zotero_lib) {
zotero_lib = zotero_lib %>%
unnest(author) %>%
unnest(issued)
zotero_lib = zotero_lib %>%
rename(
effect_abb = archive,
age_hpf = archive_location,
test_substance = `call-number`,
fish_strain = `collection-title`
) %>%
# for mixtures, substances are seprated with semicolons, they are filtered out here
filter(!str_detect(test_substance, ';')) %>%
# some substances contain a dtxsid besides their name. This information is
# split into two columns, with NA for missing dtxsids
separate_wider_delim(
cols = 'test_substance',
delim = '|',
names = c('test_substance', 'dtxsid'),
too_few = 'align_start',
too_many = 'merge'
) %>%
# put all effects and time points in separate rows
# separate_longer_delim(cols = matches('age_hpf'), delim = ', ') %>%
separate_longer_delim(cols = matches('effect_abb'), delim = ', ') %>%
left_join(zotero_effects, by = 'effect_abb') %>%
relocate(
effect_abb, effect_long, age_hpf, test_substance, dtxsid, fish_strain,
.before = 1
) %>%
unite(author, matches('family|given'), sep = ', ') %>%
group_by(id) %>%
mutate(
author = paste(unique(author), collapse = '; ')
)
zotero_lib$year = zotero_lib$`date-parts` %>% lapply(function(l) {l[[1]]}) %>% as.numeric()
zotero_lib$first_author = lapply(zotero_lib$author, function(l) {str_extract(l, '^\\w+')}) %>% as.character()
zotero_lib$study_short = paste0(zotero_lib$first_author, ' et al. ', as.character(zotero_lib$year))
zotero_lib$`date-parts` = NULL
zotero_lib$accessed = NULL
zotero_lib$`non-dropping-particle` = NULL
zotero_lib$`dropping-particle` = NULL
zotero_lib = unique(zotero_lib)
zotero_lib %>% ungroup()
}
zotero_lib = process_zotero(zotero_lib)
zotero_substances = zotero_lib %>% ungroup() %>% dplyr::select(test_substance) %>% unique()
write_csv(zotero_lib, 'data/zotero_phenotype_library/phenotype_library.csv')
If experiments have n unique recorded effects, which effects are those?
d$effects_per_exp_long = d$obs_phenotypes %>%
left_join(
unique(
d$metadata %>% dplyr::select(
experiment_id,
quality_flag
)
),
by = 'experiment_id'
) %>%
group_by(experiment_id) %>%
mutate(
n_unique_effects = n_distinct(effect)
)
The dataset contains test experiments with no biological meaning. Typically they contain little data. It is probably safe to assume that any experiment with only one condition (conditions = unique combinations of exp. start, observation time point, concentration and substance) can be filtered out. Some experiments with only two conditions seem valid, though (e.g. exp id 428).
Experiments that:
are filtered out.
d$effects_per_exp = d$obs_phenotypes %>%
left_join(
unique(d$metadata %>% dplyr::select(experiment_id, quality_flag)),
by = 'experiment_id'
) %>%
group_by(experiment_id, quality_flag) %>%
reframe(
n_unique_effects_parent = n_distinct(parent_effect),
n_unique_effects_child = n_distinct(detailed_effect),
n_unique_effects = n_distinct(effect),
n_conditions = pick(
age_embryo_exposure_start_hpf,
age_embryo_observation_hpf,
concentration,
test_substance
) %>% unique() %>% nrow(),
n_embryos = pick(
age_embryo_exposure_start_hpf,
position_x,
position_y,
container
) %>% unique() %>% nrow(),
n_observations = pick(
age_embryo_exposure_start_hpf,
position_x,
position_y,
container,
age_embryo_observation_hpf,
concentration,
test_substance
) %>% unique() %>% nrow(),
n_effects = n()
) %>%
add_column(
observations_per_embryo = .$n_observations / .$n_embryos
)
embryo_cutoff = 15
n_cond_cutoff = 2
exp_ids_filter = d$effects_per_exp %>%
filter(
n_embryos < embryo_cutoff |
n_conditions < n_cond_cutoff
)
data.frame(
description = c('n_experiments', 'n_embryos'),
before_filtering = c(
n_distinct(d$effects_per_exp$experiment_id),
sum(d$effects_per_exp$n_embryos)
),
after_filtering =
c(
n_distinct(d$effects_per_exp[!d$effects_per_exp$experiment_id %in% exp_ids_filter$experiment_id,]$experiment_id),
sum(d$effects_per_exp[!d$effects_per_exp$experiment_id %in% exp_ids_filter$experiment_id,]$n_embryos)
),
difference =
c(
n_distinct(d$effects_per_exp[d$effects_per_exp$experiment_id %in% exp_ids_filter$experiment_id,]$experiment_id),
sum(d$effects_per_exp[d$effects_per_exp$experiment_id %in% exp_ids_filter$experiment_id,]$n_embryos)
)
) %>%
add_column(perc_difference = .$difference / .$before_filtering)
d_f = list(
obs_phenotypes = d$obs_phenotypes %>% filter(!experiment_id %in% exp_ids_filter$experiment_id),
effects_per_exp = d$effects_per_exp %>% filter(!experiment_id %in% exp_ids_filter$experiment_id),
effects_per_exp_long = d$effects_per_exp_long %>% filter(!experiment_id %in% exp_ids_filter$experiment_id),
effects_summary = d$effects_summary %>% filter(!experiment_id %in% exp_ids_filter$experiment_id),
metadata = d$metadata %>% filter(!experiment_id %in% exp_ids_filter$experiment_id),
phys_chem_prop = d$phys_chem_prop %>% filter(!experiment_id %in% exp_ids_filter$experiment_id),
plate_layout = d$plate_layout %>% filter(!experiment_id %in% exp_ids_filter$experiment_id)
)
Untreated embryos naturally have lethal or sublethal effects at a certain probability. If embryos have too much variability this distorts the results of the treated embryos. What is the probability an embryo has an effect and how can experiments be assessed for adequate embryo quality?
To test for experiments that have insufficient embryo quality, a binomial test is used. To get a probability of an embryo being normal the proportion of normal embryos is calculated at the last observations in each respective experiment across all experiments. This is assumed to be the probability of a healthy embryo not exhibiting lethal or sublethal effects by chance. For each experiment a binomial test is performed, testing whether the proportion of normal embryos could be resulting from a binomial distribution with p=0.936 (H0) or p<0.936 (H1). Since significant results from this test are experiments that exhibit insufficient embryo quality, so we are interested in insignificant results (do not accept H1) from this test for further analysis.
c0_norm_not_norm = d_f$obs_phenotypes %>%
filter(concentration == 0) %>%
filter(parent_effect != 'heartbeat') %>%
group_by(experiment_id, position_x, position_y, container) %>%
filter(age_embryo_observation_hpf == max(age_embryo_observation_hpf)) %>%
summarise(
normal = 'normal' %in% effect,
coagulated = 'coagulated' %in% effect,
sublethal = !'normal' %in% effect & !'coagulated' %in% effect,
total = !'normal' %in% effect,
.groups = 'drop'
)
# proportion of normal embryos in the complete dataset
p_normal = sum(c0_norm_not_norm$normal) / nrow(c0_norm_not_norm)
p_normal
## [1] 0.9361913
p_sublethal = sum(c0_norm_not_norm$total) / nrow(c0_norm_not_norm)
p_sublethal
## [1] 0.06380866
p_lethal = sum(c0_norm_not_norm$coagulated) / nrow(c0_norm_not_norm)
p_lethal
## [1] 0.04106498
# aggregate by experiment
c0_aggregated = c0_norm_not_norm %>% group_by(experiment_id) %>%
reframe(
n_normal = sum(normal),
n_coagulated = sum(coagulated),
n_sublethal = sum(sublethal),
n_total = sum(total),
ratio_normal = sum(normal) / n(),
ratio_coagulated = sum(coagulated) / n(),
ratio_sublethal = sum(sublethal) / n(),
n_embryos = n()
) %>%
# filter(n > 20) %>%
left_join(unique(d_f$metadata[, c('experiment_id', 'experiment_date', 'quality_flag')]), by = 'experiment_id')
c0_aggregated$binom_test_p_1s = mapply(
function(x, n, p) {
binom.test(x, n, p, alternative = 'less')$p.value
},
x = c0_aggregated$n_normal,
n = c0_aggregated$n_embryos,
p = p_normal
)
c0_aggregated$binom_test_adjp_1s = p.adjust(c0_aggregated$binom_test_p_1s, method = 'BH')
c0_aggregated$pass_binom = c0_aggregated$binom_test_p_1s > 0.05
message('Number of experiments at this point:')
## Number of experiments at this point:
c0_aggregated$experiment_id %>% n_distinct()
## [1] 608
message('Number of experiments if experiments with fewer than 90% normal control embryos are removed:')
## Number of experiments if experiments with fewer than 90% normal control embryos are removed:
c0_aggregated %>% filter(ratio_normal >= 0.9) %>% nrow()
## [1] 454
message('Number of experiments if experiments that don\'t pass the binomial test (p < 0.05) are removed:')
## Number of experiments if experiments that don't pass the binomial test (p < 0.05) are removed:
c0_aggregated %>% filter(binom_test_p_1s > 0.05) %>% nrow()
## [1] 561
Supplemenatry figure: How does p-value adjustment influence the results?
theme_control_var = theme(
axis.title = element_text(size = 8),
legend.title = element_text(size = 8),
plot.margin = unit(c(0.6, 0.6, 0.6, 0.6), 'lines')
)
p_controls_padj = c0_aggregated %>% ggplot(aes(
x = n_embryos,
y = ratio_normal,
colour = binom_test_adjp_1s < 0.05 | n_embryos < 9
# size = n_embryos
)) +
geom_vline(xintercept = 8.5, linewidth = 0.2, colour = 'grey60') +
geom_point(alpha = 0.5, shape = 19) +
scale_colour_manual(name = 'padj < 0.05 | n < 9', values = c('forestgreen', 'firebrick3')) +
# scale_size(name = 'Number of control embryos', breaks = c(5, 10, 25, 50, 100), range = c(0.5, 6)) +
guides(colour = guide_legend(order = 2), size = guide_legend(order = 1)) +
labs(
y = 'Ratio of normal embryos',
x = 'Number of control embryos'
) +
theme_control_var +
theme(
legend.position = 'bottom',
legend.box = 'vertical'
)
p_controls_p = c0_aggregated %>% ggplot(aes(
x = n_embryos,
y = ratio_normal,
colour = binom_test_p_1s < 0.05 | n_embryos < 9
# size = n_embryos
)) +
geom_vline(xintercept = 8.5, linewidth = 0.2, colour = 'grey60') +
geom_point(alpha = 0.5, shape = 19) +
scale_colour_manual(name = 'p < 0.05 | n < 9', values = c('forestgreen', 'firebrick3')) +
# scale_size(name = 'Number of control embryos', breaks = c(5, 10, 25, 50, 100), range = c(0.5, 6)) +
guides(colour = guide_legend(order = 2), size = guide_legend(order = 1)) +
labs(
y = 'Ratio of normal embryos',
x = 'Number of control embryos'
) +
theme_control_var +
theme(
legend.position = 'bottom',
legend.box = 'vertical'
)
p_controls = egg::ggarrange(
p_controls_p, p_controls_padj,
ncol = 2,
labels = letters[1:2],
label.args = subfigure_label_args,
draw = F
)
ggsave(
'results/figures/control_variability/p_p_padj_comparison.svg',
plot = p_controls,
device = 'svg',
width = 15,
height = 8,
units = 'cm',
bg = 'white'
)
Experiments that don’t pass the binomial test for control variability are filtered out of the dataset.
exp_fail_binom = c0_aggregated$experiment_id[!c0_aggregated$pass_binom |
c0_aggregated$n_embryos < 9]
data.frame(
description = c('n_experiments', 'n_embryos'),
before_filtering = c(
n_distinct(d_f$effects_per_exp$experiment_id),
sum(d_f$effects_per_exp$n_embryos)
),
after_filtering =
c(
n_distinct(d_f$effects_per_exp[!d_f$effects_per_exp$experiment_id %in% exp_fail_binom,]$experiment_id),
sum(d_f$effects_per_exp[!d_f$effects_per_exp$experiment_id %in% exp_fail_binom,]$n_embryos)
),
difference =
c(
n_distinct(d_f$effects_per_exp[d_f$effects_per_exp$experiment_id %in% exp_fail_binom,]$experiment_id),
sum(d_f$effects_per_exp[d_f$effects_per_exp$experiment_id %in% exp_fail_binom,]$n_embryos)
)
) %>%
add_column(perc_difference = .$difference / .$before_filtering)
d_f_c = list(
obs_phenotypes = d_f$obs_phenotypes %>% filter(!experiment_id %in% exp_fail_binom),
effects_per_exp = d$effects_per_exp %>% filter(!experiment_id %in% exp_fail_binom),
effects_per_exp_long = d$effects_per_exp_long %>% filter(!experiment_id %in% exp_fail_binom),
effects_summary = d_f$effects_summary %>% filter(!experiment_id %in% exp_fail_binom),
metadata = d_f$metadata %>% filter(!experiment_id %in% exp_fail_binom),
phys_chem_prop = d_f$phys_chem_prop %>% filter(!experiment_id %in% exp_fail_binom),
plate_layout = d_f$plate_layout %>% filter(!experiment_id %in% exp_fail_binom)
)
Experiments with mixtures or environmental samples are separated from experiments with single (comptox) substances.
d_f_c$metadata %>%
dplyr::select(experiment_id, category) %>%
filter(category != 'control') %>%
unique() %>%
group_by(category) %>%
summarise(n_exp = n_distinct(experiment_id))
other_types = c('mixture', 'sample')
exp_comptox = d_f_c$metadata %>%
group_by(experiment_id) %>%
filter(!any(other_types %in% category)) %>%
pull(experiment_id) %>% unique()
exp_mix_env_other = d_f_c$metadata %>%
filter(category %in% c('mixture', 'sample', 'other')) %>%
pull(experiment_id) %>% unique()
d_f_comptox = list(
obs_phenotypes = d_f_c$obs_phenotypes %>% filter(experiment_id %in% exp_comptox),
effects_per_exp = d$effects_per_exp %>% filter(experiment_id %in% exp_comptox),
effects_per_exp_long = d$effects_per_exp_long %>% filter(experiment_id %in% exp_comptox),
effects_summary = d_f_c$effects_summary %>% filter(experiment_id %in% exp_comptox),
metadata = d_f_c$metadata %>% filter(experiment_id %in% exp_comptox),
phys_chem_prop = d_f_c$phys_chem_prop %>% filter(experiment_id %in% exp_comptox),
plate_layout = d_f_c$plate_layout %>% filter(experiment_id %in% exp_comptox)
)
d_f_mix_env_other = list(
obs_phenotypes = d_f_c$obs_phenotypes %>% filter(experiment_id %in% exp_mix_env_other),
effects_per_exp = d$effects_per_exp %>% filter(experiment_id %in% exp_mix_env_other),
effects_per_exp_long = d$effects_per_exp_long %>% filter(experiment_id %in% exp_mix_env_other),
effects_summary = d_f_c$effects_summary %>% filter(experiment_id %in% exp_mix_env_other),
metadata = d_f_c$metadata %>% filter(experiment_id %in% exp_mix_env_other),
phys_chem_prop = d_f_c$phys_chem_prop %>% filter(experiment_id %in% exp_mix_env_other),
plate_layout = d_f_c$plate_layout %>% filter(experiment_id %in% exp_mix_env_other)
)
The controls are usually called ‘Control 1’ in the
test_substance column. For modelling it is easier to have
the controls also labelled with the actual test substance. This also
means that when multiple substances were measured in a single experiment
with shared controls, these have to be repeated. Finally, modelling will
only be done for comptox substances, but not mixtures or environmental
samples.
comptox_substances = d_f_comptox$metadata %>% filter(category %in% c('comptox')) %>%
dplyr::select(test_substance, dtxsid, `MolWeight_g/mol`) %>% unique()
# a new summary table only to be used for modelling, as this will contain repeated values
d_f_comptox$effects_summary_modelling = d_f_comptox$effects_summary %>%
multiply_controls() %>%
inner_join(comptox_substances, by = c('compound_new' = 'test_substance'))
# converting all mg/L concentrations to umol/L; some concentrations are in %, they will not be changed
d_f_comptox$effects_summary_modelling$concentration = apply(d_f_comptox$effects_summary_modelling, 1, function(row) {
unit = row[['unit']]
conc = as.numeric(row[['concentration']])
molweight = as.numeric(row[['MolWeight_g/mol']])
if (unit == 'mg/L') {
# mg/L is converted to umol/L
mg_to_umol(conc_mg = conc, molweight = molweight)
} else {
# umol/L and % values remain unchanged
conc
}
})
# all units are changed accordingly, only Methanol keeps the % concentration
d_f_comptox$effects_summary_modelling$unit[d_f_comptox$effects_summary_modelling$compound_new != 'Methanol'] = 'umol/L'
substance_data_overview = d_f_comptox$effects_summary_modelling %>%
filter(compound_new %in% comptox_substances$test_substance) %>%
group_by(
compound_new,
age_embryo_exposure_start_hpf,
age_embryo_observation_hpf
) %>%
summarise(
n_exp = n_distinct(experiment_id),
n_conc = n_distinct(concentration),
n_embryos = pick(
experiment_id,
position_x,
position_y,
container
) %>% unique() %>% nrow(),
.groups = 'drop'
)
As some experimenters only record lethality (i.e. only normal or coagulated), not all experiments can be used for modelling of sublethal drcs. The comptox data is further split into a dataset that can be used for lethal drcs (all data) and a dataset that can be used for sublethal drcs (only experiments that record > 2 unique effects).
exp_2_effects = d_f_comptox$effects_per_exp %>%
filter(n_unique_effects < 3) %>%
pull(experiment_id)
data.frame(
description = c('n_experiments', 'n_embryos'),
before_filtering = c(
n_distinct(d_f_comptox$effects_per_exp$experiment_id),
sum(d_f_comptox$effects_per_exp$n_embryos)
),
after_filtering =
c(
n_distinct(d_f_comptox$effects_per_exp[!d_f_comptox$effects_per_exp$experiment_id %in% exp_2_effects,]$experiment_id),
sum(d_f_comptox$effects_per_exp[!d_f_comptox$effects_per_exp$experiment_id %in% exp_2_effects,]$n_embryos)
),
difference =
c(
n_distinct(d_f_comptox$effects_per_exp[d_f_comptox$effects_per_exp$experiment_id %in% exp_2_effects,]$experiment_id),
sum(d_f_comptox$effects_per_exp[d_f_comptox$effects_per_exp$experiment_id %in% exp_2_effects,]$n_embryos)
)
) %>%
add_column(perc_difference = .$difference / .$before_filtering)
d_f_comptox_sublethal = list(
obs_phenotypes = d_f_comptox$obs_phenotypes %>% filter(!experiment_id %in% exp_2_effects),
effects_per_exp = d_f_comptox$effects_per_exp %>% filter(!experiment_id %in% exp_2_effects),
effects_per_exp_long = d_f_comptox$effects_per_exp_long %>% filter(!experiment_id %in% exp_2_effects),
effects_summary = d_f_comptox$effects_summary %>% filter(!experiment_id %in% exp_2_effects),
effects_summary_modelling = d_f_comptox$effects_summary_modelling %>% filter(!experiment_id %in% exp_2_effects),
metadata = d_f_comptox$metadata %>% filter(!experiment_id %in% exp_2_effects),
phys_chem_prop = d_f_comptox$phys_chem_prop %>% filter(!experiment_id %in% exp_2_effects),
plate_layout = d_f_comptox$plate_layout %>% filter(!experiment_id %in% exp_2_effects)
)
d_f_comptox$effects_summary_modelling %>%
filter(age_embryo_exposure_start_hpf %in% c(0, 24) & age_embryo_observation_hpf %in% c(24, 48, 72, 96, 120)) %>%
group_by(experiment_id, age_embryo_exposure_start_hpf, age_embryo_observation_hpf) %>%
reframe() %>%
dplyr::select(age_embryo_exposure_start_hpf, age_embryo_observation_hpf) %>%
table()
## age_embryo_observation_hpf
## age_embryo_exposure_start_hpf 24 48 72 96 120
## 0 294 289 135 124 95
## 24 9 59 51 56 26
exposure_ov_lethal = d_f_comptox$effects_summary_modelling %>%
group_by(
age_embryo_exposure_start_hpf,
age_embryo_observation_hpf
) %>%
summarise(
n_exp = n_distinct(experiment_id),
n_substances = n_distinct(compound_new),
n_conc = n_distinct(concentration),
n_embryos = pick(
experiment_id,
position_x,
position_y,
container
) %>% unique() %>% nrow(),
.groups = 'drop'
) %>%
filter(
age_embryo_exposure_start_hpf %in% c(0, 24) &
age_embryo_observation_hpf %in% (24*1:5)
)
exposure_ov_sublethal = d_f_comptox_sublethal$effects_summary_modelling %>%
group_by(
age_embryo_exposure_start_hpf,
age_embryo_observation_hpf
) %>%
summarise(
n_exp = n_distinct(experiment_id),
n_substances = n_distinct(compound_new),
n_conc = n_distinct(concentration),
n_embryos = pick(
experiment_id,
position_x,
position_y,
container
) %>% unique() %>% nrow(),
.groups = 'drop'
) %>%
filter(
age_embryo_exposure_start_hpf %in% c(0, 24) &
age_embryo_observation_hpf %in% (24*1:5)
)
Figure for manuscript: General overview over INTOB and filtering of experiments with low embryo quality.
theme_intob_ov = theme(
axis.title = element_text(size = 8),
legend.title = element_text(size = 8),
plot.margin = unit(c(0.6, 0.6, 0.6, 0.6), 'lines')
)
col_grey = 'grey50'
# embryos per substance barplot
p_bar_substances_n_embryos = d_f$plate_layout %>%
filter(concentration != 0) %>%
group_by(substance_name) %>%
summarise(n_embryos = n()) %>%
arrange(desc(n_embryos)) %>%
slice_head(n = 10) %>%
ggplot(
aes(y = factor(substance_name, levels = rev(substance_name)), x = n_embryos)
) +
geom_bar(stat = 'identity', width = 0.55, fill = col_grey) +
scale_x_continuous(breaks = c(0, 1000, 2000)) +
labs(
x = 'Embryos (n)'
) +
theme_intob_ov +
theme(
axis.title.y = element_blank(),
axis.text.y = element_text(size = 8, family = 'Roboto condensed')
)
p_hist_embryos_per_exp = d_f$plate_layout %>%
group_by(experiment_id) %>%
summarise(n_embryos = n()) %>%
ggplot(aes(x = n_embryos)) +
geom_histogram(binwidth = 10, fill = col_grey) +
labs(
x = 'Embryos (n)',
y = 'Experiments (n)'
) +
theme_intob_ov
# overview over exposure designs, i.e. start hpf and observation hpf
p_exp_designs = d_f$obs_phenotypes %>%
group_by(experiment_id) %>%
reframe(
start_hpf = unique(age_embryo_exposure_start_hpf),
obs_hpf = unique(age_embryo_observation_hpf)
) %>%
group_by(start_hpf, obs_hpf) %>%
summarise(n = n(), .groups = 'drop') %>%
filter(start_hpf %in% c(0, 24, 48)) %>%
filter(obs_hpf %in% c(24, 48, 72, 96, 120)) %>%
ggplot(
aes(x = factor(obs_hpf), y = factor(start_hpf), label = n)
) +
geom_point(aes(size = n), colour = 'grey15', alpha = 0.7, shape = 16) +
geom_text(size = 9/.pt, family = 'Roboto', colour = 'grey99') +
labs(
x = 'Age at observation (hpf)',
y = 'Age at exposure (hpf)'
) +
scale_size_continuous(
range = c(6, 15)
) +
theme_intob_ov +
theme(
legend.position = 'none'
)
# visualisation of the control variability analysis
p_controls_p = c0_aggregated %>% ggplot(aes(
x = n_embryos,
y = ratio_normal,
colour = binom_test_p_1s < 0.05 | n_embryos < 9
)) +
geom_vline(xintercept = 8.5, linewidth = 0.2, colour = 'grey60') +
geom_point(alpha = 0.6, shape = 16, size = 2) +
scale_colour_manual(name = 'p < 0.05 | n < 9', values = c('forestgreen', 'firebrick3')) +
scale_y_continuous(limits = c(0, 1)) +
guides(colour = guide_legend(order = 2), size = guide_legend(order = 1)) +
labs(
y = 'Ratio of normal embryos',
x = 'Number of control embryos'
) +
theme_intob_ov +
theme(
legend.title = element_text(hjust = 0.5),
legend.position = 'inside',
legend.position.inside = c(0.85, 0.15)
)
p_intob_ov = egg::ggarrange(
p_bar_substances_n_embryos,
p_exp_designs,
p_hist_embryos_per_exp,
p_controls_p,
labels = letters[1:4],
label.args = subfigure_label_args,
padding = unit(1, 'cm'),
ncol = 2,
draw = F
)
ggsave(
filename = 'results/figures/control_variability/p_control_variability.svg',
plot = p_intob_ov,
device = 'svg',
width = 17.5,
height = 10.5,
units = 'cm',
bg = 'white'
)
Dose response curves for comptox substances for data with exposure
starts at 0 or 24 hpf and observations at 48, 72, 96 or 120 hpf are
modeled. For lethal models data from d_f_comptox can be
used, for sublethal models data from d_f_comptox_sublethal
has to be used, as in this dataset it is ensured that all experiments
recorded sublethal effects.
# some helper functions
get_model_data = function(effects_summary, start_hpf, end_hpf, substance) {
effects_summary %>%
filter(
age_embryo_exposure_start_hpf == start_hpf &
age_embryo_observation_hpf == end_hpf &
compound_new == substance
)
}
set_model_name = function(t1, t2) {paste('t', t1, t2, sep = '_')}
get_model_name_t1 = function(model_name) {str_split_1(model_name)[2]}
get_model_name_t2 = function(model_name) {str_split_1(model_name)[3]}
start_end_df = expand_grid(start_hpf = c(0, 24), end_hpf = c(48, 72, 96, 120))
Aggregating the data needed for the figure in the manuscript displaying concentration ranges.
# only substances that have observations at 96hpf
substances = d_f_comptox$effects_summary_modelling %>%
filter((age_embryo_exposure_start_hpf == 0 & age_embryo_observation_hpf == 96) |
(age_embryo_exposure_start_hpf == 24 & age_embryo_observation_hpf == 96)) %>%
pull(compound_new) %>% unique()
substances = substances[substances != 'Methanol']
data_conc_ranges = d_f_comptox$effects_summary_modelling %>%
filter(compound_new %in% substances) %>%
dplyr::select(
experiment_id,
position_x,
position_y,
container,
unit,
test_substance,
compound_new,
dtxsid,
`MolWeight_g/mol`,
concentration
) %>% unique()
# add concentration in mg/L. All concentrations have been converted to umol/L above.
data_conc_ranges$concentration_mgL = umol_to_mg(data_conc_ranges$concentration,
data_conc_ranges$`MolWeight_g/mol`)
# adding a column with number of tested embryos, will be used for fill aesthetic in the plot
data_conc_ranges = data_conc_ranges %>%
group_by(compound_new) %>%
mutate(n_embryos = n()) %>%
ungroup()
# getting the max concentrations, will be used for ordering
max_concentrations = data_conc_ranges %>%
group_by(compound_new) %>%
summarise(max_c = mean(concentration)) %>%
arrange(max_c)
# getting ec50 and lc50 values for 0-96 and 24-96
lc50_values = lapply(substances, function(s) {
lc50_096 = NA
lc50ste_096 = NA
if (class(models_all[[s]]$lethal$t_0_96) == 'drc') {
x = ED(models_all[[s]]$lethal$t_0_96, 50, display = F)
lc50_096 = x[[1]]
lc50ste_096 = x[[2]]
}
ec50_096 = NA
ec50ste_096 = NA
if (class(models_all[[s]]$sublethal$t_0_96) == 'drc') {
x = ED(models_all[[s]]$sublethal$t_0_96, 50, display = F)
ec50_096 = x[[1]]
ec50ste_096 = x[[2]]
}
lc50_2496 = NA
lc50ste_2496 = NA
if (class(models_all[[s]]$lethal$t_24_96) == 'drc') {
x = ED(models_all[[s]]$lethal$t_24_96, 50, display = F)
lc50_2496 = x[[1]]
lc50ste_2496 = x[[2]]
}
ec50_2496 = NA
ec50ste_2496 = NA
if (class(models_all[[s]]$sublethal$t_24_96) == 'drc') {
x = ED(models_all[[s]]$sublethal$t_24_96, 50, display = F)
ec50_2496 = x[[1]]
ec50ste_2496 = x[[2]]
}
rbind(
data.frame(
compound_new = s,
xc50 = lc50_096,
xc50ste = lc50ste_096,
type = 'LC50 (0-96)'
),
data.frame(
compound_new = s,
xc50 = ec50_096,
xc50ste = ec50ste_096,
type = 'EC50 (0-96)'
),
data.frame(
compound_new = s,
xc50 = lc50_2496,
xc50ste = lc50ste_2496,
type = 'LC50 (24-96)'
),
data.frame(
compound_new = s,
xc50 = ec50_2496,
xc50ste = ec50ste_2496,
type = 'EC50 (24-96)'
)
)
}) %>% do.call(rbind, .)
lc50_values = lc50_values %>% left_join(comptox_substances, by = c('compound_new' = 'test_substance'))
lc50_values$xc50_mgL = umol_to_mg(lc50_values$xc50, lc50_values$`MolWeight_g/mol`)
lc50_values$xc50ste_mgL = umol_to_mg(lc50_values$xc50ste, lc50_values$`MolWeight_g/mol`)
sensitivity_ratios = lc50_values %>%
separate_wider_delim(type, delim = ' ', names = c('type', 'time_point')) %>%
group_by(compound_new, time_point) %>%
summarise(sens_ratio = xc50[type == 'LC50'] / xc50[type == 'EC50'], .groups = 'drop') %>%
add_column(log_sens_ratio = log2(.$sens_ratio))
# a summary of the modelling results, will be plotted alongside the concentration ranges
model_overview = lapply(substances, function(s) {
modelling_res = function(substance, model_list, type, timeframe) {
model = model_list[[s]][[type]][[timeframe]]
if (class(model) == 'drc') {
'modelling successful'
} else {
if (nrow(model$origData) == 0) {
'no data'
} else {
'modelling unsuccessful'
}
}
}
data.frame(
substance = s,
'EC50_096' = modelling_res(s, models_all, 'sublethal', 't_0_96'),
'LC50_096' = modelling_res(s, models_all, 'lethal', 't_0_96'),
'EC50_2496' = modelling_res(s, models_all, 'sublethal', 't_24_96'),
'LC50_2496' = modelling_res(s, models_all, 'lethal', 't_24_96')
)
}) %>% do.call(rbind, .) %>%
pivot_longer(cols = contains('50'), names_to = 'model_type')
Building the figure for the manuscript.
theme_conc_ranges = theme(
legend.position = 'bottom',
axis.title.y = element_blank(),
axis.text.y = element_text(size = 7),
axis.text.x = element_text(size = 7),
axis.title.x = element_text(size = 7),
legend.text = element_text(size = 7),
legend.title = element_text(size = 7)
)
theme_conc_ranges_side = theme(
panel.border = element_blank(),
axis.ticks.y = element_blank()
)
# colour for NA values
grey_light = 'grey90'
# lightish colour to show data
grey_mid = 'grey70'
# dark grey for outlines
grey_dark = 'grey50'
# violin plots showing the measured concentrations per substance, additionally,
# EC50 and LC50 values are drawn
p_conc_range = data_conc_ranges %>%
filter(concentration != 0) %>%
ggplot(aes(
x = concentration,
y = factor(compound_new, levels = max_concentrations$compound_new)
)) +
geom_violin(linewidth = 0.2, fill = grey_light, colour = grey_dark, alpha = 0.5) +
geom_point(
data = lc50_values,
mapping = aes(x = xc50, y = compound_new, colour = type),
size = 0.75,
inherit.aes = F
) +
geom_errorbarh(
data = lc50_values,
mapping = aes(xmin = xc50-xc50ste, xmax = xc50+xc50ste, y = compound_new, colour = type),
# height = 0.5,
inherit.aes = F
) +
labs(
x = 'Concentration (µmol/L)'
) +
scale_colour_manual(
name = NULL,
values = c(
'EC50 (0-96)' = 'firebrick1',
'LC50 (0-96)' = 'firebrick4',
'EC50 (24-96)' = 'dodgerblue2',
'LC50 (24-96)' = 'dodgerblue4'
),
# labels = c(
# expression(EC['50']~'(0-96 hpf)'),
# expression(LC['50']~'(0-96 hpf)'),
# expression(EC['50']~'(24-96 hpf)'),
# expression(LC['50']~'(24-96 hpf)')
# ),
guide = guide_legend(ncol = 2)
) +
scale_x_log10(guide = 'axis_logticks', breaks =1*10^c(-4, -2, 0, 2, 4)) +
theme_conc_ranges +
theme(
panel.grid.major.x = element_line(linewidth = 0.2, colour = grey_light),
panel.grid.minor.x = element_line(linewidth = 0.2, colour = grey_light),
panel.grid.major.y = element_line(linewidth = 0.2, colour = grey_light),
axis.line.x = element_line(colour = 'black', linewidth = 0.2),
axis.ticks.x = element_line(colour = 'black', linewidth = 0.2),
plot.margin = unit(c(.1, .1, .1, .1), 'lines')
)
p_model_overview = model_overview %>%
ggplot(aes(
x = factor(
model_type,
levels = c(
'EC50_096',
'LC50_096',
'EC50_2496',
'LC50_2496'
)
),
y = factor(substance, levels = max_concentrations$compound_new),
colour = value
)) +
# geom_tile(colour = grey_light, width = 0.8, height = 0.8) +
geom_point(shape = 15, size = 2.1) +
scale_colour_manual(
name = NULL,
values = c(
'modelling successful' = 'forestgreen',
'modelling unsuccessful' = 'firebrick3',
'no data' = grey_light
),
guide = guide_legend(
keyheight = unit(2.5, units = "mm"),
keywidth = unit(2.5, units = "mm"),
ncol = 1
)
) +
scale_x_discrete(
labels = c(
'0-96 hpf sublethal',
'0-96 hpf lethal',
'24-96 hpf sublethal',
'24-96 hpf lethal'
)
) +
# coord_fixed() +
theme_conc_ranges +
theme_conc_ranges_side +
theme(
axis.title.x = element_blank(),
panel.grid = element_blank(),
axis.ticks = element_blank(),
axis.text.x = element_text(angle = 90, hjust = 1, vjust = 0.5),
axis.text.y = element_blank()
)
p_sens_ratio_tile = sensitivity_ratios %>%
ggplot(aes(
x = time_point,
y = factor(compound_new, levels = max_concentrations$compound_new),
colour = sens_ratio
)) +
# geom_tile(colour = grey_light, width = 0.8, height = 0.8) +
geom_point(shape = 15, size = 2.1) +
scale_colour_gradientn(
colours = c('mediumorchid4', 'gold1'),
values = c(0, 0.1, 0.2, 0.3, 0.4, 0.5, 0.75, 1),
na.value = grey_light,
name = 'Sensitivity ratio'
) +
scale_x_discrete(labels = c('0-96 hpf', '24-96 hpf')) +
theme_conc_ranges +
theme_conc_ranges_side +
theme(
axis.title.x = element_blank(),
panel.grid = element_blank(),
axis.text.x = element_text(angle = 90, hjust = 1, vjust = 0.5),
axis.text.y = element_blank(),
axis.ticks = element_blank(),
legend.key.width = unit(5, 'mm'),
legend.key.height = unit(3.5, 'mm'),
legend.title.position = 'top'
)
# density plot to show the distribution of concentrations across all substances
p_conc_ranges_all = data_conc_ranges %>%
filter(concentration != 0) %>%
ggplot(
aes(x = concentration, y = after_stat(density))
) +
geom_density(fill = grey_mid, colour = alpha('white', 0)) +
scale_x_log10() +
theme_void()
# bar plot to show the number of embryos per substance
p_substances_n_embryos = data_conc_ranges %>%
ggplot(
aes(y = factor(compound_new, levels = max_concentrations$compound_new))
) +
geom_bar(fill = grey_mid, width = 0.75) +
scale_x_continuous(breaks = c(0, 2500)) +
labs(x = 'Embryos (n)') +
theme_conc_ranges +
theme_conc_ranges_side +
theme(
axis.title.y = element_blank(),
axis.text.y = element_blank(),
panel.grid = element_blank(),
axis.line.x = element_line(colour = 'black', linewidth = .2),
axis.ticks.x = element_line(colour = 'black', linewidth = .2),
plot.margin = unit(c(.1, .1, .1, .1), 'lines')
)
# an empty plot to use when plotting the above plots in a grid
p_empty = ggplot(mtcars, aes(x = wt, y = mpg)) + theme_void() + geom_blank()
p_data = egg::ggarrange(
# first row
p_conc_ranges_all, p_empty, p_empty, p_empty,
# second row
p_conc_range + theme(legend.position = 'none'),
p_substances_n_embryos,
p_model_overview + theme(legend.position = 'none'),
p_sens_ratio_tile + theme(legend.position = 'none'),
ncol = 4,
widths = c(10, 1.1, 1.5, 0.75),
heights = c(1, 30),
draw = F,
debug = F
)
p_legend = egg::ggarrange(
p_empty,
ggpubr::get_legend(p_conc_range) %>% as_ggplot(),
ggpubr::get_legend(p_model_overview) %>% as_ggplot(),
ggpubr::get_legend(p_sens_ratio_tile) %>% as_ggplot(),
p_empty,
widths = c(1, 2, 2, 2, 1),
ncol = 5,
draw = F
)
p = plot_grid(
p_data, p_legend,
ncol = 1, rel_heights = c(30, 2)
)
ggsave(
filename = file.path('results', 'figures', 'concentration_ranges', 'concentration_ranges.svg'),
plot = p,
device = 'svg',
width = 16,
height = 22,
units = 'cm',
bg = 'white',
create.dir = T
)
To ensure similar concentration ranges are compared for all substances, for each substance, data is filtered to be in the range of the EC5 and the LC99. In this range substances cause phenotpypic effects, but effects are not dominated by lethality.
filter_low_occ_effects = function(heatmap, cutoff = 0.01) {
# filter seldom occuring effects
keep_cols = apply(heatmap, 2, mean)[apply(heatmap, 2, mean) > cutoff] %>% names()
heatmap = heatmap[, keep_cols]
return(heatmap)
}
clusters_like_complexHeatmap = function(hclust, heatmap, k) {
cluster_old = hclust %>% cutree(k)
dend = as.dendrogram(hclust)
# this is how complexHeatmap reorders the dendrogram
dend = reorder(dend, -rowMeans(heatmap))
# reversing it orders the clusters from top to bottom in ascending numbers, as
# they are plotted in complexHeatmap
row_index = rev(order.dendrogram(dend))
df = data.frame(
substance = names(cluster_old),
cluster = cluster_old
)
df = df[row_index, ]
df$cluster_new = c(1:6)[factor(df$cluster, levels = unique(df$cluster))]
cluster_new = df$cluster_new %>% set_names(df$substance)
cluster_new
}
set.seed(123)
cluster_n = c(
'48' = 6,
'72' = 6,
'96' = 6,
'120' = 4
)
time_points = c(48, 72, 96, 120)
# iterate over observation time points
pfp_data = lapply(time_points, function(t2) {
# iterate over substances
pfp_data_t = lapply(models_all, function(model_list) {
# iterate over exposure start points
lapply(c(0, 24), function(t1) {
model_name = set_model_name(t1, t2)
# get lower limit = EC5, skip if model does not exist
if (class(model_list$sublethal[[model_name]]) != 'drc') {
return(NULL)
} else {
lower_conc = ED(model_list$sublethal[[model_name]], 5, display = F)[[1]]
}
# get upper limit = LC99, Inf if model does not exist
if (class(model_list$lethal[[model_name]]) != 'drc') {
upper_conc = Inf
} else {
upper_conc = ED(model_list$lethal[[model_name]], 99, display = F)[[1]]
}
observations = d_f_comptox_sublethal$obs_phenotypes %>%
filter(
test_substance == model_list$substance &
age_embryo_exposure_start_hpf == t1 &
age_embryo_observation_hpf == t2
)
observations = left_join(observations, comptox_substances, by = 'test_substance')
# converting all mg/L concentrations to umol/L
observations$concentration = mapply(
function(unit, conc, molweight) {
if (unit == 'mg/L') {
# mg/L is converted to umol/L
mg_to_umol(conc, molweight)
} else {
# umol/L and % values remain unchanged
conc
}
},
unit = observations$unit,
conc = observations$concentration,
molweight = observations$`MolWeight_g/mol`
)
observations$unit[observations$unit == 'mg/L'] = 'umol/L'
observations = observations %>% filter(concentration >= lower_conc &
concentration <= upper_conc)
return(observations)
}) %>% rbindlist()
}) %>% rbindlist()
pfp_data_t = pfp_data_t %>% filter(
!effect %in% c(
# 'coagulated',
'beats per minute',
'miscellaneous',
'no embryo',
'normal',
'hours post fertilization'
)
)
pfp_data_t = pfp_data_t %>% group_by(test_substance) %>%
mutate(
n_embryos = nrow(unique(pick(experiment_id, container, position_x, position_y))),
n_exp = n_distinct(experiment_id)
) %>%
ungroup()
pfp_data_heatmap_t = pfp_data_t %>% group_by(test_substance) %>%
# mutate(n_exp = n_distinct(experiment_id)) %>%
# filter substances that only have 1 experiment
# filter(n_exp > 1) %>%
mutate(n_embryos = pick(experiment_id, position_x, position_y, container) %>% unique() %>% nrow()) %>%
filter(n_embryos > 15) %>%
group_by(test_substance, effect) %>%
summarise(percentage = n() / unique(n_embryos),
.groups = 'drop') %>%
pivot_wider(
id_cols = test_substance,
names_from = effect,
values_from = percentage,
values_fill = 0
) %>%
as.data.frame()
rownames(pfp_data_heatmap_t) = pfp_data_heatmap_t$test_substance
pfp_data_heatmap_t$test_substance = NULL
pfp_data_heatmap_t = pfp_data_heatmap_t %>% as.matrix()
# filtering low data substances and effects
pfp_data_t = pfp_data_t %>% filter(n_embryos >= 30)
pfp_data_heatmap_t = pfp_data_heatmap_t[unique(pfp_data_t$test_substance), ]
pfp_data_heatmap_t = filter_low_occ_effects(pfp_data_heatmap_t, cutoff = 0.01)
hclust_t = hclust(dist(pfp_data_heatmap_t, method = 'euclidean'), method = 'ward.D2')
return(list(
pfp_data = pfp_data_t,
pfp_heatmap = pfp_data_heatmap_t,
pfp_hclust = hclust_t
))
}) %>% set_names(time_points)
Heatmap of the fingerprints for each timepoint (48, 72, 96, 120 hpf).
set.seed(123)
# defining plot and font sizes
# plot size in cm
w_cm = 12
h_cm = 18
# conversion to inches, as svg() uses inches
w_in = w_cm / 2.54
h_in = h_cm / 2.54
# font size row and column labels
fontsize_row_col = 7
font_size_smaller = fontsize_row_col - 1
# family = 'Arial'
# colour for NA values
grey_light = 'grey90'
# lightish colour to show data
grey_mid = 'grey70'
# dark grey for outlines
grey_dark = 'grey30'
time_points = c(48, 72, 96, 120)
for (t in time_points) {
t = as.character(t)
pfp_data_t = pfp_data[[t]]$pfp_data
pfp_data_heatmap_t = pfp_data[[t]]$pfp_heatmap
col_ordering_t = col_ordering %>% filter(effect %in% colnames(pfp_data_heatmap_t))
effect_proportions = lapply(colnames(pfp_data_heatmap_t), function(eff) {
n_embryos = pfp_data_t %>%
dplyr::select(experiment_id, container, position_x, position_y) %>%
unique() %>%
nrow()
n_effect = pfp_data_t %>%
filter(effect == eff) %>%
dplyr::select(experiment_id, container, position_x, position_y) %>%
unique() %>%
nrow()
return(n_effect / n_embryos)
}) %>% as.numeric() %>% set_names(colnames(pfp_data_heatmap_t))
substances_n_embryos = lapply(rownames(pfp_data_heatmap_t), function(s) {
pfp_data_t %>%
filter(test_substance == s) %>%
dplyr::select(experiment_id, position_x, position_y, container) %>%
unique() %>%
nrow()
}) %>% as.numeric() %>% set_names(rownames(pfp_data_heatmap_t))
substances_n_exp = lapply(rownames(pfp_data_heatmap_t), function(s) {
pfp_data_t %>%
filter(test_substance == s) %>%
pull(experiment_id) %>% n_distinct()
}) %>% as.numeric() %>% set_names(rownames(pfp_data_heatmap_t))
clusters = pfp_data[[t]]$pfp_hclust %>% cutree(k = cluster_n[t])
ht_opt$TITLE_PADDING = unit(c(1.5, -0.2), "lines")
svg(
filename = sprintf('results/figures/phenotypic_fingerprint/heatmaps/cheatmap_EC5_LC99_t%s_cut.svg', t),
width = w_in,
height = h_in,
bg = 'white'
)
draw(
Heatmap(
pfp_data_heatmap_t,
# colour and legend
col = colorRampPalette(c('#ffefff', '#660066'))(1024),
heatmap_legend_param = list(
legend_direction = 'horizontal',
legend_width = unit(2, 'cm'),
tick_length = unit(0.5, 'mm'),
title = 'Prevalence of embryos\nexhibiting effect',
title_gp = gpar(fontsize = fontsize_row_col, fontface = 'plain'),
labels_gp = gpar(fontsize = font_size_smaller)
),
border = T,
border_gp = gpar(col = grey_light),
# clustering rows
clustering_distance_rows = 'euclidean',
clustering_method_rows = 'ward.D2',
row_dend_width = unit(w_in * 0.1, 'inches'),
row_title_rot = 0,
row_title_side = 'right',
row_title_gp = gpar(fontsize = fontsize_row_col + 1, fill = grey_light, lwd = 0),
split = cluster_n[t],
# row_split = clusters,
row_gap = unit(0.2, 'lines'),
# order of columns
cluster_columns = F,
column_order = col_ordering_t$effect,
# text size
row_names_gp = grid::gpar(fontsize = fontsize_row_col),
column_names_gp = grid::gpar(fontsize = fontsize_row_col),
# annotations next to the heatmap
top_annotation = HeatmapAnnotation(
prop = anno_barplot(
effect_proportions,
border = F,
bar_width = 0.5,
height = unit(h_in * 0.04, 'inches'),
ylim = c(0, round(max(effect_proportions), digits = 1)),
gp = gpar(fill = grey_mid, col = alpha('black', 0)),
axis_param = list(
at = c(0, 0.4),
gp = grid::gpar(fontsize = font_size_smaller, col = grey_dark)
)
),
annotation_label = 'Prevalence\nof effect',
annotation_name_side = 'left',
annotation_name_rot = 0,
# axis title
annotation_name_gp = grid::gpar(fontsize = font_size_smaller, col = grey_dark)
),
right_annotation = rowAnnotation(
n_embryos = anno_barplot(
substances_n_embryos,
border = F,
bar_width = 0.5,
width = unit(w_in * 0.04, 'inches'),
gp = gpar(fill = grey_mid, col = alpha('black', 0)),
axis_param = list(
# breaks = 0, max rounded to nearest hundred and half of max rounded to hundred
at = c(
0,
round(max(substances_n_embryos), digits = -2)/2,
round(max(substances_n_embryos), digits = -2)
),
gp = gpar(fontsize = font_size_smaller, col = grey_dark)
)
),
# n_embryos = anno_numeric(
# substances_n_embryos,
# labels_gp = gpar(fontsize = font_size_smaller - 1),
# bg_gp = gpar(fill = "grey70", col = alpha('black', 0)),
# labels_offset = unit(2, "pt"),
# round_corners = F,
# width = unit(w_in * 0.04, 'inches')
# ),
annotation_label = 'embryos in\nfingerprint (n)',
annotation_name_rot = 90,
annotation_name_gp = grid::gpar(fontsize = font_size_smaller, col = grey_dark)
)
),
heatmap_legend_side = "bottom"
)
dev.off()
ht_opt(RESET = TRUE)
}
Supplementary figure with PCA biplots for the fingerprints.
pca_plots = lapply(as.character(c(48, 72, 96, 120)), function(t) {
pca_biplot(pfp_data[[t]]$pfp_heatmap) +
theme(
axis.text = element_text(family = 'Roboto', size = 6),
axis.title = element_text(family = 'Roboto', size = 8)
)
})
p_pca_combined = egg::ggarrange(
plots = pca_plots,
ncol = 2,
labels = letters[1:4],
label.args = subfigure_label_args,
draw = F
)
ggsave(
'results/figures/phenotypic_fingerprint/pca/pca_all_effects.svg',
plot = p_pca_combined,
device = 'svg',
width = 18,
height = 18,
units = 'cm',
bg = 'white'
)
Figure with UMAP embeddings and heatmaps of the PCA loadings for each time point.
set.seed(123)
pca_loading_plots = lapply(as.character(c(48, 72, 96, 120)), function(t) {
pca = prcomp(pfp_data[[t]]$pfp_heatmap, scale. = F)
explained_variance <- pca$sdev^2
proportion_variance_explained <- explained_variance / sum(explained_variance)
proportion_variance_explained[1:5] %>% sum
var_explained_df = data.frame(
name = colnames(pca$rotation[, 1:5]),
var_expl = proportion_variance_explained[1:5]
)
euc_norm = pca$rotation[, 1:5] %>% apply(1, function(x) sqrt(sum(x^2)))
pc_sums = abs(pca$rotation[, 1:5]) %>% rowSums()
pca$rotation[, 1:5] %>% as_tibble(rownames = 'effect') %>%
add_column(importance = euc_norm) %>%
arrange(importance) %>%
slice_tail(n = 10) %>%
pivot_longer(cols = starts_with('PC')) %>%
left_join(var_explained_df, by = 'name') %>%
add_column(
pc_expl = sprintf('%s\n%.1f%%', .$name, round(.$var_expl * 100, digits = 1))
) %>%
ggplot(aes(
x = pc_expl,
y = factor(effect, levels = unique(effect)),
colour = value
)) +
geom_point(shape = 'square', size = 4) +
scale_colour_gradient2(
name = 'Loading of variable',
low = 'darkblue',
mid = 'grey90',
high = 'darkred',
breaks = c(-0.6, 0, 0.6),
labels = c('-0.6', '0', '0.6')
) +
theme(
axis.title = element_blank(),
legend.key.size = unit(c(0.3), 'cm'),
axis.text = element_text(family = 'Roboto condensed', size = 7),
legend.title = element_text(family = 'Roboto', size = 7),
legend.text = element_text(family = 'Roboto', size = 7),
legend.title.position = 'top'
)
}) %>% set_names(c(48, 72, 96, 120))
umap_plots = lapply(as.character(c(48, 72, 96, 120)), function(t) {
umap = umap(pfp_data[[t]]$pfp_heatmap)
clusters = clusters_like_complexHeatmap(
pfp_data[[t]]$pfp_hclust,
pfp_data[[t]]$pfp_heatmap,
cluster_n[t]
)
clusters_df = data.frame(
substance = names(clusters),
cluster = clusters
)
umap_df = as.data.frame(umap$layout) %>%
add_column(substance = rownames(umap$layout)) %>%
left_join(clusters_df, by = 'substance')
umap_df %>% ggplot(
aes(x = V1, y = V2, colour = factor(cluster), label = substance)
) +
geom_point(size = 2.5, alpha = 0.8) +
# geom_text(size = 6/.pt) +
scale_colour_manual(
values = c(
'#D43F3AFF',
'#EEA236FF',
'#5CB85CFF',
'#357EBDFF',
'#9632B8FF',
'#B8B8B8FF'
)[1:cluster_n[t]]
) +
labs(
x = 'UMAP 1',
y = 'UMAP 2'
) +
theme(
axis.title = element_text(family = 'Roboto', size = 8),
axis.text = element_blank(),
axis.ticks = element_blank()
)
}) %>% set_names(c(48, 72, 96, 120))
p_empty = ggplot(mtcars, aes(x = wt, y = mpg)) + theme_void() + geom_blank()
for (n in names(umap_plots)) {
p = plot_grid(
plot_grid(
p_empty,
umap_plots[[n]],
nrow = 1,
rel_widths = c(1, 10)
),
pca_loading_plots[[n]],
ncol = 1,
rel_heights = c(3, 3)
)
ggsave(
sprintf('results/figures/phenotypic_fingerprint/umap/umap_t%s_no_names.svg', n),
plot = p,
device = 'svg',
width = 7,
height = 17,
units = 'cm',
bg = 'white'
)
}
substance_fp_comparison = function(substance, intob_pfp, zotero, intob_tps = c(72, 96, 120), return_data = F) {
all_effects = c('coagulated', 'abnormal eye', 'abnormal tail effects', 'developmental delay',
'malformation', 'abnormal behavior', 'heart', 'abnormal blood circulation',
'edema', 'abnormal pigmentation', 'abnormal hatching', 'abnormal swim bladder',
'abnormal intestine') %>% sort()
ds_zotero = zotero %>% filter(test_substance == substance)
ds_zotero = ds_zotero %>%
arrange(year) %>%
dplyr::select(effect_long, study_short) %>%
dplyr::rename(source = study_short, parent_effect = effect_long) %>%
add_column(value = NA_integer_) %>%
filter(!parent_effect %in% c('gene regulation', 'mortality rate', 'neuronal effects'))
ds_intob = lapply(as.character(intob_tps), function(t) {
ds_pfp = intob_pfp[[t]]$pfp_data %>% filter(test_substance == substance)
n_embryos = ds_pfp %>%
dplyr::select(test_substance, experiment_id, position_x, position_y, container) %>%
unique() %>%
nrow()
ds_pfp = intob_pfp[[t]]$pfp_data %>% filter(test_substance == substance) %>%
group_by(parent_effect) %>%
count() %>%
ungroup() %>%
mutate(value = n / n_embryos) %>%
add_column(source = sprintf('INTOB (%s hpf)', t)) %>%
dplyr::select(-n)
ds_pfp$parent_effect[ds_pfp$parent_effect == 'deformation / malformation'] = 'malformation'
ds_pfp$parent_effect[ds_pfp$parent_effect == 'developmental delay effects'] = 'developmental delay'
ds_pfp$parent_effect[ds_pfp$parent_effect == 'heartbeat'] = 'heart'
ds_pfp
}) %>% do.call(rbind, .)
ds_combined = rbind(ds_intob, ds_zotero) %>%
add_column(substance = substance)
# create a factor for source, where INTOB is last, preceeded by the studies
ds_combined$source = factor(
ds_combined$source,
levels = rev(c('INTOB', unique(ds_combined$source[ds_combined$source != 'INTOB'])))
)
if (return_data) return(ds_combined)
ds_combined %>% ggplot(
aes(x = factor(parent_effect, levels = all_effects), y = source, fill = value)
) +
geom_point(shape = 22, size = 3, stroke = 0.3) +
scale_x_discrete(drop = F) +
scale_fill_gradient(
name = 'Prevalence of effect',
low = 'white',
high = 'firebrick3',
na.value = 'grey'
) +
guides(fill = guide_colorbar(title.position = 'top')) +
theme(
axis.text.x = element_text(angle = 90, hjust = 1, vjust = 0.5),
axis.title = element_blank(),
legend.title = element_text(size = 7)
)
}
age_selection = c('72', '96', '120', '72, 96', '72, 120', '96, 120', '72, 96, 120')
substance_fp_comparison('Nicotine', pfp_data, zotero_lib[zotero_lib$age_hpf %in% age_selection, ]) %>%
ggsave(
'results/figures/zotero/nicotine.png',
plot = .,
device = 'png',
width = 7,
height = 8,
units = 'cm',
dpi = 300,
bg = 'white'
)
p_vpa = substance_fp_comparison('Valproic acid', pfp_data, zotero_lib[zotero_lib$age_hpf %in% age_selection, ])
p_vpa %>%
ggsave(
'results/figures/zotero/valproic_acid.png',
plot = .,
device = 'png',
width = 7,
height = 8,
units = 'cm',
dpi = 300,
bg = 'white'
)
vpa_df = data.frame(
ecx = c(1317, 5.7, 435, 80.6, 82, 0.586),
ecx_ste = c(55, NA, NA, 10.2, NA, NA),
type = c('LC50', 'LC50', 'LC50', 'EC50', 'EC50', 'EC50'),
source = c('INTOB (0-96 hpf)',
'Jarque et al. 2020',
'Brotzmann et al. 2021',
'INTOB (0-96 hpf)',
'Brotzmann et al. 2021',
'Jarque et al. 2020')
)
vpa_df$source = factor(
vpa_df$source,
levels = rev(c(
'INTOB (0-96 hpf)',
'Jarque et al. 2020',
'Brotzmann et al. 2021'
))
)
vpa_ec_lc = vpa_df %>% ggplot(
aes(x = ecx, colour = type, y = source, xmin = ecx-ecx_ste, xmax = ecx+ecx_ste)
) +
geom_point(size = 1.5) +
geom_errorbarh(height = 0.45) +
scale_x_log10(guide = 'axis_logticks', breaks =c(1, 10, 100, 1000)) +
scale_colour_manual(name = NULL, values = c(
'EC50' = 'firebrick1',
'LC50' = 'firebrick4'
)) +
labs(
x = 'Concentration (µmol/L)'
) +
theme(
axis.title.y = element_blank(),
legend.direction = 'vertical',
axis.title.x = element_text(size = 7)
)
egg::ggarrange(
p_vpa,
vpa_ec_lc,
heights = c(1, 0.75),
labels = c('b', 'c'),
label.args = subfigure_label_args
) %>%
ggsave(
'results/figures/zotero/valproic_acid_ec_lc.svg',
plot = .,
device = 'svg',
width = 7,
height = 12,
units = 'cm',
dpi = 300,
bg = 'white'
)
for (s in intersect(substances_list$zotero, substances_list$intob)) {
p = substance_fp_comparison(s, pfp_data, zotero_lib_combined) +
ggtitle(s)
print(p)
}
plotlist = lapply(intersect(substances_list$zotero, substances_list$intob), function(s) {
substance_fp_comparison(s, pfp_data, zotero_lib_combined) +
ggtitle(s)
})
p = cowplot::plot_grid(
plotlist = plotlist,
ncol = 4
)
ggsave(
filename = 'results/figures/zotero/zotero_intob_effects.png',
plot = p,
device = 'png',
width = 50,
height = 70,
units = 'cm',
dpi = 350,
bg = 'white'
)
# top_zotero_substances = zotero_lib_combined %>%
# group_by(test_substance) %>%
# count() %>%
# arrange(desc(n)) %>%
# pull(test_substance) %>% .[1:15]
#
# for (s in top_zotero_substances) {
# p = substance_fp_comparison(s, pfp_data, zotero_lib_combined) +
# ggtitle(s)
# print(p)
# }
Making a heatmap of selected substances from the literature review dataset.
age_selection = c('72', '96', '120', '72, 96', '72, 120', '96, 120', '72, 96, 120')
zotero_matrix_df = zotero_lib %>%
filter(age_hpf %in% age_selection) %>%
mutate(id = sprintf('%s (%s)', test_substance, study_short)) %>%
mutate(id = iconv(id, from = 'UTF-8', to = 'ASCII//TRANSLIT')) %>%
mutate(observed = 1) %>%
group_by(test_substance) %>%
mutate(substance_n = n_distinct(id)) %>%
filter(substance_n > 1) %>%
pivot_wider(
id_cols = matches('id'),
names_from = effect_long,
values_from = observed,
values_fn = first,
values_fill = 0
) %>%
as.data.frame()
zotero_matrix_df = zotero_matrix_df[!duplicated(zotero_matrix_df$id), ]
zotero_matrix = zotero_matrix_df
zotero_matrix$dtxsid = NULL
zotero_matrix$`NA` = NULL
zotero_matrix$`gene regulation` = NULL
zotero_matrix$id = NULL
rownames(zotero_matrix) = zotero_matrix_df$id
zotero_matrix = zotero_matrix %>% as.matrix()
zotero_matrix = zotero_matrix[rowSums(zotero_matrix) != 0, ]
set.seed(1234)
# defining plot and font sizes
# plot size in cm
w_cm = 13
h_cm = 19
# conversion to inches, as svg() uses inches
w_in = w_cm / 2.54
h_in = h_cm / 2.54
# font size row and column labels
fontsize_row_col = 7
font_size_smaller = fontsize_row_col - 1
substances = rownames(zotero_matrix) %>% str_extract('^.+(?= \\()')
studies = rownames(zotero_matrix) %>% str_extract('(?<= \\().+(?=\\))')
svg(
filename = sprintf(
'results/figures/zotero/zotero_heatmap_72_96_120.svg'
),
width = w_in,
height = h_in,
bg = 'white'
)
draw(
Heatmap(
zotero_matrix,
show_row_names = F,
# colour and legend
col = colorRampPalette(c('grey90', 'grey20'))(2),
heatmap_legend_param = list(
title = 'Effect reported',
title_gp = gpar(fontsize = fontsize_row_col, fontface = 'plain'),
lgd = Legend(
at = c(0, 1),
labels = c('not reported', 'reported'),
legend_gp = gpar(fill = c('grey90', 'grey20'), fontsize = fontsize_row_col-1)
),
legend_direction = 'horizontal',
legend_width = unit(w_in * 0.15, 'inches')
),
right_annotation = rowAnnotation(
substance = zotero_matrix %>% rownames %>% str_extract('^.+(?=[:space:]\\()'),
substance_name = anno_text(substances, gp = gpar(fontsize = fontsize_row_col, fontface = 'plain')),
study = zotero_matrix %>% rownames %>% str_extract('\\(.+'),
study_name = anno_text(studies, gp = gpar(fontsize = fontsize_row_col, fontface = 'plain')),
show_legend = F,
annotation_name_gp = gpar(fontsize = fontsize_row_col, col = 'grey20'),
simple_anno_size = unit(0.3, 'cm')
),
# clustering rows
clustering_distance_rows = 'euclidean',
clustering_method_rows = 'ward.D2',
row_dend_width = unit(w_in * 0.1, 'inches'),
# order of columns
cluster_columns = F,
# text size
row_names_gp = grid::gpar(fontsize = fontsize_row_col),
column_names_gp = grid::gpar(fontsize = fontsize_row_col)
)
)
dev.off()
corr_time = function(experiments, t1, t2, occurence_cut = 0.005, message = F) {
# get the relevant data
exps = d_f_comptox_sublethal$obs_phenotypes %>%
# only containers where embryos can be tracked
filter(experiment_id %in% experiments) %>%
# only treatment concentrations
filter(concentration != 0) %>%
# filter uninformative effects
filter(
!effect %in% c(
'beats per minute',
'miscellaneous',
'no embryo',
'hours post fertilization'
)
) %>%
# get experiments that have observations at 48 and 96
group_by(experiment_id) %>%
filter(t1 %in% age_embryo_observation_hpf & t2 %in% age_embryo_observation_hpf) %>%
filter(age_embryo_observation_hpf %in% c(t1, t2)) %>%
add_column(
embryo_id = paste(
.$experiment_id,
.$test_substance,
.$position_x,
.$position_y,
.$container,
sep = '.'
)
)
if (message) message(sprintf('n embryos: %i', n_distinct(exps$embryo_id)))
# get the data in matrix format
a_48 = exps %>%
ungroup() %>%
filter(age_embryo_observation_hpf == t1) %>%
dplyr::select(embryo_id, effect) %>%
group_by(
embryo_id
) %>%
count(effect) %>%
# rbind(data.frame(
# embryo_id = 'dummy_embryo_a',
# effect = col_ordering$effect,
# n = 0
# )) %>%
pivot_wider(names_from = effect,
values_from = n,
values_fill = 0) %>%
ungroup()
a_96 = exps %>%
ungroup() %>%
filter(age_embryo_observation_hpf == t2) %>%
dplyr::select(embryo_id, effect) %>%
group_by(
embryo_id
) %>%
count(effect) %>%
# rbind(data.frame(
# embryo_id = 'dummy_embryo_b',
# effect = col_ordering$effect,
# n = 0
# )) %>%
pivot_wider(names_from = effect,
values_from = n,
values_fill = 0) %>%
ungroup()
a_48_names = a_48 %>%
filter(embryo_id %in% intersect(a_48$embryo_id, a_96$embryo_id)) %>%
dplyr::select(intersect(names(a_48), names(a_96)))
a_96_names = a_96 %>%
filter(embryo_id %in% intersect(a_48$embryo_id, a_96$embryo_id)) %>%
dplyr::select(intersect(names(a_48), names(a_96)))
a_48 = a_48_names %>% dplyr::select(!embryo_id) %>% as.matrix()
a_96 = a_96_names %>% dplyr::select(!embryo_id) %>% as.matrix()
# filter effects that occur in less than 1% of embryos
high_occurence_effects = rbind(a_48, a_96) %>% apply(2, function(x) sum(x)/length(x))
high_occurence_effects = names(high_occurence_effects[high_occurence_effects > occurence_cut])
a_48 = a_48[, high_occurence_effects]
a_96 = a_96[, high_occurence_effects]
# create matrices of correlations and p-values
n = colnames(a_48) %>% length
cor_48_96 = lapply(colnames(a_48), function(effect_48) {
lapply(colnames(a_96), function(effect_96) {
c = cor.test(a_48[, effect_48], a_96[, effect_96])
m = matrix(c$estimate)
colnames(m) = effect_48
rownames(m) = effect_96
m
}) %>% do.call(rbind, .)
}) %>% do.call(cbind, .)
pval_48_96 = lapply(colnames(a_48), function(effect_48) {
lapply(colnames(a_96), function(effect_96) {
c = cor.test(a_48[, effect_48], a_96[, effect_96])
m = matrix(c$p.value)
colnames(m) = effect_48
rownames(m) = effect_96
m
}) %>% do.call(rbind, .)
}) %>% do.call(cbind, .)
# cols are effects at t1
# rows are effects at t2
return(
list(
cor = cor_48_96,
pval = pval_48_96
)
)
}
exps = d_f_comptox_sublethal$obs_phenotypes %>%
# only containers where embryos can be tracked
filter(container_type %in% c(24, 48, 96)) %>%
pull(experiment_id) %>% unique()
cor_48_96 = corr_time(exps, 48, 96, message = T)
## n embryos: 778
w_cm = 9
h_cm = 9
w_in = w_cm / 2.54
h_in = h_cm / 2.54
svg(
'results/figures/effect_propagation/effect_prop_48_96_corrplot.svg',
width = w_in,
height = h_in,
pointsize = 7,
family = 'Roboto',
bg = 'white'
)
corrplot(
cor_48_96$cor,
method = 'square',
col = rev(COL2('RdBu', 200)),
col.lim = c(-1, 1),
addgrid.col = 'grey95',
# order = 'hclust',
# hclust.method = 'mcquitty',
tl.col = 'black',
cl.align.text = 'l',
pch = 4,
pch.cex = 1,
pch.col = 'grey70',
p.mat = cor_48_96$pval,
sig.level = 0.01
)
dev.off()
figure_time_conc = function(
s,
start_hpf = 0,
time_points = c(48, 72, 96),
text_size = 7,
return_legend = F
) {
theme_time_conc = theme(
legend.position = 'bottom',
axis.title.y = element_blank(),
axis.text.y = element_text(size = text_size, colour = 'black'),
axis.text.x = element_text(size = text_size-1, colour = 'black'),
axis.title.x = element_text(size = text_size),
legend.text = element_text(size = text_size),
legend.title = element_text(size = text_size),
strip.text = element_text(size = text_size),
)
# colour for NA values
grey_light = 'grey90'
# lightish colour to show data
grey_mid = 'grey80'
# dark grey for outlines
grey_dark = 'grey50'
weight = comptox_substances$`MolWeight_g/mol`[comptox_substances$test_substance == s]
s_exps = d_f_comptox_sublethal$obs_phenotypes %>% filter(test_substance == s) %>% pull(experiment_id) %>% unique()
df = d_f_comptox_sublethal$obs_phenotypes %>% ungroup() %>%
filter(experiment_id %in% s_exps) %>%
filter(age_embryo_exposure_start_hpf == start_hpf) %>%
filter(age_embryo_observation_hpf %in% time_points) %>%
filter(test_substance == s) %>%
filter(!effect %in% c('miscellaneous', 'hours post fertilization', 'beats per minute', 'no embryo'))
if (nrow(df) == 0) {
message('Error: Exposure scenario not measured')
return(NULL)
}
df$concentration = apply(df, 1, function(row) {
unit = row[['unit']]
conc = as.numeric(row[['concentration']])
if (unit == 'mg/L') {
# mg/L is converted to umol/L
mg_to_umol(conc_mg = conc, molweight = weight)
} else {
# umol/L and % values remain unchanged
conc
}
})
# all units are changed accordingly, only Methanol keeps the % concentration
df$unit = 'umol/L'
# getting ec50 and lc50 values
df_conditions = expand_grid(
s = s,
start = start_hpf,
end = time_points,
type = c('lethal', 'sublethal')
)
ec_lc_df = mapply(
function(s, t1, t2, type) {
n = set_model_name(t1, t2)
m = models_all[[s]][[type]][[n]]
lvl = case_when(
type == 'lethal' ~ 50,
type == 'sublethal' ~ 50
)
col = case_when(
type == 'lethal' ~ 'LC50',
type == 'sublethal' ~ 'EC50',
)
if (class(m) == 'drc') {
ec = ED(m, lvl, display = F)
return(
data.frame(
test_substance = s,
start_hpf = t1,
age_embryo_observation_hpf = t2,
type = type,
effect_level = lvl,
ec = ec[1],
ec_ste = ec[2],
colour = col
)
)
} else {
return(NULL)
}
},
s = as.character(df_conditions$s),
t1 = as.character(df_conditions$start),
t2 = as.character(df_conditions$end),
type = df_conditions$type,
SIMPLIFY = F
) %>% rbindlist()
# these effect will be plotted, others occur seldomly and are omitted
selected_effects = c(
'normal',
'coagulated',
'no hatching',
# 'early hatching',
'no blood circulation',
'blood congestion',
'abnormal heart',
'no heartbeat',
'decreased heartbeat',
# 'increased heartbeat',
'deformation yolk sac',
'pericardium edema',
'yolk sac edema',
# 'increased pigmentation',
'low pigmentation',
'no pigmentation',
# 'scoliosis',
'modified structure of chorda',
'abnormal tail',
'lack of somites',
'abnormal tail tip',
'abnormal tail length',
'no tail detachment',
# 'abnormal tail fin',
'deformation head',
'smaller head',
'abnormal eye size',
# 'no eye',
# 'malformation sacculi / otoliths',
# 'no spontaneous tail contraction',
# 'no movement after hatching',
'shivering',
# 'tremor',
# 'no swim bladder',
'developmental delay'
# 'necrosis'
)
df = df %>% filter(effect %in% selected_effects)
# a data frame containing effects and timepoints at which they cannot occur. This will be marked in the plot.
timepoints_prohibited = effect_list_info %>%
dplyr::select(effect_label, timepoint_start, timepoint_end) %>%
rename(effect = effect_label) %>%
pmap(possibly(function(effect, timepoint_start, timepoint_end) {
data.frame(effect = effect,
age_embryo_observation_hpf = time_points[time_points < timepoint_start |
time_points > timepoint_end])
})) %>% rbindlist()
# one effect with a long name is shortened
selected_effects[selected_effects == 'modified structure of chorda'] = 'mod. structure of chorda'
df$effect[df$effect == 'modified structure of chorda'] = 'mod. structure of chorda'
timepoints_prohibited[timepoints_prohibited$effect == 'modified structure of chorda'] = 'mod. structure of chorda'
p_fp = df %>%
group_by(concentration, age_embryo_observation_hpf) %>%
mutate(n_embryos_in_dg = nrow(unique(pick(experiment_id, container, position_x, position_y)))) %>%
group_by(concentration, age_embryo_observation_hpf, effect) %>%
reframe(perc_in_dg = n() / n_embryos_in_dg) %>%
unique() %>%
group_by(effect) %>%
mutate(mean_prevalence = mean(perc_in_dg)) %>%
# filter(mean_prevalence > 0.1) %>%
filter(concentration != 0) %>%
ggplot(aes(
y = factor(effect, levels = rev(selected_effects)),
x = concentration,
colour = perc_in_dg
)) +
geom_point(shape = 'square', size = 2.5, alpha = 0.8) +
geom_hline(
data = timepoints_prohibited,
aes(yintercept = effect),
colour = grey_mid,
linewidth = 1
) +
scale_x_log10(
guide = 'axis_logticks'
) +
labs(
x = 'Concentration (µmol/L)'
) +
scale_y_discrete(
limits = rev(selected_effects)
) +
scale_colour_gradient(
name = 'Percentage of embryos\nexhibiting effect',
low = '#ffefff', high = '#660066',
limits = c(0, 1),
breaks = c(0, 0.5, 1),
guide = guide_colorbar(direction = 'horizontal')
) +
facet_wrap(vars(age_embryo_observation_hpf), ncol = length(time_points)) +
theme_time_conc +
theme(
axis.text.x = element_text(angle = 90, hjust = 1, vjust = 0.5),
strip.text = element_blank(),
legend.position = 'right',
legend.title.position = 'top',
legend.key.width = unit(0.5, 'cm'),
plot.margin = unit(c(-3, 4.5, 4.5, 4.5), 'points')
)
p_conc = df %>%
dplyr::select(
test_substance,
experiment_id,
container,
position_x,
position_y,
concentration,
age_embryo_observation_hpf
) %>%
# filter(mean_prevalence > 0.1) %>%
filter(concentration != 0) %>%
unique() %>%
ggplot(aes(
y = test_substance,
x = concentration
)) +
geom_violin(linewidth = 0.2, fill = grey_light, colour = grey_dark, alpha = 0.5) +
# geom_density_ridges(
# linewidth = 0.2,
# fill = grey_light,
# colour = grey_dark,
# alpha = 0.5,
# panel_scaling = T
# ) +
geom_point(
data = ec_lc_df,
mapping = aes(x = ec, y = test_substance, colour = colour),
size = 0.8,
inherit.aes = F
) +
geom_errorbarh(
data = ec_lc_df,
mapping = aes(xmin = ec-ec_ste, xmax = ec+ec_ste, y = test_substance, colour = colour),
height = 0.6,
inherit.aes = F
) +
scale_colour_manual(
name = NULL,
values = c(
'EC50' = 'grey60',
'LC50' = 'grey40'
)
) +
scale_x_log10() +
scale_y_discrete(
expand = expansion(add = 1)
) +
facet_wrap(
vars(age_embryo_observation_hpf), ncol = length(time_points),
labeller = function(df) {
data.frame(labels = sprintf('%i-%i hpf', start_hpf, as.numeric(df$age_embryo_observation_hpf)))
}
) +
theme_time_conc +
theme(
# axis.text.x = element_text(angle = 90, size = 8, hjust = 1, vjust = 0.5),
axis.text.x = element_blank(),
axis.title.x = element_blank(),
axis.ticks.x = element_blank(),
axis.ticks.y = element_blank(),
axis.text.y = element_blank(),
legend.position = 'right',
plot.margin = unit(c(4.5, 4.5, 4.5, 4.5), 'points')
)
if (return_legend) {
p_legend = egg::ggarrange(
ggpubr::get_legend(p_conc) %>% as_ggplot(),
ggpubr::get_legend(p_fp) %>% as_ggplot(),
ncol = 2,
draw = F,
debug = F
)
return(p_legend)
} else {
p_time_conc = egg::ggarrange(
p_conc + theme(legend.position = 'none'),
p_fp + theme(legend.position = 'none'),
ncol = 1,
heights = c(1, 12),
draw = F,
debug = F
)
return(p_time_conc)
}
}
ggsave(
'results/figures/effect_propagation/time_conc_fig_legend.svg',
plot = figure_time_conc('Bisphenol A', 0, text_size = 7, return_legend = T),
device = 'svg',
width = 6,
height = 2.5,
units = 'cm',
bg = 'white'
)
substances = c(
'Bisphenol A',
'Benzovindiflupyr',
'6-Propyl-2-thiouracil',
'Triclosan',
'Ferbam',
'N,N-Dimethyldecanamide',
'Azinphos-methyl',
'Gemfibrozil',
'Naproxen sodium'
)
for (s in substances) {
ggsave(
sprintf('results/figures/effect_propagation/time_conc_fig_%s_0hpf.svg', s),
plot = figure_time_conc(s, 0),
device = 'svg',
width = 9,
height = 9,
units = 'cm',
bg = 'white'
)
}
This document was created with the following command:
rmarkdown::render(
input = 'intob_analysis.Rmd',
output_format = 'html_document',
output_file = sprintf(
'%s_intob_analysis.html',
format(Sys.time(), '%y%m%d')
)
)
R version 4.3.0 (2023-04-21)
Platform: x86_64-pc-linux-gnu (64-bit)
attached base packages:
other attached packages:
loaded via a namespace (and not attached):