Information

This file serves as supplementary methods to the publication Michaelis et al. (2024): “Leveraging zebrafish embryo phenotypic observations to advance data-driven analyses in toxicology”. It contains all code needed to conduct the analysis as outlined in the paper as well as to generate the figures.

Authors: Paul Michaelis, Nils Klüver*, Silke Aulhorn, Hannes Bohring, Jan Bumberger, Kristina Haase, Tobias Kuhnert, Eberhard Küster, Janet Krüger, Till Luckenbach, Riccardo Massei, Lukas Nerlich, Sven Petruschke, Thomas Schnicke, Anton Schnurpel, Stefan Scholz, Nicole Schweiger, Daniel Sielaff, Wibke Busch*

* corresponding authors (, )

Data import

INTOB data

Data is read into the environment. This includes:

  • the data from the INTOB database with all experiments entered before 07.03.2024 and downloaded via the .csv download option in the frontend
  • a library of phenotypic effects that was created by manual curation of publications
  • a table from https://doi.org/10.5281/zenodo.10071824, which contains calculated baseline toxicities for substances
# data from .csv downloads
data_path = file.path('data', 'intob_data')

effect_list_info = read_csv(file.path(data_path, 'effect_list_info.csv'), show_col_types = F)
# a table of all effects, ordered roughly by category. To be used in plots to order effects.
col_ordering = read_tsv('data/data_for_analysis/ordering_effects.tsv', show_col_types = F)

d = list(
  metadata = read_csv(file.path(data_path, 'metadata.csv'), show_col_types = F),
  obs_phenotypes = read_csv(file.path(data_path, 'obs_phenotypes.csv'), show_col_types = F),
  phys_chem_prop = read_csv(file.path(data_path, 'phys_chem_prop.csv'), show_col_types = F)
)
# only keep experiments that have observations
exp_obs = unique(d$obs_phenotypes$experiment_id)
d = lapply(d, function(x) {x %>% filter(experiment_id %in% exp_obs)})
# getting the plate layout by making a graphql request
token_file = file.path('api', 'token')
token = readChar(token_file, file.info(token_file)$size-1)

# data for plate layout
request_plate_layout = 'query {
    experiments (date_lte:"2024-03-07T23:59:59+02:00", itemsPerPage:1000) {
        collection {
            _id
            date
            containers {
                collection {
                    id
                    name
                    height
                    width
                    cells {
                        collection {
                            x
                            y
                            marker {
                                name
                                concentration
                unit
                                substance {
                                    name
                                }
                            }
                        }
                    }
                }
            }
        }
    }
}'
raw_data = query_intob(request_plate_layout, token = token)
saveRDS(raw_data, 'data/graphql/plate_layout_raw_data.rds')
raw_data_plates = readRDS(file.path('data', 'graphql', 'plate_layout_raw_data.rds'))
plate_layout = raw_data_plates$data$experiments$collection %>% 
  unnest(containers) %>% unnest(collection) %>% 
  rename(container_name = name) %>% 
  unnest(cells) %>% cast_df('collection') %>% unnest(collection) %>% 
  unnest(marker) %>% 
  rename(marker_name = name) %>% 
  unnest(substance) %>% 
  rename(
    experiment_id = `_id`,
    container_id = id,
    substance_name = name
  ) %>% 
  add_column(container_type = .$height * .$width)

write_csv(plate_layout, 'data/data_for_publication/data/intob_data/plate_layout.csv')

# adding a column that contains the number of wells that are filled in a plate. If this number is different from container_type, the plate has not been filled completely
plate_layout = plate_layout %>% 
  group_by(experiment_id, container_id) %>% 
  mutate(plate_fill = n()) %>% 
  ungroup()

plate_layout = plate_layout %>% filter(experiment_id %in% d$metadata$experiment_id)

plate_layout$empty_wells = plate_layout$container_type - plate_layout$plate_fill

d$plate_layout = plate_layout

rm(raw_data_plates, plate_layout)

The observation table gets a new effect column

d$obs_phenotypes$effect = d$obs_phenotypes$detailed_effect
na_effects = is.na(d$obs_phenotypes$effect)
d$obs_phenotypes$effect[na_effects] = d$obs_phenotypes$parent_effect[na_effects]

d$obs_phenotypes = d$obs_phenotypes %>% 
  group_by(
    experiment_id,
    container,
    age_embryo_exposure_start_hpf,
    age_embryo_observation_hpf
  ) %>% 
  # mutate(container_type_from_data = max(position_x) * max(position_y)) %>% 
  mutate(n_effects_in_plate = n_distinct(position_x, position_y))

d$obs_phenotypes = d$obs_phenotypes %>% 
  mutate(container = as.character(container)) %>%
  left_join(
  d$plate_layout[, c(
    'experiment_id',
    'x',
    'y',
    'container_name',
    'container_type',
    'plate_fill',
    'empty_wells'
  )],
  by = c(
    'experiment_id' = 'experiment_id',
    'position_x' = 'x',
    'position_y' = 'y',
    'container' = 'container_name'
  )
)

# some substance names are very long, which is difficult to deal with in
# figures. They are changed to shorter synonyms
names_to_change = list(
  "N-(1,3-Dimethylbutyl)-N'-phenyl-p-phenylenediamine" = '6-PPDQ',
  "Ethyl 3-aminobenzoate methanesulfonic acid salt" = 'Tricaine mesylate',
  "Tris(1,3-dichloro-2-propyl) phosphate" = 'TDCPP'
)
for (n in names(names_to_change)) {
  d$obs_phenotypes$test_substance[d$obs_phenotypes$test_substance == n] = names_to_change[[n]]
  d$metadata$test_substance[d$metadata$test_substance == n] = names_to_change[[n]]
  d$phys_chem_prop$`Compound Name`[d$phys_chem_prop$`Compound Name` == n] = names_to_change[[n]]
  d$plate_layout$marker_name[d$plate_layout$marker_name == n] = names_to_change[[n]]
  d$plate_layout$substance_name[d$plate_layout$substance_name == n] = names_to_change[[n]]
}

Sometimes, experimenters only record adverse effects but not normal embryos. In these cases some embryos have no entry in the obs_phenotypes table.

exp_incomplete = d$obs_phenotypes %>%
  filter(n_effects_in_plate != plate_fill) %>% 
  group_by(experiment_id) %>% 
  filter(!'normal' %in% effect) %>% 
  pull(experiment_id) %>% 
  unique()

For these experiments, the normal effects are infered in the data. This is done by comparing the plate layout with the recorded effects, any embryo that exists in the plate but has no recorded effect is assumed to be normal.

infer_normal = function(exp_obs, exp_plate) {
  exp_obs = exp_obs %>% ungroup()
  
  exp_obs_for_diff = exp_obs %>%
    dplyr::select(
      position_x,
      position_y,
      container,
      concentration,
      test_substance,
      age_embryo_observation_hpf
    )
  
  exp_plate = exp_plate %>%
    rename(
      position_x = x,
      position_y = y,
      container = container_name,
      concentration = concentration,
      test_substance = substance_name
    ) %>%
    dplyr::select(position_x,
           position_y,
           container,
           concentration,
           test_substance)
  
  infered = lapply(unique(exp_obs$age_embryo_observation_hpf), function(t) {
    exp_obs_t_diff = exp_obs_for_diff %>% filter(age_embryo_observation_hpf == t)
    exp_obs_t_diff$age_embryo_observation_hpf = NULL
    # wells that have no recorded effects
    no_effect = setdiff(exp_plate, exp_obs_t_diff)
    # joining the recorded effects with wells that have not been recorded
    infered = full_join(
      exp_obs[exp_obs$age_embryo_observation_hpf == t,],
      no_effect,
      by = c(
        'position_x',
        'position_y',
        'container',
        'concentration',
        'test_substance'
      )
    ) %>%
      # some values are repeated and can be filled from the exisiting data
      fill(
        experiment_id,
        age_embryo_exposure_start_hpf,
        age_embryo_observation_hpf,
        observation_type,
        unit,
        container_type,
        plate_fill,
        .direction = 'down'
      ) %>%
      # the columns looking at plate fill and how many effects were recorded per plate are recomputed
      group_by(
        experiment_id,
        container,
        age_embryo_exposure_start_hpf,
        age_embryo_observation_hpf
      ) %>%
      mutate(n_effects_in_plate = n_distinct(position_x, position_y)) %>%
      mutate(empty_wells = container_type - plate_fill) %>%
      ungroup()
    # the infered normal effects are entered
    infered$parent_effect[is.na(infered$parent_effect)] = 'normal'
    infered$effect[is.na(infered$effect)] = 'normal'
    
    infered
  }) %>%
    do.call(rbind, .)
}

infered_exps = lapply(exp_incomplete, function(id) {
  exp_obs_original = d$obs_phenotypes %>% filter(experiment_id == id)
  exp_plate = d$plate_layout %>% filter(experiment_id == id)
  infer_normal(exp_obs_original, exp_plate)
}) %>% data.table::rbindlist()

d$obs_phenotypes = d$obs_phenotypes %>% filter(!experiment_id %in% exp_incomplete)
d$obs_phenotypes = rbind(d$obs_phenotypes, infered_exps)

A effects summary table is made from the observations. Effects within each observation are summarised into categories:

  • lethal: ‘coagulated’ is among the listed effects
  • normal: ‘normal’ is among the listed effects
  • sublethal: ‘normal’ is not among the listed effects
# exposure starts at 10 hpf or earlier are grouped together. The original exposure starts can still be found in the metadata table if needed
d$obs_phenotypes$age_embryo_exposure_start_hpf[d$obs_phenotypes$age_embryo_exposure_start_hpf <= 10] = 0

d$effects_summary = d$obs_phenotypes %>% 
  group_by(
    experiment_id,
    age_embryo_exposure_start_hpf,
    age_embryo_observation_hpf,
    position_x,
    position_y,
    container,
    concentration,
    unit,
    test_substance,
    container_type
  ) %>% 
  summarise(
    lethal = 'coagulated' %in% effect,
    normal = 'normal' %in% effect,
    sublethal = !normal,
    .groups = 'drop'
  )

Zotero data

# zotero library
zotero_lib = fromJSON(file.path('data', 'zotero_phenotype_library', '20240903_Zotero_library.json'))
zotero_effects = read_tsv(file.path('data', 'data_for_analysis', 'zotero_effects.tsv'),
                          show_col_types = F)

process_zotero = function(zotero_lib) {
  zotero_lib = zotero_lib %>% 
    unnest(author) %>% 
    unnest(issued)
  zotero_lib = zotero_lib %>% 
    rename(
      effect_abb = archive,
      age_hpf = archive_location,
      test_substance = `call-number`,
      fish_strain = `collection-title`
    ) %>% 
    # for mixtures, substances are seprated with semicolons, they are filtered out here
    filter(!str_detect(test_substance, ';')) %>% 
    # some substances contain a dtxsid besides their name. This information is
    # split into two columns, with NA for missing dtxsids
    separate_wider_delim(
      cols = 'test_substance',
      delim = '|',
      names = c('test_substance', 'dtxsid'),
      too_few = 'align_start',
      too_many = 'merge'
    ) %>% 
    # put all effects and time points in separate rows
    # separate_longer_delim(cols = matches('age_hpf'), delim = ', ') %>% 
    separate_longer_delim(cols = matches('effect_abb'), delim = ', ') %>% 
    left_join(zotero_effects, by = 'effect_abb') %>% 
    relocate(
      effect_abb, effect_long, age_hpf, test_substance, dtxsid, fish_strain,
      .before = 1
    ) %>% 
    unite(author, matches('family|given'), sep = ', ') %>% 
    group_by(id) %>% 
    mutate(
      author = paste(unique(author), collapse = '; ')
    )
  zotero_lib$year = zotero_lib$`date-parts` %>% lapply(function(l) {l[[1]]}) %>% as.numeric()
  zotero_lib$first_author = lapply(zotero_lib$author, function(l) {str_extract(l, '^\\w+')}) %>% as.character()
  zotero_lib$study_short = paste0(zotero_lib$first_author, ' et al. ', as.character(zotero_lib$year))
  zotero_lib$`date-parts` = NULL
  zotero_lib$accessed = NULL
  zotero_lib$`non-dropping-particle` = NULL
  zotero_lib$`dropping-particle` = NULL
  zotero_lib = unique(zotero_lib)
  zotero_lib %>% ungroup()
}

zotero_lib = process_zotero(zotero_lib)

zotero_substances = zotero_lib %>% ungroup() %>% dplyr::select(test_substance) %>% unique()

write_csv(zotero_lib, 'data/zotero_phenotype_library/phenotype_library.csv')

Data cleanup

If experiments have n unique recorded effects, which effects are those?

d$effects_per_exp_long = d$obs_phenotypes %>% 
  left_join(
    unique(
      d$metadata %>% dplyr::select(
        experiment_id,
        quality_flag
      )
    ), 
    by = 'experiment_id'
  ) %>% 
  group_by(experiment_id) %>% 
  mutate(
    n_unique_effects = n_distinct(effect)
  )

The dataset contains test experiments with no biological meaning. Typically they contain little data. It is probably safe to assume that any experiment with only one condition (conditions = unique combinations of exp. start, observation time point, concentration and substance) can be filtered out. Some experiments with only two conditions seem valid, though (e.g. exp id 428).

Experiments that:

  • have fewer than 15 embryos or
  • only 1 condition or

are filtered out.

d$effects_per_exp = d$obs_phenotypes %>% 
  left_join(
    unique(d$metadata %>% dplyr::select(experiment_id, quality_flag)),
    by = 'experiment_id'
  ) %>% 
  group_by(experiment_id, quality_flag) %>% 
  reframe(
    n_unique_effects_parent = n_distinct(parent_effect),
    n_unique_effects_child = n_distinct(detailed_effect),
    n_unique_effects = n_distinct(effect),
    n_conditions = pick(
      age_embryo_exposure_start_hpf,
      age_embryo_observation_hpf,
      concentration,
      test_substance
    ) %>% unique() %>% nrow(),
    n_embryos = pick(
      age_embryo_exposure_start_hpf,
      position_x,
      position_y,
      container
    ) %>% unique() %>% nrow(),
    n_observations = pick(
      age_embryo_exposure_start_hpf,
      position_x,
      position_y,
      container,
      age_embryo_observation_hpf,
      concentration,
      test_substance
    ) %>% unique() %>% nrow(),
    n_effects = n()
  ) %>% 
  add_column(
    observations_per_embryo = .$n_observations / .$n_embryos
  )
embryo_cutoff = 15
n_cond_cutoff = 2

exp_ids_filter = d$effects_per_exp %>%
  filter(
    n_embryos < embryo_cutoff |
    n_conditions < n_cond_cutoff
  )
data.frame(
  description = c('n_experiments', 'n_embryos'),
  before_filtering = c(
    n_distinct(d$effects_per_exp$experiment_id),
    sum(d$effects_per_exp$n_embryos)
  ),
  after_filtering =
    c(
      n_distinct(d$effects_per_exp[!d$effects_per_exp$experiment_id %in% exp_ids_filter$experiment_id,]$experiment_id),
      sum(d$effects_per_exp[!d$effects_per_exp$experiment_id %in% exp_ids_filter$experiment_id,]$n_embryos)
    ),
  difference =
    c(
      n_distinct(d$effects_per_exp[d$effects_per_exp$experiment_id %in% exp_ids_filter$experiment_id,]$experiment_id),
      sum(d$effects_per_exp[d$effects_per_exp$experiment_id %in% exp_ids_filter$experiment_id,]$n_embryos)
    )
) %>%
  add_column(perc_difference = .$difference / .$before_filtering)
d_f = list(
  obs_phenotypes = d$obs_phenotypes %>% filter(!experiment_id %in% exp_ids_filter$experiment_id),
  effects_per_exp = d$effects_per_exp %>% filter(!experiment_id %in% exp_ids_filter$experiment_id),
  effects_per_exp_long = d$effects_per_exp_long %>% filter(!experiment_id %in% exp_ids_filter$experiment_id),
  effects_summary = d$effects_summary %>% filter(!experiment_id %in% exp_ids_filter$experiment_id),
  metadata = d$metadata %>% filter(!experiment_id %in% exp_ids_filter$experiment_id),
  phys_chem_prop = d$phys_chem_prop %>% filter(!experiment_id %in% exp_ids_filter$experiment_id),
  plate_layout = d$plate_layout %>% filter(!experiment_id %in% exp_ids_filter$experiment_id)
)

A closer look at control variability

Untreated embryos naturally have lethal or sublethal effects at a certain probability. If embryos have too much variability this distorts the results of the treated embryos. What is the probability an embryo has an effect and how can experiments be assessed for adequate embryo quality?

To test for experiments that have insufficient embryo quality, a binomial test is used. To get a probability of an embryo being normal the proportion of normal embryos is calculated at the last observations in each respective experiment across all experiments. This is assumed to be the probability of a healthy embryo not exhibiting lethal or sublethal effects by chance. For each experiment a binomial test is performed, testing whether the proportion of normal embryos could be resulting from a binomial distribution with p=0.936 (H0) or p<0.936 (H1). Since significant results from this test are experiments that exhibit insufficient embryo quality, so we are interested in insignificant results (do not accept H1) from this test for further analysis.

c0_norm_not_norm = d_f$obs_phenotypes %>% 
  filter(concentration == 0) %>% 
  filter(parent_effect != 'heartbeat') %>% 
  group_by(experiment_id, position_x, position_y, container) %>% 
  filter(age_embryo_observation_hpf == max(age_embryo_observation_hpf)) %>%
  summarise(
    normal = 'normal' %in% effect,
    coagulated = 'coagulated' %in% effect,
    sublethal = !'normal' %in% effect & !'coagulated' %in% effect,
    total = !'normal' %in% effect,
    .groups = 'drop'
  )

# proportion of normal embryos in the complete dataset
p_normal = sum(c0_norm_not_norm$normal) / nrow(c0_norm_not_norm)
p_normal
## [1] 0.9361913
p_sublethal = sum(c0_norm_not_norm$total) / nrow(c0_norm_not_norm)
p_sublethal
## [1] 0.06380866
p_lethal = sum(c0_norm_not_norm$coagulated) / nrow(c0_norm_not_norm)
p_lethal
## [1] 0.04106498
# aggregate by experiment
c0_aggregated = c0_norm_not_norm %>% group_by(experiment_id) %>% 
  reframe(
    n_normal = sum(normal),
    n_coagulated = sum(coagulated),
    n_sublethal = sum(sublethal),
    n_total = sum(total),
    ratio_normal = sum(normal) / n(),
    ratio_coagulated = sum(coagulated) / n(),
    ratio_sublethal = sum(sublethal) / n(),
    n_embryos = n()
  ) %>% 
  # filter(n > 20) %>% 
  left_join(unique(d_f$metadata[, c('experiment_id', 'experiment_date', 'quality_flag')]), by = 'experiment_id')

c0_aggregated$binom_test_p_1s = mapply(
  function(x, n, p) {
    binom.test(x, n, p, alternative = 'less')$p.value
  },
  x = c0_aggregated$n_normal,
  n = c0_aggregated$n_embryos,
  p = p_normal
)
c0_aggregated$binom_test_adjp_1s = p.adjust(c0_aggregated$binom_test_p_1s, method = 'BH')
c0_aggregated$pass_binom = c0_aggregated$binom_test_p_1s > 0.05

message('Number of experiments at this point:')
## Number of experiments at this point:
c0_aggregated$experiment_id %>% n_distinct()
## [1] 608
message('Number of experiments if experiments with fewer than 90% normal control embryos are removed:')
## Number of experiments if experiments with fewer than 90% normal control embryos are removed:
c0_aggregated %>% filter(ratio_normal >= 0.9) %>% nrow()
## [1] 454
message('Number of experiments if experiments that don\'t pass the binomial test (p < 0.05) are removed:')
## Number of experiments if experiments that don't pass the binomial test (p < 0.05) are removed:
c0_aggregated %>% filter(binom_test_p_1s > 0.05) %>% nrow()
## [1] 561

Supplemenatry figure: How does p-value adjustment influence the results?

theme_control_var = theme(
  axis.title = element_text(size = 8),
  legend.title = element_text(size = 8),
  plot.margin = unit(c(0.6, 0.6, 0.6, 0.6), 'lines')
)

p_controls_padj = c0_aggregated %>% ggplot(aes(
  x = n_embryos,
  y = ratio_normal,
  colour = binom_test_adjp_1s < 0.05 | n_embryos < 9
  # size = n_embryos
)) +
  geom_vline(xintercept = 8.5, linewidth = 0.2, colour = 'grey60') +
  geom_point(alpha = 0.5, shape = 19) +
  scale_colour_manual(name = 'padj < 0.05 | n < 9', values = c('forestgreen', 'firebrick3')) +
  # scale_size(name = 'Number of control embryos', breaks = c(5, 10, 25, 50, 100), range = c(0.5, 6)) +
  guides(colour = guide_legend(order = 2), size = guide_legend(order = 1)) +
  labs(
    y = 'Ratio of normal embryos',
    x = 'Number of control embryos'
  ) +
  theme_control_var +
  theme(
    legend.position = 'bottom',
    legend.box = 'vertical'
  )

p_controls_p = c0_aggregated %>% ggplot(aes(
  x = n_embryos,
  y = ratio_normal,
  colour = binom_test_p_1s < 0.05 | n_embryos < 9
  # size = n_embryos
)) +
  geom_vline(xintercept = 8.5, linewidth = 0.2, colour = 'grey60') +
  geom_point(alpha = 0.5, shape = 19) +
  scale_colour_manual(name = 'p < 0.05 | n < 9', values = c('forestgreen', 'firebrick3')) +
  # scale_size(name = 'Number of control embryos', breaks = c(5, 10, 25, 50, 100), range = c(0.5, 6)) +
  guides(colour = guide_legend(order = 2), size = guide_legend(order = 1)) +
  labs(
    y = 'Ratio of normal embryos',
    x = 'Number of control embryos'
  ) +
  theme_control_var +
  theme(
    legend.position = 'bottom',
    legend.box = 'vertical'
  )

p_controls = egg::ggarrange(
  p_controls_p, p_controls_padj,
  ncol = 2,
  labels = letters[1:2],
  label.args = subfigure_label_args,
  draw = F
)

ggsave(
  'results/figures/control_variability/p_p_padj_comparison.svg',
  plot = p_controls,
  device = 'svg',
  width = 15,
  height = 8,
  units = 'cm',
  bg = 'white'
)

Experiments that don’t pass the binomial test for control variability are filtered out of the dataset.

exp_fail_binom = c0_aggregated$experiment_id[!c0_aggregated$pass_binom |
                                               c0_aggregated$n_embryos < 9]

data.frame(
  description = c('n_experiments', 'n_embryos'),
  before_filtering = c(
    n_distinct(d_f$effects_per_exp$experiment_id),
    sum(d_f$effects_per_exp$n_embryos)
  ),
  after_filtering =
    c(
      n_distinct(d_f$effects_per_exp[!d_f$effects_per_exp$experiment_id %in% exp_fail_binom,]$experiment_id),
      sum(d_f$effects_per_exp[!d_f$effects_per_exp$experiment_id %in% exp_fail_binom,]$n_embryos)
    ),
  difference =
    c(
      n_distinct(d_f$effects_per_exp[d_f$effects_per_exp$experiment_id %in% exp_fail_binom,]$experiment_id),
      sum(d_f$effects_per_exp[d_f$effects_per_exp$experiment_id %in% exp_fail_binom,]$n_embryos)
    )
) %>%
  add_column(perc_difference = .$difference / .$before_filtering)
d_f_c = list(
  obs_phenotypes = d_f$obs_phenotypes %>% filter(!experiment_id %in% exp_fail_binom),
  effects_per_exp = d$effects_per_exp %>% filter(!experiment_id %in% exp_fail_binom),
  effects_per_exp_long = d$effects_per_exp_long %>% filter(!experiment_id %in% exp_fail_binom),
  effects_summary = d_f$effects_summary %>% filter(!experiment_id %in% exp_fail_binom),
  metadata = d_f$metadata %>% filter(!experiment_id %in% exp_fail_binom),
  phys_chem_prop = d_f$phys_chem_prop %>% filter(!experiment_id %in% exp_fail_binom),
  plate_layout = d_f$plate_layout %>% filter(!experiment_id %in% exp_fail_binom)
)

Experiments with mixtures or environmental samples are separated from experiments with single (comptox) substances.

d_f_c$metadata %>%
  dplyr::select(experiment_id, category) %>%
  filter(category != 'control') %>% 
  unique() %>%
  group_by(category) %>% 
  summarise(n_exp = n_distinct(experiment_id))
other_types = c('mixture', 'sample')

exp_comptox = d_f_c$metadata %>% 
  group_by(experiment_id) %>% 
  filter(!any(other_types %in% category)) %>% 
  pull(experiment_id) %>% unique()

exp_mix_env_other = d_f_c$metadata %>% 
  filter(category %in% c('mixture', 'sample', 'other')) %>% 
  pull(experiment_id) %>% unique()

d_f_comptox = list(
  obs_phenotypes = d_f_c$obs_phenotypes %>% filter(experiment_id %in% exp_comptox),
  effects_per_exp = d$effects_per_exp %>% filter(experiment_id %in% exp_comptox),
  effects_per_exp_long = d$effects_per_exp_long %>% filter(experiment_id %in% exp_comptox),
  effects_summary = d_f_c$effects_summary %>% filter(experiment_id %in% exp_comptox),
  metadata = d_f_c$metadata %>% filter(experiment_id %in% exp_comptox),
  phys_chem_prop = d_f_c$phys_chem_prop %>% filter(experiment_id %in% exp_comptox),
  plate_layout = d_f_c$plate_layout %>% filter(experiment_id %in% exp_comptox)
)

d_f_mix_env_other = list(
  obs_phenotypes = d_f_c$obs_phenotypes %>% filter(experiment_id %in% exp_mix_env_other),
  effects_per_exp = d$effects_per_exp %>% filter(experiment_id %in% exp_mix_env_other),
  effects_per_exp_long = d$effects_per_exp_long %>% filter(experiment_id %in% exp_mix_env_other),
  effects_summary = d_f_c$effects_summary %>% filter(experiment_id %in% exp_mix_env_other),
  metadata = d_f_c$metadata %>% filter(experiment_id %in% exp_mix_env_other),
  phys_chem_prop = d_f_c$phys_chem_prop %>% filter(experiment_id %in% exp_mix_env_other),
  plate_layout = d_f_c$plate_layout %>% filter(experiment_id %in% exp_mix_env_other)
)

Effects summary table for modelling

The controls are usually called ‘Control 1’ in the test_substance column. For modelling it is easier to have the controls also labelled with the actual test substance. This also means that when multiple substances were measured in a single experiment with shared controls, these have to be repeated. Finally, modelling will only be done for comptox substances, but not mixtures or environmental samples.

comptox_substances = d_f_comptox$metadata %>% filter(category %in% c('comptox')) %>% 
  dplyr::select(test_substance, dtxsid, `MolWeight_g/mol`) %>% unique()

# a new summary table only to be used for modelling, as this will contain repeated values
d_f_comptox$effects_summary_modelling = d_f_comptox$effects_summary %>% 
  multiply_controls() %>% 
  inner_join(comptox_substances, by = c('compound_new' = 'test_substance'))

# converting all mg/L concentrations to umol/L; some concentrations are in %, they will not be changed
d_f_comptox$effects_summary_modelling$concentration = apply(d_f_comptox$effects_summary_modelling, 1, function(row) {
  unit = row[['unit']]
  conc = as.numeric(row[['concentration']])
  molweight = as.numeric(row[['MolWeight_g/mol']])
  if (unit == 'mg/L') {
    # mg/L is converted to umol/L
    mg_to_umol(conc_mg = conc, molweight = molweight)
  } else {
    # umol/L and % values remain unchanged
    conc
  }
})
# all units are changed accordingly, only Methanol keeps the % concentration
d_f_comptox$effects_summary_modelling$unit[d_f_comptox$effects_summary_modelling$compound_new != 'Methanol'] = 'umol/L'

substance_data_overview = d_f_comptox$effects_summary_modelling %>% 
  filter(compound_new %in% comptox_substances$test_substance) %>% 
  group_by(
    compound_new,
    age_embryo_exposure_start_hpf,
    age_embryo_observation_hpf
  ) %>% 
  summarise(
    n_exp = n_distinct(experiment_id),
    n_conc = n_distinct(concentration),
    n_embryos = pick(
      experiment_id,
      position_x,
      position_y,
      container
    ) %>% unique() %>% nrow(),
    .groups = 'drop'
  )

As some experimenters only record lethality (i.e. only normal or coagulated), not all experiments can be used for modelling of sublethal drcs. The comptox data is further split into a dataset that can be used for lethal drcs (all data) and a dataset that can be used for sublethal drcs (only experiments that record > 2 unique effects).

exp_2_effects = d_f_comptox$effects_per_exp %>%
  filter(n_unique_effects < 3) %>% 
  pull(experiment_id)

data.frame(
  description = c('n_experiments', 'n_embryos'),
  before_filtering = c(
    n_distinct(d_f_comptox$effects_per_exp$experiment_id),
    sum(d_f_comptox$effects_per_exp$n_embryos)
  ),
  after_filtering =
    c(
      n_distinct(d_f_comptox$effects_per_exp[!d_f_comptox$effects_per_exp$experiment_id %in% exp_2_effects,]$experiment_id),
      sum(d_f_comptox$effects_per_exp[!d_f_comptox$effects_per_exp$experiment_id %in% exp_2_effects,]$n_embryos)
    ),
  difference =
    c(
      n_distinct(d_f_comptox$effects_per_exp[d_f_comptox$effects_per_exp$experiment_id %in% exp_2_effects,]$experiment_id),
      sum(d_f_comptox$effects_per_exp[d_f_comptox$effects_per_exp$experiment_id %in% exp_2_effects,]$n_embryos)
    )
) %>%
  add_column(perc_difference = .$difference / .$before_filtering)
d_f_comptox_sublethal = list(
  obs_phenotypes = d_f_comptox$obs_phenotypes %>% filter(!experiment_id %in% exp_2_effects),
  effects_per_exp = d_f_comptox$effects_per_exp %>% filter(!experiment_id %in% exp_2_effects),
  effects_per_exp_long = d_f_comptox$effects_per_exp_long %>% filter(!experiment_id %in% exp_2_effects),
  effects_summary = d_f_comptox$effects_summary %>% filter(!experiment_id %in% exp_2_effects),
  effects_summary_modelling = d_f_comptox$effects_summary_modelling %>% filter(!experiment_id %in% exp_2_effects),
  metadata = d_f_comptox$metadata %>% filter(!experiment_id %in% exp_2_effects),
  phys_chem_prop = d_f_comptox$phys_chem_prop %>% filter(!experiment_id %in% exp_2_effects),
  plate_layout = d_f_comptox$plate_layout %>% filter(!experiment_id %in% exp_2_effects)
)
d_f_comptox$effects_summary_modelling %>%
  filter(age_embryo_exposure_start_hpf %in% c(0, 24) & age_embryo_observation_hpf %in% c(24, 48, 72, 96, 120)) %>%
  group_by(experiment_id, age_embryo_exposure_start_hpf, age_embryo_observation_hpf) %>% 
  reframe() %>% 
  dplyr::select(age_embryo_exposure_start_hpf, age_embryo_observation_hpf) %>% 
  table()
##                              age_embryo_observation_hpf
## age_embryo_exposure_start_hpf  24  48  72  96 120
##                            0  294 289 135 124  95
##                            24   9  59  51  56  26
exposure_ov_lethal = d_f_comptox$effects_summary_modelling %>% 
  group_by(
    age_embryo_exposure_start_hpf,
    age_embryo_observation_hpf
  ) %>% 
  summarise(
    n_exp = n_distinct(experiment_id),
    n_substances = n_distinct(compound_new),
    n_conc = n_distinct(concentration),
    n_embryos = pick(
      experiment_id,
      position_x,
      position_y,
      container
    ) %>% unique() %>% nrow(),
    .groups = 'drop'
  ) %>% 
  filter(
    age_embryo_exposure_start_hpf %in% c(0, 24) &
      age_embryo_observation_hpf %in% (24*1:5)
  )

exposure_ov_sublethal = d_f_comptox_sublethal$effects_summary_modelling %>% 
  group_by(
    age_embryo_exposure_start_hpf,
    age_embryo_observation_hpf
  ) %>% 
  summarise(
    n_exp = n_distinct(experiment_id),
    n_substances = n_distinct(compound_new),
    n_conc = n_distinct(concentration),
    n_embryos = pick(
      experiment_id,
      position_x,
      position_y,
      container
    ) %>% unique() %>% nrow(),
    .groups = 'drop'
  ) %>% 
  filter(
    age_embryo_exposure_start_hpf %in% c(0, 24) &
      age_embryo_observation_hpf %in% (24*1:5)
  )

Overview over substances and experimental designs

Figure for manuscript: General overview over INTOB and filtering of experiments with low embryo quality.

theme_intob_ov = theme(
  axis.title = element_text(size = 8),
  legend.title = element_text(size = 8),
  plot.margin = unit(c(0.6, 0.6, 0.6, 0.6), 'lines')
)

col_grey = 'grey50'

# embryos per substance barplot
p_bar_substances_n_embryos = d_f$plate_layout %>%
  filter(concentration != 0) %>% 
  group_by(substance_name) %>% 
  summarise(n_embryos = n()) %>% 
  arrange(desc(n_embryos)) %>% 
  slice_head(n = 10) %>% 
  ggplot(
    aes(y = factor(substance_name, levels = rev(substance_name)), x = n_embryos)
  ) +
  geom_bar(stat = 'identity', width = 0.55, fill = col_grey) +
  scale_x_continuous(breaks = c(0, 1000, 2000)) +
  labs(
    x = 'Embryos (n)'
  ) +
  theme_intob_ov +
  theme(
    axis.title.y = element_blank(),
    axis.text.y = element_text(size = 8, family = 'Roboto condensed')
  )

p_hist_embryos_per_exp = d_f$plate_layout %>%
  group_by(experiment_id) %>% 
  summarise(n_embryos = n()) %>% 
  ggplot(aes(x = n_embryos)) +
  geom_histogram(binwidth = 10, fill = col_grey) +
  labs(
    x = 'Embryos (n)',
    y = 'Experiments (n)'
  ) +
  theme_intob_ov

# overview over exposure designs, i.e. start hpf and observation hpf
p_exp_designs = d_f$obs_phenotypes %>%
  group_by(experiment_id) %>% 
  reframe(
    start_hpf = unique(age_embryo_exposure_start_hpf),
    obs_hpf = unique(age_embryo_observation_hpf)
  ) %>% 
  group_by(start_hpf, obs_hpf) %>% 
  summarise(n = n(), .groups = 'drop') %>% 
  filter(start_hpf %in% c(0, 24, 48)) %>%
  filter(obs_hpf %in% c(24, 48, 72, 96, 120)) %>%
  ggplot(
    aes(x = factor(obs_hpf), y = factor(start_hpf), label = n)
  ) +
  geom_point(aes(size = n), colour = 'grey15', alpha = 0.7, shape = 16) +
  geom_text(size = 9/.pt, family = 'Roboto', colour = 'grey99') +
  labs(
    x = 'Age at observation (hpf)',
    y = 'Age at exposure (hpf)'
  ) +
  scale_size_continuous(
    range = c(6, 15)
  ) +
  theme_intob_ov +
  theme(
    legend.position = 'none'
  )


# visualisation of the control variability analysis
p_controls_p = c0_aggregated %>% ggplot(aes(
  x = n_embryos,
  y = ratio_normal,
  colour = binom_test_p_1s < 0.05 | n_embryos < 9
)) +
  geom_vline(xintercept = 8.5, linewidth = 0.2, colour = 'grey60') +
  geom_point(alpha = 0.6, shape = 16, size = 2) +
  scale_colour_manual(name = 'p < 0.05 | n < 9', values = c('forestgreen', 'firebrick3')) +
  scale_y_continuous(limits = c(0, 1)) +
  guides(colour = guide_legend(order = 2), size = guide_legend(order = 1)) +
  labs(
    y = 'Ratio of normal embryos',
    x = 'Number of control embryos'
  ) +
  theme_intob_ov +
  theme(
    legend.title = element_text(hjust = 0.5),
    legend.position = 'inside',
    legend.position.inside = c(0.85, 0.15)
  )

p_intob_ov = egg::ggarrange(
  p_bar_substances_n_embryos,
  p_exp_designs,
  p_hist_embryos_per_exp,
  p_controls_p,
  labels = letters[1:4],
  label.args = subfigure_label_args,
  padding = unit(1, 'cm'),
  ncol = 2,
  draw = F
)

ggsave(
  filename = 'results/figures/control_variability/p_control_variability.svg',
  plot = p_intob_ov,
  device = 'svg',
  width = 17.5,
  height = 10.5,
  units = 'cm',
  bg = 'white'
)

Dose response modelling

Lethal and sublethal drc

Dose response curves for comptox substances for data with exposure starts at 0 or 24 hpf and observations at 48, 72, 96 or 120 hpf are modeled. For lethal models data from d_f_comptox can be used, for sublethal models data from d_f_comptox_sublethal has to be used, as in this dataset it is ensured that all experiments recorded sublethal effects.

# some helper functions
get_model_data = function(effects_summary, start_hpf, end_hpf, substance) {
  effects_summary %>% 
    filter(
      age_embryo_exposure_start_hpf == start_hpf &
        age_embryo_observation_hpf == end_hpf &
        compound_new == substance
    )
}
set_model_name = function(t1, t2) {paste('t', t1, t2, sep = '_')}
get_model_name_t1 = function(model_name) {str_split_1(model_name)[2]}
get_model_name_t2 = function(model_name) {str_split_1(model_name)[3]}

start_end_df = expand_grid(start_hpf = c(0, 24), end_hpf = c(48, 72, 96, 120))

Concentration ranges

Aggregating the data needed for the figure in the manuscript displaying concentration ranges.

# only substances that have observations at 96hpf
substances = d_f_comptox$effects_summary_modelling %>% 
  filter((age_embryo_exposure_start_hpf == 0 & age_embryo_observation_hpf == 96) |
           (age_embryo_exposure_start_hpf == 24 & age_embryo_observation_hpf == 96)) %>% 
  pull(compound_new) %>% unique()
substances = substances[substances != 'Methanol']

data_conc_ranges = d_f_comptox$effects_summary_modelling %>%
  filter(compound_new %in% substances) %>% 
  dplyr::select(
    experiment_id,
    position_x,
    position_y,
    container,
    unit,
    test_substance,
    compound_new,
    dtxsid,
    `MolWeight_g/mol`,
    concentration
  ) %>% unique()

# add concentration in mg/L. All concentrations have been converted to umol/L above.
data_conc_ranges$concentration_mgL = umol_to_mg(data_conc_ranges$concentration,
                                                data_conc_ranges$`MolWeight_g/mol`)

# adding a column with number of tested embryos, will be used for fill aesthetic in the plot
data_conc_ranges = data_conc_ranges %>% 
  group_by(compound_new) %>% 
  mutate(n_embryos = n()) %>% 
  ungroup()

# getting the max concentrations, will be used for ordering
max_concentrations = data_conc_ranges %>%
  group_by(compound_new) %>% 
  summarise(max_c = mean(concentration)) %>% 
  arrange(max_c)

# getting ec50 and lc50 values for 0-96 and 24-96
lc50_values = lapply(substances, function(s) {
  lc50_096 = NA
  lc50ste_096 = NA
  if (class(models_all[[s]]$lethal$t_0_96) == 'drc') {
    x = ED(models_all[[s]]$lethal$t_0_96, 50, display = F)
    lc50_096 = x[[1]]
    lc50ste_096 = x[[2]]
  }
  ec50_096 = NA
  ec50ste_096 = NA
  if (class(models_all[[s]]$sublethal$t_0_96) == 'drc') {
    x = ED(models_all[[s]]$sublethal$t_0_96, 50, display = F)
    ec50_096 = x[[1]]
    ec50ste_096 = x[[2]]
  }
  lc50_2496 = NA
  lc50ste_2496 = NA
  if (class(models_all[[s]]$lethal$t_24_96) == 'drc') {
    x = ED(models_all[[s]]$lethal$t_24_96, 50, display = F)
    lc50_2496 = x[[1]]
    lc50ste_2496 = x[[2]]
  }
  ec50_2496 = NA
  ec50ste_2496 = NA
  if (class(models_all[[s]]$sublethal$t_24_96) == 'drc') {
    x = ED(models_all[[s]]$sublethal$t_24_96, 50, display = F)
    ec50_2496 = x[[1]]
    ec50ste_2496 = x[[2]]
  }
  rbind(
    data.frame(
      compound_new = s,
      xc50 = lc50_096,
      xc50ste = lc50ste_096,
      type = 'LC50 (0-96)'
    ),
    data.frame(
      compound_new = s,
      xc50 = ec50_096,
      xc50ste = ec50ste_096,
      type = 'EC50 (0-96)'
    ),
    data.frame(
      compound_new = s,
      xc50 = lc50_2496,
      xc50ste = lc50ste_2496,
      type = 'LC50 (24-96)'
    ),
    data.frame(
      compound_new = s,
      xc50 = ec50_2496,
      xc50ste = ec50ste_2496,
      type = 'EC50 (24-96)'
    )
  )
}) %>% do.call(rbind, .)
lc50_values = lc50_values %>% left_join(comptox_substances, by = c('compound_new' = 'test_substance'))
lc50_values$xc50_mgL = umol_to_mg(lc50_values$xc50, lc50_values$`MolWeight_g/mol`)
lc50_values$xc50ste_mgL = umol_to_mg(lc50_values$xc50ste, lc50_values$`MolWeight_g/mol`)

sensitivity_ratios = lc50_values %>% 
  separate_wider_delim(type, delim = ' ', names = c('type', 'time_point')) %>% 
  group_by(compound_new, time_point) %>% 
  summarise(sens_ratio = xc50[type == 'LC50'] / xc50[type == 'EC50'], .groups = 'drop') %>% 
  add_column(log_sens_ratio = log2(.$sens_ratio))

# a summary of the modelling results, will be plotted alongside the concentration ranges
model_overview = lapply(substances, function(s) {
  modelling_res = function(substance, model_list, type, timeframe) {
    model = model_list[[s]][[type]][[timeframe]]
    if (class(model) == 'drc') {
      'modelling successful'
    } else {
      if (nrow(model$origData) == 0) {
        'no data'
      } else {
        'modelling unsuccessful'
      }
    }
  }
  
  data.frame(
    substance = s,
    'EC50_096' = modelling_res(s, models_all, 'sublethal', 't_0_96'),
    'LC50_096' = modelling_res(s, models_all, 'lethal', 't_0_96'),
    'EC50_2496' = modelling_res(s, models_all, 'sublethal', 't_24_96'),
    'LC50_2496' = modelling_res(s, models_all, 'lethal', 't_24_96')
  )
}) %>% do.call(rbind, .) %>% 
  pivot_longer(cols = contains('50'), names_to = 'model_type')

Building the figure for the manuscript.

theme_conc_ranges = theme(
  legend.position = 'bottom',
  axis.title.y = element_blank(),
  axis.text.y = element_text(size = 7),
  axis.text.x = element_text(size = 7),
  axis.title.x = element_text(size = 7),
  legend.text = element_text(size = 7),
  legend.title = element_text(size = 7)
)

theme_conc_ranges_side = theme(
  panel.border = element_blank(),
  axis.ticks.y = element_blank()
)

# colour for NA values
grey_light = 'grey90'
# lightish colour to show data
grey_mid = 'grey70'
# dark grey for outlines
grey_dark = 'grey50'

# violin plots showing the measured concentrations per substance, additionally,
# EC50 and LC50 values are drawn
p_conc_range = data_conc_ranges %>%
  filter(concentration != 0) %>%
  ggplot(aes(
    x = concentration,
    y = factor(compound_new, levels = max_concentrations$compound_new)
  )) +
  geom_violin(linewidth = 0.2, fill = grey_light, colour = grey_dark, alpha = 0.5) +
  geom_point(
    data = lc50_values,
    mapping = aes(x = xc50, y = compound_new, colour = type),
    size = 0.75,
    inherit.aes = F
  ) +
  geom_errorbarh(
    data = lc50_values,
    mapping = aes(xmin = xc50-xc50ste, xmax = xc50+xc50ste, y = compound_new, colour = type),
    # height = 0.5,
    inherit.aes = F
  ) +
  labs(
    x = 'Concentration (µmol/L)'
  ) +
  scale_colour_manual(
    name = NULL,
    values = c(
      'EC50 (0-96)' = 'firebrick1',
      'LC50 (0-96)' = 'firebrick4',
      'EC50 (24-96)' = 'dodgerblue2',
      'LC50 (24-96)' = 'dodgerblue4'
    ),
    # labels = c(
    #   expression(EC['50']~'(0-96 hpf)'),
    #   expression(LC['50']~'(0-96 hpf)'),
    #   expression(EC['50']~'(24-96 hpf)'),
    #   expression(LC['50']~'(24-96 hpf)')
    # ),
    guide = guide_legend(ncol = 2)
  ) +
  scale_x_log10(guide = 'axis_logticks', breaks =1*10^c(-4, -2, 0, 2, 4)) +
  theme_conc_ranges +
  theme(
    panel.grid.major.x = element_line(linewidth = 0.2, colour = grey_light),
    panel.grid.minor.x = element_line(linewidth = 0.2, colour = grey_light),
    panel.grid.major.y = element_line(linewidth = 0.2, colour = grey_light),
    axis.line.x = element_line(colour = 'black', linewidth = 0.2),
    axis.ticks.x = element_line(colour = 'black', linewidth = 0.2),
    plot.margin = unit(c(.1, .1, .1, .1), 'lines')
  )

p_model_overview = model_overview %>%
  ggplot(aes(
    x = factor(
      model_type,
      levels = c(
        'EC50_096',
        'LC50_096',
        'EC50_2496',
        'LC50_2496'
      )
    ), 
    y = factor(substance, levels = max_concentrations$compound_new),
    colour = value
  )) +
  # geom_tile(colour = grey_light, width = 0.8, height = 0.8) +
  geom_point(shape = 15, size = 2.1) +
  scale_colour_manual(
    name = NULL,
    values = c(
      'modelling successful' = 'forestgreen',
      'modelling unsuccessful' = 'firebrick3',
      'no data' = grey_light
    ),
    guide = guide_legend(
      keyheight = unit(2.5, units = "mm"),
      keywidth = unit(2.5, units = "mm"),
      ncol = 1
    )
  ) +
  scale_x_discrete(
    labels = c(
        '0-96 hpf sublethal',
        '0-96 hpf lethal',
        '24-96 hpf sublethal',
        '24-96 hpf lethal'
      )
  ) +
  # coord_fixed() +
  theme_conc_ranges +
  theme_conc_ranges_side +
  theme(
    axis.title.x = element_blank(),
    panel.grid = element_blank(),
    axis.ticks = element_blank(),
    axis.text.x = element_text(angle = 90, hjust = 1, vjust = 0.5),
    axis.text.y = element_blank()
  )

p_sens_ratio_tile = sensitivity_ratios %>% 
  ggplot(aes(
    x = time_point,
    y = factor(compound_new, levels = max_concentrations$compound_new),
    colour = sens_ratio
  )) +
  # geom_tile(colour = grey_light, width = 0.8, height = 0.8) +
  geom_point(shape = 15, size = 2.1) +
  scale_colour_gradientn(
    colours = c('mediumorchid4', 'gold1'),
    values = c(0, 0.1, 0.2, 0.3, 0.4, 0.5, 0.75, 1),
    na.value = grey_light,
    name = 'Sensitivity ratio'
  ) +
  scale_x_discrete(labels = c('0-96 hpf', '24-96 hpf')) +
  theme_conc_ranges +
  theme_conc_ranges_side +
  theme(
    axis.title.x = element_blank(),
    panel.grid = element_blank(),
    axis.text.x = element_text(angle = 90, hjust = 1, vjust = 0.5),
    axis.text.y = element_blank(),
    axis.ticks = element_blank(),
    legend.key.width = unit(5, 'mm'),
    legend.key.height = unit(3.5, 'mm'),
    legend.title.position = 'top'
  )

# density plot to show the distribution of concentrations across all substances
p_conc_ranges_all = data_conc_ranges %>% 
  filter(concentration != 0) %>% 
  ggplot(
    aes(x = concentration, y = after_stat(density))
  ) + 
  geom_density(fill = grey_mid, colour = alpha('white', 0)) +
  scale_x_log10() +
  theme_void()

# bar plot to show the number of embryos per substance
p_substances_n_embryos = data_conc_ranges %>% 
  ggplot(
    aes(y = factor(compound_new, levels = max_concentrations$compound_new))
  ) +
  geom_bar(fill = grey_mid, width = 0.75) +
  scale_x_continuous(breaks = c(0, 2500)) +
  labs(x = 'Embryos (n)') +
  theme_conc_ranges +
  theme_conc_ranges_side +
  theme(
    axis.title.y = element_blank(),
    axis.text.y = element_blank(),
    panel.grid = element_blank(),
    axis.line.x = element_line(colour = 'black', linewidth = .2),
    axis.ticks.x = element_line(colour = 'black', linewidth = .2),
    plot.margin = unit(c(.1, .1, .1, .1), 'lines')
  )

# an empty plot to use when plotting the above plots in a grid
p_empty = ggplot(mtcars, aes(x = wt, y = mpg)) + theme_void() + geom_blank()


p_data = egg::ggarrange(
  # first row
  p_conc_ranges_all, p_empty, p_empty, p_empty,
  # second row
  p_conc_range + theme(legend.position = 'none'),
  p_substances_n_embryos,
  p_model_overview + theme(legend.position = 'none'),
  p_sens_ratio_tile + theme(legend.position = 'none'),
  ncol = 4,
  widths = c(10, 1.1, 1.5, 0.75),
  heights = c(1, 30),
  draw = F,
  debug = F
)

p_legend = egg::ggarrange(
  p_empty,
  ggpubr::get_legend(p_conc_range) %>% as_ggplot(),
  ggpubr::get_legend(p_model_overview) %>% as_ggplot(),
  ggpubr::get_legend(p_sens_ratio_tile) %>% as_ggplot(),
  p_empty,
  widths = c(1, 2, 2, 2, 1),
  ncol = 5,
  draw = F
)

p = plot_grid(
  p_data, p_legend,
  ncol = 1, rel_heights = c(30, 2)
)

ggsave(
  filename = file.path('results', 'figures', 'concentration_ranges', 'concentration_ranges.svg'),
  plot = p,
  device = 'svg',
  width = 16,
  height = 22,
  units = 'cm',
  bg = 'white',
  create.dir = T
)

Phenotypic fingerprints

To ensure similar concentration ranges are compared for all substances, for each substance, data is filtered to be in the range of the EC5 and the LC99. In this range substances cause phenotpypic effects, but effects are not dominated by lethality.

filter_low_occ_effects = function(heatmap, cutoff = 0.01) {
  # filter seldom occuring effects
  keep_cols = apply(heatmap, 2, mean)[apply(heatmap, 2, mean) > cutoff] %>% names()
  heatmap = heatmap[, keep_cols]
  return(heatmap)
}

clusters_like_complexHeatmap = function(hclust, heatmap, k) {
  cluster_old = hclust %>% cutree(k)
  
  dend = as.dendrogram(hclust)
  # this is how complexHeatmap reorders the dendrogram
  dend = reorder(dend, -rowMeans(heatmap))
  # reversing it orders the clusters from top to bottom in ascending numbers, as
  # they are plotted in complexHeatmap
  row_index = rev(order.dendrogram(dend))
  
  df = data.frame(
    substance = names(cluster_old),
    cluster = cluster_old
  )
  
  df = df[row_index, ]
  df$cluster_new = c(1:6)[factor(df$cluster, levels = unique(df$cluster))]
  
  cluster_new = df$cluster_new %>% set_names(df$substance)
  cluster_new
}

set.seed(123)

cluster_n = c(
  '48' = 6,
  '72' = 6,
  '96' = 6,
  '120' = 4
)

time_points = c(48, 72, 96, 120)
# iterate over observation time points
pfp_data = lapply(time_points, function(t2) {
  # iterate over substances
  pfp_data_t = lapply(models_all, function(model_list) {
    # iterate over exposure start points
    lapply(c(0, 24), function(t1) {
      model_name = set_model_name(t1, t2)
      # get lower limit = EC5, skip if model does not exist
      if (class(model_list$sublethal[[model_name]]) != 'drc') {
        return(NULL)
      } else {
        lower_conc = ED(model_list$sublethal[[model_name]], 5, display = F)[[1]]
      }
      # get upper limit = LC99, Inf if model does not exist
      if (class(model_list$lethal[[model_name]]) != 'drc') {
        upper_conc = Inf
      } else {
        upper_conc = ED(model_list$lethal[[model_name]], 99, display = F)[[1]]
      }
      observations = d_f_comptox_sublethal$obs_phenotypes %>%
        filter(
          test_substance == model_list$substance &
            age_embryo_exposure_start_hpf == t1 &
            age_embryo_observation_hpf == t2
        )
      observations = left_join(observations, comptox_substances, by = 'test_substance')
      # converting all mg/L concentrations to umol/L
      observations$concentration = mapply(
        function(unit, conc, molweight) {
          if (unit == 'mg/L') {
            # mg/L is converted to umol/L
            mg_to_umol(conc, molweight)
          } else {
            # umol/L and % values remain unchanged
            conc
          }
        },
        unit = observations$unit,
        conc = observations$concentration,
        molweight = observations$`MolWeight_g/mol`
      )
      observations$unit[observations$unit == 'mg/L'] = 'umol/L'
      observations = observations %>% filter(concentration >= lower_conc &
                                               concentration <= upper_conc)
      
      return(observations)
    }) %>% rbindlist()
  }) %>% rbindlist()
  
  pfp_data_t = pfp_data_t %>% filter(
    !effect %in%  c(
      # 'coagulated',
      'beats per minute',
      'miscellaneous',
      'no embryo',
      'normal',
      'hours post fertilization'
    )
  )
  
  pfp_data_t = pfp_data_t %>% group_by(test_substance) %>% 
    mutate(
      n_embryos = nrow(unique(pick(experiment_id, container, position_x, position_y))),
      n_exp = n_distinct(experiment_id)
    ) %>% 
    ungroup()
  
  pfp_data_heatmap_t = pfp_data_t %>% group_by(test_substance) %>%
    # mutate(n_exp = n_distinct(experiment_id)) %>%
    # filter substances that only have 1 experiment
    # filter(n_exp > 1) %>%
    mutate(n_embryos = pick(experiment_id, position_x, position_y, container) %>% unique() %>% nrow()) %>%
    filter(n_embryos > 15) %>%
    group_by(test_substance, effect) %>%
    summarise(percentage = n() / unique(n_embryos),
              .groups = 'drop') %>%
    pivot_wider(
      id_cols = test_substance,
      names_from = effect,
      values_from = percentage,
      values_fill = 0
    ) %>%
    as.data.frame()
  
  rownames(pfp_data_heatmap_t) = pfp_data_heatmap_t$test_substance
  pfp_data_heatmap_t$test_substance = NULL
  
  pfp_data_heatmap_t = pfp_data_heatmap_t %>% as.matrix()
  
  # filtering low data substances and effects
  pfp_data_t = pfp_data_t %>% filter(n_embryos >= 30)
  pfp_data_heatmap_t = pfp_data_heatmap_t[unique(pfp_data_t$test_substance), ]

  pfp_data_heatmap_t = filter_low_occ_effects(pfp_data_heatmap_t, cutoff = 0.01)
  
  hclust_t = hclust(dist(pfp_data_heatmap_t, method = 'euclidean'), method = 'ward.D2')
  
  return(list(
    pfp_data = pfp_data_t,
    pfp_heatmap = pfp_data_heatmap_t,
    pfp_hclust = hclust_t
  ))
}) %>% set_names(time_points)

Fingerprints per timepoint

Heatmap of the fingerprints for each timepoint (48, 72, 96, 120 hpf).

set.seed(123)
# defining plot and font sizes
# plot size in cm
w_cm = 12
h_cm = 18
# conversion to inches, as svg() uses inches
w_in = w_cm / 2.54
h_in = h_cm / 2.54
# font size row and column labels
fontsize_row_col = 7
font_size_smaller = fontsize_row_col - 1
# family = 'Arial'

# colour for NA values
grey_light = 'grey90'
# lightish colour to show data
grey_mid = 'grey70'
# dark grey for outlines
grey_dark = 'grey30'

time_points = c(48, 72, 96, 120)
for (t in time_points) {
  t = as.character(t)

  pfp_data_t = pfp_data[[t]]$pfp_data
  pfp_data_heatmap_t = pfp_data[[t]]$pfp_heatmap
  
  col_ordering_t = col_ordering %>% filter(effect %in% colnames(pfp_data_heatmap_t))
  effect_proportions = lapply(colnames(pfp_data_heatmap_t), function(eff) {
    n_embryos = pfp_data_t %>%
      dplyr::select(experiment_id, container, position_x, position_y) %>%
      unique() %>%
      nrow()
    n_effect = pfp_data_t %>%
      filter(effect == eff) %>% 
      dplyr::select(experiment_id, container, position_x, position_y) %>%
      unique() %>%
      nrow()
    return(n_effect / n_embryos)
  }) %>% as.numeric() %>% set_names(colnames(pfp_data_heatmap_t))
  
  substances_n_embryos = lapply(rownames(pfp_data_heatmap_t), function(s) {
    pfp_data_t %>%
      filter(test_substance == s) %>%
      dplyr::select(experiment_id, position_x, position_y, container) %>% 
      unique() %>% 
      nrow()
  }) %>% as.numeric() %>% set_names(rownames(pfp_data_heatmap_t))
  
  substances_n_exp = lapply(rownames(pfp_data_heatmap_t), function(s) {
    pfp_data_t %>%
      filter(test_substance == s) %>%
      pull(experiment_id) %>% n_distinct()
  }) %>% as.numeric() %>% set_names(rownames(pfp_data_heatmap_t))
  
  clusters = pfp_data[[t]]$pfp_hclust %>% cutree(k = cluster_n[t])
  
  ht_opt$TITLE_PADDING = unit(c(1.5, -0.2), "lines")
  svg(
    filename = sprintf('results/figures/phenotypic_fingerprint/heatmaps/cheatmap_EC5_LC99_t%s_cut.svg', t),
    width = w_in,
    height = h_in,
    bg = 'white'
  )
  draw(
    Heatmap(
      pfp_data_heatmap_t,
      # colour and legend
      col = colorRampPalette(c('#ffefff', '#660066'))(1024),
      heatmap_legend_param = list(
        legend_direction = 'horizontal',
        legend_width = unit(2, 'cm'),
        tick_length = unit(0.5, 'mm'),
        title = 'Prevalence of embryos\nexhibiting effect',
        title_gp = gpar(fontsize = fontsize_row_col, fontface = 'plain'),
        labels_gp = gpar(fontsize = font_size_smaller)
      ),
      border = T,
      border_gp = gpar(col = grey_light),
      # clustering rows
      clustering_distance_rows = 'euclidean',
      clustering_method_rows = 'ward.D2',
      row_dend_width = unit(w_in * 0.1, 'inches'),
      row_title_rot = 0,
      row_title_side = 'right',
      row_title_gp = gpar(fontsize = fontsize_row_col + 1, fill = grey_light, lwd = 0),
      split = cluster_n[t],
      # row_split = clusters,
      row_gap = unit(0.2, 'lines'),
      # order of columns
      cluster_columns = F,
      column_order = col_ordering_t$effect,
      # text size
      row_names_gp = grid::gpar(fontsize = fontsize_row_col),
      column_names_gp = grid::gpar(fontsize = fontsize_row_col),
      # annotations next to the heatmap
      top_annotation = HeatmapAnnotation(
        prop = anno_barplot(
          effect_proportions,
          border = F,
          bar_width = 0.5,
          height = unit(h_in * 0.04, 'inches'),
          ylim = c(0, round(max(effect_proportions), digits = 1)),
          gp = gpar(fill = grey_mid, col = alpha('black', 0)),
          axis_param = list(
            at = c(0, 0.4),
            gp = grid::gpar(fontsize = font_size_smaller, col = grey_dark)
          )
        ),
        annotation_label = 'Prevalence\nof effect',
        annotation_name_side = 'left',
        annotation_name_rot = 0,
        # axis title
        annotation_name_gp = grid::gpar(fontsize = font_size_smaller, col = grey_dark)
      ),
      right_annotation = rowAnnotation(
        n_embryos = anno_barplot(
          substances_n_embryos,
          border = F,
          bar_width = 0.5,
          width = unit(w_in * 0.04, 'inches'),
          gp = gpar(fill = grey_mid, col = alpha('black', 0)),
          axis_param = list(
            # breaks = 0, max rounded to nearest hundred and half of max rounded to hundred
            at = c(
              0,
              round(max(substances_n_embryos), digits = -2)/2,
              round(max(substances_n_embryos), digits = -2)
            ),
            gp = gpar(fontsize = font_size_smaller, col = grey_dark)
          )
        ),
        # n_embryos = anno_numeric(
        #   substances_n_embryos,
        #   labels_gp = gpar(fontsize = font_size_smaller - 1),
        #   bg_gp = gpar(fill = "grey70", col = alpha('black', 0)),
        #   labels_offset = unit(2, "pt"),
        #   round_corners = F,
        #   width = unit(w_in * 0.04, 'inches')
        # ),
        annotation_label = 'embryos in\nfingerprint (n)',
        annotation_name_rot = 90,
        annotation_name_gp = grid::gpar(fontsize = font_size_smaller, col = grey_dark)
      )
    ),
    heatmap_legend_side = "bottom"
  )
  dev.off()
  ht_opt(RESET = TRUE)
}

Supplementary figure with PCA biplots for the fingerprints.

pca_plots = lapply(as.character(c(48, 72, 96, 120)), function(t) {
  pca_biplot(pfp_data[[t]]$pfp_heatmap) +
    theme(
      axis.text = element_text(family = 'Roboto', size = 6),
      axis.title = element_text(family = 'Roboto', size = 8)
    )
})

p_pca_combined = egg::ggarrange(
  plots = pca_plots,
  ncol = 2,
  labels = letters[1:4],
  label.args = subfigure_label_args,
  draw = F
)

ggsave(
  'results/figures/phenotypic_fingerprint/pca/pca_all_effects.svg',
  plot = p_pca_combined,
  device = 'svg',
  width = 18,
  height = 18,
  units = 'cm',
  bg = 'white'
)

Figure with UMAP embeddings and heatmaps of the PCA loadings for each time point.

set.seed(123)

pca_loading_plots = lapply(as.character(c(48, 72, 96, 120)), function(t) {
  
  pca = prcomp(pfp_data[[t]]$pfp_heatmap, scale. = F)
  explained_variance <- pca$sdev^2
  proportion_variance_explained <- explained_variance / sum(explained_variance)
  proportion_variance_explained[1:5] %>% sum

  var_explained_df = data.frame(
    name = colnames(pca$rotation[, 1:5]),
    var_expl = proportion_variance_explained[1:5]
  )
  
  euc_norm = pca$rotation[, 1:5] %>% apply(1, function(x) sqrt(sum(x^2)))
  pc_sums = abs(pca$rotation[, 1:5]) %>% rowSums()
  
  pca$rotation[, 1:5] %>% as_tibble(rownames = 'effect') %>%
    add_column(importance = euc_norm) %>%
    arrange(importance) %>%
    slice_tail(n = 10) %>%
    pivot_longer(cols = starts_with('PC')) %>%
    left_join(var_explained_df, by = 'name') %>% 
    add_column(
      pc_expl = sprintf('%s\n%.1f%%', .$name, round(.$var_expl * 100, digits = 1))
    ) %>%
    ggplot(aes(
      x = pc_expl,
      y = factor(effect, levels = unique(effect)),
      colour = value
    )) +
    geom_point(shape = 'square', size = 4) +
    scale_colour_gradient2(
      name = 'Loading of variable',
      low = 'darkblue',
      mid = 'grey90',
      high = 'darkred',
      breaks = c(-0.6, 0, 0.6),
      labels = c('-0.6', '0', '0.6')
    ) +
    theme(
      axis.title = element_blank(),
      legend.key.size = unit(c(0.3), 'cm'),
      axis.text = element_text(family = 'Roboto condensed', size = 7),
      legend.title = element_text(family = 'Roboto', size = 7),
      legend.text = element_text(family = 'Roboto', size = 7),
      legend.title.position = 'top'
    )
}) %>% set_names(c(48, 72, 96, 120))



umap_plots = lapply(as.character(c(48, 72, 96, 120)), function(t) {
  umap = umap(pfp_data[[t]]$pfp_heatmap)
  
  clusters = clusters_like_complexHeatmap(
    pfp_data[[t]]$pfp_hclust,
    pfp_data[[t]]$pfp_heatmap,
    cluster_n[t]
  )
  clusters_df = data.frame(
    substance = names(clusters),
    cluster = clusters
  )
  
  umap_df = as.data.frame(umap$layout) %>%
    add_column(substance = rownames(umap$layout)) %>% 
    left_join(clusters_df, by = 'substance')
  
  umap_df %>% ggplot(
    aes(x = V1, y = V2, colour = factor(cluster), label = substance)
  ) + 
    geom_point(size = 2.5, alpha = 0.8) +
    # geom_text(size = 6/.pt) +
    scale_colour_manual(
      values = c(
        '#D43F3AFF',
        '#EEA236FF',
        '#5CB85CFF',
        '#357EBDFF',
        '#9632B8FF',
        '#B8B8B8FF'
      )[1:cluster_n[t]]
    ) +
    labs(
      x = 'UMAP 1',
      y = 'UMAP 2'
    ) +
    theme(
      axis.title = element_text(family = 'Roboto', size = 8),
      axis.text = element_blank(),
      axis.ticks = element_blank()
    )
}) %>% set_names(c(48, 72, 96, 120))

p_empty = ggplot(mtcars, aes(x = wt, y = mpg)) + theme_void() + geom_blank()

for (n in names(umap_plots)) {
  p = plot_grid(
    plot_grid(
      p_empty,
      umap_plots[[n]],
      nrow = 1,
      rel_widths = c(1, 10)
    ),
    pca_loading_plots[[n]],
    ncol = 1,
    rel_heights = c(3, 3)
  )
  ggsave(
    sprintf('results/figures/phenotypic_fingerprint/umap/umap_t%s_no_names.svg', n),
    plot = p,
    device = 'svg',
    width = 7,
    height = 17,
    units = 'cm',
    bg = 'white'
  )
}

Comparison to external resources

substance_fp_comparison = function(substance, intob_pfp, zotero, intob_tps = c(72, 96, 120), return_data = F) {
  all_effects = c('coagulated', 'abnormal eye', 'abnormal tail effects', 'developmental delay',
                  'malformation', 'abnormal behavior', 'heart', 'abnormal blood circulation',
                  'edema', 'abnormal pigmentation', 'abnormal hatching', 'abnormal swim bladder',
                  'abnormal intestine') %>% sort()
  
  ds_zotero = zotero %>% filter(test_substance == substance)
  ds_zotero = ds_zotero %>%
    arrange(year) %>% 
    dplyr::select(effect_long, study_short) %>%
    dplyr::rename(source = study_short, parent_effect = effect_long) %>% 
    add_column(value = NA_integer_) %>% 
    filter(!parent_effect %in% c('gene regulation', 'mortality rate', 'neuronal effects'))
  
  ds_intob = lapply(as.character(intob_tps), function(t) {
    ds_pfp = intob_pfp[[t]]$pfp_data %>% filter(test_substance == substance)
    n_embryos = ds_pfp %>%
      dplyr::select(test_substance, experiment_id, position_x, position_y, container) %>% 
      unique() %>% 
      nrow()
    ds_pfp = intob_pfp[[t]]$pfp_data %>% filter(test_substance == substance) %>% 
      group_by(parent_effect) %>%
      count() %>% 
      ungroup() %>% 
      mutate(value = n / n_embryos) %>% 
      add_column(source = sprintf('INTOB (%s hpf)', t)) %>% 
      dplyr::select(-n)
    ds_pfp$parent_effect[ds_pfp$parent_effect == 'deformation / malformation'] = 'malformation'
    ds_pfp$parent_effect[ds_pfp$parent_effect == 'developmental delay effects'] = 'developmental delay'
    ds_pfp$parent_effect[ds_pfp$parent_effect == 'heartbeat'] = 'heart'
    ds_pfp
  }) %>% do.call(rbind, .)

  ds_combined = rbind(ds_intob, ds_zotero) %>% 
    add_column(substance = substance)
  
  # create a factor for source, where INTOB is last, preceeded by the studies
  ds_combined$source = factor(
    ds_combined$source,
    levels = rev(c('INTOB', unique(ds_combined$source[ds_combined$source != 'INTOB'])))
  )
  
  if (return_data) return(ds_combined)
  
  ds_combined %>% ggplot(
    aes(x = factor(parent_effect, levels = all_effects), y = source, fill = value)
  ) +
    geom_point(shape = 22, size = 3, stroke = 0.3) +
    scale_x_discrete(drop = F) +
    scale_fill_gradient(
      name = 'Prevalence of effect',
      low = 'white',
      high = 'firebrick3',
      na.value = 'grey'
    ) +
    guides(fill = guide_colorbar(title.position = 'top')) +
    theme(
      axis.text.x = element_text(angle = 90, hjust = 1, vjust = 0.5),
      axis.title = element_blank(),
      legend.title = element_text(size = 7)
    )
}
age_selection = c('72', '96', '120', '72, 96', '72, 120', '96, 120', '72, 96, 120')

substance_fp_comparison('Nicotine', pfp_data, zotero_lib[zotero_lib$age_hpf %in% age_selection, ]) %>% 
  ggsave(
    'results/figures/zotero/nicotine.png',
    plot = .,
    device = 'png',
    width = 7,
    height = 8,
    units = 'cm',
    dpi = 300,
    bg = 'white'
  )

p_vpa = substance_fp_comparison('Valproic acid', pfp_data, zotero_lib[zotero_lib$age_hpf %in% age_selection, ])

p_vpa %>% 
  ggsave(
    'results/figures/zotero/valproic_acid.png',
    plot = .,
    device = 'png',
    width = 7,
    height = 8,
    units = 'cm',
    dpi = 300,
    bg = 'white'
  )
vpa_df = data.frame(
  ecx = c(1317, 5.7, 435, 80.6, 82, 0.586),
  ecx_ste = c(55, NA, NA, 10.2, NA, NA),
  type = c('LC50', 'LC50', 'LC50', 'EC50', 'EC50', 'EC50'),
  source = c('INTOB (0-96 hpf)',
             'Jarque et al. 2020',
             'Brotzmann et al. 2021',
             'INTOB (0-96 hpf)',
             'Brotzmann et al. 2021',
             'Jarque et al. 2020')
)
vpa_df$source = factor(
  vpa_df$source,
  levels = rev(c(
    'INTOB (0-96 hpf)',
    'Jarque et al. 2020',
    'Brotzmann et al. 2021'
  ))
)

vpa_ec_lc = vpa_df %>% ggplot(
  aes(x = ecx, colour = type, y = source, xmin = ecx-ecx_ste, xmax = ecx+ecx_ste)
) + 
  geom_point(size = 1.5) +
  geom_errorbarh(height = 0.45) +
  scale_x_log10(guide = 'axis_logticks', breaks =c(1, 10, 100, 1000)) +
  scale_colour_manual(name = NULL, values = c(
    'EC50' = 'firebrick1',
    'LC50' = 'firebrick4'
  )) +
  labs(
    x = 'Concentration (µmol/L)'
  ) +
  theme(
    axis.title.y = element_blank(),
    legend.direction = 'vertical',
    axis.title.x = element_text(size = 7)
  ) 

egg::ggarrange(
  p_vpa,
  vpa_ec_lc,
  heights = c(1, 0.75),
  labels = c('b', 'c'),
  label.args = subfigure_label_args
) %>% 
  ggsave(
    'results/figures/zotero/valproic_acid_ec_lc.svg',
    plot = .,
    device = 'svg',
    width = 7,
    height = 12,
    units = 'cm',
    dpi = 300,
    bg = 'white'
  )
for (s in intersect(substances_list$zotero, substances_list$intob)) {
  p = substance_fp_comparison(s, pfp_data, zotero_lib_combined) +
    ggtitle(s)
  print(p)
}
plotlist = lapply(intersect(substances_list$zotero, substances_list$intob), function(s) {
  substance_fp_comparison(s, pfp_data, zotero_lib_combined) +
    ggtitle(s)
})
p = cowplot::plot_grid(
  plotlist = plotlist,
  ncol = 4
)
ggsave(
  filename = 'results/figures/zotero/zotero_intob_effects.png',
  plot = p,
  device = 'png',
  width = 50,
  height = 70,
  units = 'cm',
  dpi = 350,
  bg = 'white'
)
# top_zotero_substances = zotero_lib_combined %>%
#   group_by(test_substance) %>%
#   count() %>%
#   arrange(desc(n)) %>%
#   pull(test_substance) %>% .[1:15]
# 
# for (s in top_zotero_substances) {
#   p = substance_fp_comparison(s, pfp_data, zotero_lib_combined) +
#     ggtitle(s)
#   print(p)
# }

Making a heatmap of selected substances from the literature review dataset.

age_selection = c('72', '96', '120', '72, 96', '72, 120', '96, 120', '72, 96, 120')

zotero_matrix_df = zotero_lib %>% 
  filter(age_hpf %in% age_selection) %>% 
  mutate(id = sprintf('%s (%s)', test_substance, study_short)) %>% 
  mutate(id = iconv(id, from = 'UTF-8', to = 'ASCII//TRANSLIT')) %>% 
  mutate(observed = 1) %>% 
  group_by(test_substance) %>% 
  mutate(substance_n = n_distinct(id)) %>% 
  filter(substance_n > 1) %>% 
  pivot_wider(
    id_cols = matches('id'),
    names_from = effect_long,
    values_from = observed,
    values_fn = first,
    values_fill = 0
  ) %>% 
  as.data.frame()
zotero_matrix_df = zotero_matrix_df[!duplicated(zotero_matrix_df$id), ]


zotero_matrix = zotero_matrix_df
zotero_matrix$dtxsid = NULL
zotero_matrix$`NA` = NULL
zotero_matrix$`gene regulation` = NULL
zotero_matrix$id = NULL
rownames(zotero_matrix) = zotero_matrix_df$id
zotero_matrix = zotero_matrix %>% as.matrix()
zotero_matrix = zotero_matrix[rowSums(zotero_matrix) != 0, ]
set.seed(1234)
# defining plot and font sizes
# plot size in cm
w_cm = 13
h_cm = 19
# conversion to inches, as svg() uses inches
w_in = w_cm / 2.54
h_in = h_cm / 2.54
# font size row and column labels
fontsize_row_col = 7
font_size_smaller = fontsize_row_col - 1

substances = rownames(zotero_matrix) %>% str_extract('^.+(?= \\()')
studies = rownames(zotero_matrix) %>% str_extract('(?<= \\().+(?=\\))')

svg(
  filename = sprintf(
    'results/figures/zotero/zotero_heatmap_72_96_120.svg'
  ),
  width = w_in,
  height = h_in,
  bg = 'white'
)
draw(
  Heatmap(
    zotero_matrix,
    show_row_names = F,
    # colour and legend
    col = colorRampPalette(c('grey90', 'grey20'))(2),
    heatmap_legend_param = list(
      title = 'Effect reported',
      title_gp = gpar(fontsize = fontsize_row_col, fontface = 'plain'),
      lgd = Legend(
        at = c(0, 1),
        labels = c('not reported', 'reported'),
        legend_gp = gpar(fill = c('grey90', 'grey20'), fontsize = fontsize_row_col-1)
      ),
      legend_direction = 'horizontal',
      legend_width = unit(w_in * 0.15, 'inches')
    ),
    right_annotation = rowAnnotation(
      substance = zotero_matrix %>% rownames %>% str_extract('^.+(?=[:space:]\\()'),
      substance_name = anno_text(substances, gp = gpar(fontsize = fontsize_row_col, fontface = 'plain')),
      study = zotero_matrix %>% rownames %>% str_extract('\\(.+'),
      study_name = anno_text(studies, gp = gpar(fontsize = fontsize_row_col, fontface = 'plain')),
      show_legend = F,
      annotation_name_gp = gpar(fontsize = fontsize_row_col, col = 'grey20'),
      simple_anno_size = unit(0.3, 'cm')
    ),
    # clustering rows
    clustering_distance_rows = 'euclidean',
    clustering_method_rows = 'ward.D2',
    row_dend_width = unit(w_in * 0.1, 'inches'),
    # order of columns
    cluster_columns = F,
    # text size
    row_names_gp = grid::gpar(fontsize = fontsize_row_col),
    column_names_gp = grid::gpar(fontsize = fontsize_row_col)
  )
)
dev.off()

Effect propagation and time and concentration dependency

corr_time = function(experiments, t1, t2, occurence_cut = 0.005, message = F) {
  # get the relevant data
  exps = d_f_comptox_sublethal$obs_phenotypes %>%
    # only containers where embryos can be tracked
    filter(experiment_id %in% experiments) %>%
    # only treatment concentrations
    filter(concentration != 0) %>% 
    # filter uninformative effects
    filter(
      !effect %in%  c(
        'beats per minute',
        'miscellaneous',
        'no embryo',
        'hours post fertilization'
      )
    ) %>% 
    # get experiments that have observations at 48 and 96
    group_by(experiment_id) %>% 
    filter(t1 %in% age_embryo_observation_hpf & t2 %in% age_embryo_observation_hpf) %>% 
    filter(age_embryo_observation_hpf %in% c(t1, t2)) %>% 
    add_column(
      embryo_id = paste(
        .$experiment_id,
        .$test_substance,
        .$position_x,
        .$position_y,
        .$container,
        sep = '.'
      )
    )
  
  if (message) message(sprintf('n embryos: %i', n_distinct(exps$embryo_id)))
  
  # get the data in matrix format
  a_48 = exps %>%
    ungroup() %>% 
    filter(age_embryo_observation_hpf == t1) %>%
    dplyr::select(embryo_id, effect) %>% 
    group_by(
      embryo_id
    ) %>%
    count(effect) %>%
    # rbind(data.frame(
    #   embryo_id = 'dummy_embryo_a',
    #   effect = col_ordering$effect,
    #   n = 0
    # )) %>% 
    pivot_wider(names_from = effect,
                values_from = n,
                values_fill = 0) %>%
    ungroup()
  
  a_96 = exps %>%
    ungroup() %>% 
    filter(age_embryo_observation_hpf == t2) %>%
    dplyr::select(embryo_id, effect) %>% 
    group_by(
      embryo_id
    ) %>%
    count(effect) %>%
    # rbind(data.frame(
    #   embryo_id = 'dummy_embryo_b',
    #   effect = col_ordering$effect,
    #   n = 0
    # )) %>% 
    pivot_wider(names_from = effect,
                values_from = n,
                values_fill = 0) %>%
    ungroup()
  
  a_48_names = a_48 %>%
    filter(embryo_id %in% intersect(a_48$embryo_id, a_96$embryo_id)) %>% 
    dplyr::select(intersect(names(a_48), names(a_96)))
  a_96_names = a_96 %>%
    filter(embryo_id %in% intersect(a_48$embryo_id, a_96$embryo_id)) %>% 
    dplyr::select(intersect(names(a_48), names(a_96)))
  
  a_48 = a_48_names %>% dplyr::select(!embryo_id) %>% as.matrix()
  a_96 = a_96_names %>% dplyr::select(!embryo_id) %>% as.matrix()
  
  # filter effects that occur in less than 1% of embryos
  high_occurence_effects = rbind(a_48, a_96) %>% apply(2, function(x) sum(x)/length(x))
  high_occurence_effects = names(high_occurence_effects[high_occurence_effects > occurence_cut])
  a_48 = a_48[, high_occurence_effects]
  a_96 = a_96[, high_occurence_effects]
  
  # create matrices of correlations and p-values
  n = colnames(a_48) %>% length
  cor_48_96 = lapply(colnames(a_48), function(effect_48) {
    lapply(colnames(a_96), function(effect_96) {
      c = cor.test(a_48[, effect_48], a_96[, effect_96])
      m = matrix(c$estimate)
      colnames(m) = effect_48
      rownames(m) = effect_96
      m
    }) %>% do.call(rbind, .)
  }) %>% do.call(cbind, .)
  pval_48_96 = lapply(colnames(a_48), function(effect_48) {
    lapply(colnames(a_96), function(effect_96) {
      c = cor.test(a_48[, effect_48], a_96[, effect_96])
      m = matrix(c$p.value)
      colnames(m) = effect_48
      rownames(m) = effect_96
      m
    }) %>% do.call(rbind, .)
  }) %>% do.call(cbind, .)
  
  # cols are effects at t1
  # rows are effects at t2
  return(
    list(
      cor = cor_48_96,
      pval = pval_48_96
    )
  )
}
exps = d_f_comptox_sublethal$obs_phenotypes %>%
  # only containers where embryos can be tracked
  filter(container_type %in% c(24, 48, 96)) %>%
  pull(experiment_id) %>% unique()

cor_48_96 = corr_time(exps, 48, 96, message = T)
## n embryos: 778
w_cm = 9
h_cm = 9
w_in = w_cm / 2.54
h_in = h_cm / 2.54

svg(
  'results/figures/effect_propagation/effect_prop_48_96_corrplot.svg',
  width = w_in,
  height = h_in,
  pointsize = 7,
  family = 'Roboto',
  bg = 'white'
)
corrplot(
  cor_48_96$cor,
  method = 'square',
  col = rev(COL2('RdBu', 200)),
  col.lim = c(-1, 1),
  addgrid.col = 'grey95',
  # order = 'hclust',
  # hclust.method = 'mcquitty',
  tl.col = 'black',
  cl.align.text = 'l',
  pch = 4,
  pch.cex = 1,
  pch.col = 'grey70',
  p.mat = cor_48_96$pval,
  sig.level = 0.01
)
dev.off()
figure_time_conc = function(
  s,
  start_hpf = 0,
  time_points = c(48, 72, 96),
  text_size = 7,
  return_legend = F
) {
  
  theme_time_conc = theme(
    legend.position = 'bottom',
    axis.title.y = element_blank(),
    axis.text.y = element_text(size = text_size, colour = 'black'),
    axis.text.x = element_text(size = text_size-1, colour = 'black'),
    axis.title.x = element_text(size = text_size),
    legend.text = element_text(size = text_size),
    legend.title = element_text(size = text_size),
    strip.text = element_text(size = text_size),
  )
  
  # colour for NA values
  grey_light = 'grey90'
    # lightish colour to show data
  grey_mid = 'grey80'
    # dark grey for outlines
  grey_dark = 'grey50'
    
  weight = comptox_substances$`MolWeight_g/mol`[comptox_substances$test_substance == s]
  s_exps = d_f_comptox_sublethal$obs_phenotypes %>% filter(test_substance == s) %>% pull(experiment_id) %>% unique()
  
  df = d_f_comptox_sublethal$obs_phenotypes %>% ungroup() %>%
    filter(experiment_id %in% s_exps) %>% 
    filter(age_embryo_exposure_start_hpf == start_hpf) %>% 
    filter(age_embryo_observation_hpf %in% time_points) %>% 
    filter(test_substance == s) %>% 
    filter(!effect %in% c('miscellaneous', 'hours post fertilization', 'beats per minute', 'no embryo'))
  
  if (nrow(df) == 0) {
    message('Error: Exposure scenario not measured')
    return(NULL)
  }
  
  df$concentration = apply(df, 1, function(row) {
    unit = row[['unit']]
    conc = as.numeric(row[['concentration']])
    if (unit == 'mg/L') {
      # mg/L is converted to umol/L
      mg_to_umol(conc_mg = conc, molweight = weight)
    } else {
      # umol/L and % values remain unchanged
      conc
    }
  })
  # all units are changed accordingly, only Methanol keeps the % concentration
  df$unit = 'umol/L'
  
  # getting ec50 and lc50 values
  df_conditions = expand_grid(
    s = s,
    start = start_hpf,
    end = time_points,
    type = c('lethal', 'sublethal')
  )
  ec_lc_df = mapply(
    function(s, t1, t2, type) {
      n = set_model_name(t1, t2)
      m = models_all[[s]][[type]][[n]]
      lvl = case_when(
        type == 'lethal' ~ 50,
        type == 'sublethal' ~ 50
      )
      col = case_when(
        type == 'lethal' ~ 'LC50',
        type == 'sublethal' ~ 'EC50',
      )
      
      if (class(m) == 'drc') {
        ec = ED(m, lvl, display = F)
  
        return(
          data.frame(
            test_substance = s,
            start_hpf = t1,
            age_embryo_observation_hpf = t2,
            type = type,
            effect_level = lvl,
            ec = ec[1],
            ec_ste = ec[2],
            colour = col
          )
        )
      } else {
        return(NULL)
      }
    },
    s = as.character(df_conditions$s),
    t1 = as.character(df_conditions$start),
    t2 = as.character(df_conditions$end),
    type = df_conditions$type,
    SIMPLIFY = F
  ) %>% rbindlist()
  
  # these effect will be plotted, others occur seldomly and are omitted
  selected_effects = c(
    'normal',
    'coagulated',
    'no hatching',
    # 'early hatching',
    'no blood circulation',
    'blood congestion',
    'abnormal heart',
    'no heartbeat',
    'decreased heartbeat',
    # 'increased heartbeat',
    'deformation yolk sac',
    'pericardium edema',
    'yolk sac edema',
    # 'increased pigmentation',
    'low pigmentation',
    'no pigmentation',
    # 'scoliosis',
    'modified structure of chorda',
    'abnormal tail',
    'lack of somites',
    'abnormal tail tip',
    'abnormal tail length',
    'no tail detachment',
    # 'abnormal tail fin',
    'deformation head',
    'smaller head',
    'abnormal eye size',
    # 'no eye',
    # 'malformation sacculi / otoliths',
    # 'no spontaneous tail contraction',
    # 'no movement after hatching',
    'shivering',
    # 'tremor',
    # 'no swim bladder',
    'developmental delay'
    # 'necrosis'
  )
  df = df %>% filter(effect %in% selected_effects)
  
  # a data frame containing effects and timepoints at which they cannot occur. This will be marked in the plot.
  timepoints_prohibited = effect_list_info %>%
    dplyr::select(effect_label, timepoint_start, timepoint_end) %>%
    rename(effect = effect_label) %>%
    pmap(possibly(function(effect, timepoint_start, timepoint_end) {
      data.frame(effect = effect,
                 age_embryo_observation_hpf = time_points[time_points < timepoint_start |
                                                      time_points > timepoint_end])
    })) %>% rbindlist()
  
  # one effect with a long name is shortened
  selected_effects[selected_effects == 'modified structure of chorda'] = 'mod. structure of chorda'
  df$effect[df$effect == 'modified structure of chorda'] = 'mod. structure of chorda'
  timepoints_prohibited[timepoints_prohibited$effect == 'modified structure of chorda'] = 'mod. structure of chorda'
  
  p_fp = df %>% 
    group_by(concentration, age_embryo_observation_hpf) %>% 
    mutate(n_embryos_in_dg = nrow(unique(pick(experiment_id, container, position_x, position_y)))) %>% 
    group_by(concentration, age_embryo_observation_hpf, effect) %>% 
    reframe(perc_in_dg = n() / n_embryos_in_dg) %>% 
    unique() %>% 
    group_by(effect) %>% 
    mutate(mean_prevalence = mean(perc_in_dg)) %>% 
    # filter(mean_prevalence > 0.1) %>% 
    filter(concentration != 0) %>% 
    ggplot(aes(
      y = factor(effect, levels = rev(selected_effects)),
      x = concentration,
      colour = perc_in_dg
    )) +
    geom_point(shape = 'square', size = 2.5, alpha = 0.8) +
    geom_hline(
      data = timepoints_prohibited,
      aes(yintercept = effect),
      colour = grey_mid,
      linewidth = 1
    ) +
    scale_x_log10(
      guide = 'axis_logticks'
    ) +
    labs(
      x = 'Concentration (µmol/L)'
    ) +
    scale_y_discrete(
      limits = rev(selected_effects)
    ) +
    scale_colour_gradient(
      name = 'Percentage of embryos\nexhibiting effect',
      low = '#ffefff', high = '#660066',
      limits = c(0, 1),
      breaks = c(0, 0.5, 1),
      guide = guide_colorbar(direction = 'horizontal')
    ) +
    facet_wrap(vars(age_embryo_observation_hpf), ncol = length(time_points)) +
    theme_time_conc +
    theme(
      axis.text.x = element_text(angle = 90, hjust = 1, vjust = 0.5),
      strip.text = element_blank(),
      legend.position = 'right',
      legend.title.position = 'top',
      legend.key.width = unit(0.5, 'cm'),
      plot.margin = unit(c(-3, 4.5, 4.5, 4.5), 'points')
    )
  
  p_conc = df %>% 
    dplyr::select(
      test_substance,
      experiment_id,
      container,
      position_x,
      position_y,
      concentration,
      age_embryo_observation_hpf
    ) %>% 
    # filter(mean_prevalence > 0.1) %>% 
    filter(concentration != 0) %>% 
    unique() %>% 
    ggplot(aes(
      y = test_substance,
      x = concentration
    )) +
    geom_violin(linewidth = 0.2, fill = grey_light, colour = grey_dark, alpha = 0.5) +
    # geom_density_ridges(
    #   linewidth = 0.2,
    #   fill = grey_light,
    #   colour = grey_dark,
    #   alpha = 0.5,
    #   panel_scaling = T
    # ) +
    geom_point(
      data = ec_lc_df,
      mapping = aes(x = ec, y = test_substance, colour = colour),
      size = 0.8,
      inherit.aes = F
    ) +
    geom_errorbarh(
      data = ec_lc_df,
      mapping = aes(xmin = ec-ec_ste, xmax = ec+ec_ste, y = test_substance, colour = colour),
      height = 0.6,
      inherit.aes = F
    ) +
    scale_colour_manual(
      name = NULL,
      values = c(
        'EC50' = 'grey60',
        'LC50' = 'grey40'
      )
    ) +
    scale_x_log10() +
    scale_y_discrete(
      expand = expansion(add = 1)
    ) +
    facet_wrap(
      vars(age_embryo_observation_hpf), ncol = length(time_points),
      labeller = function(df) {
        data.frame(labels = sprintf('%i-%i hpf', start_hpf, as.numeric(df$age_embryo_observation_hpf)))
      }
    ) +
    theme_time_conc +
    theme(
      # axis.text.x = element_text(angle = 90, size = 8, hjust = 1, vjust = 0.5),
      axis.text.x = element_blank(),
      axis.title.x = element_blank(),
      axis.ticks.x = element_blank(),
      axis.ticks.y = element_blank(),
      axis.text.y = element_blank(),
      legend.position = 'right',
      plot.margin = unit(c(4.5, 4.5, 4.5, 4.5), 'points')
    )
  
  if (return_legend) {
    p_legend = egg::ggarrange(
      ggpubr::get_legend(p_conc) %>% as_ggplot(),
      ggpubr::get_legend(p_fp) %>% as_ggplot(),
      ncol = 2,
      draw = F,
      debug = F
    )
    return(p_legend)
  } else {
    p_time_conc = egg::ggarrange(
      p_conc + theme(legend.position = 'none'),
      p_fp + theme(legend.position = 'none'),
      ncol = 1,
      heights = c(1, 12),
      draw = F,
      debug = F
    )
    return(p_time_conc)
  }
}
ggsave(
  'results/figures/effect_propagation/time_conc_fig_legend.svg',
  plot = figure_time_conc('Bisphenol A', 0, text_size = 7, return_legend = T),
  device = 'svg',
  width = 6,
  height = 2.5,
  units = 'cm',
  bg = 'white'
)
substances = c(
  'Bisphenol A',
  'Benzovindiflupyr',
  '6-Propyl-2-thiouracil',
  'Triclosan',
  'Ferbam',
  'N,N-Dimethyldecanamide',
  'Azinphos-methyl',
  'Gemfibrozil',
  'Naproxen sodium'
)
for (s in substances) {
  ggsave(
    sprintf('results/figures/effect_propagation/time_conc_fig_%s_0hpf.svg', s),
    plot = figure_time_conc(s, 0),
    device = 'svg',
    width = 9,
    height = 9,
    units = 'cm',
    bg = 'white'
  )
}
Information about the document

This document was created with the following command:

rmarkdown::render(
    input = 'intob_analysis.Rmd',
    output_format = 'html_document',
    output_file = sprintf(
        '%s_intob_analysis.html',
        format(Sys.time(), '%y%m%d')
    )
)

R version 4.3.0 (2023-04-21)

Platform: x86_64-pc-linux-gnu (64-bit)

attached base packages:

  • grid
  • stats
  • graphics
  • grDevices
  • utils
  • datasets
  • methods
  • base

other attached packages:

  • lubridate(v.1.9.3)
  • forcats(v.1.0.0)
  • stringr(v.1.5.1)
  • dplyr(v.1.1.4)
  • purrr(v.1.0.2)
  • readr(v.2.1.5)
  • tidyr(v.1.3.1)
  • tibble(v.3.2.1)
  • tidyverse(v.2.0.0)
  • ggridges(v.0.5.6)
  • data.table(v.1.15.4)
  • ggrepel(v.0.9.5)
  • ggpubr(v.0.6.0)
  • cowplot(v.1.1.3)
  • egg(v.0.4.5)
  • ggplot2(v.3.5.1)
  • gridExtra(v.2.3)
  • corrplot(v.0.92)
  • umap(v.0.2.10.0)
  • ComplexHeatmap(v.2.18.0)
  • drc(v.3.0-1)
  • MASS(v.7.3-60.0.1)
  • jsonlite(v.1.8.8)
  • ghql(v.0.1.0)

loaded via a namespace (and not attached):

  • sandwich(v.3.1-0)
  • rlang(v.1.1.3)
  • magrittr(v.2.0.3)
  • clue(v.0.3-65)
  • GetoptLong(v.1.0.5)
  • multcomp(v.1.4-25)
  • matrixStats(v.1.3.0)
  • compiler(v.4.3.0)
  • systemfonts(v.1.0.6)
  • png(v.0.1-8)
  • vctrs(v.0.6.5)
  • httpcode(v.0.3.0)
  • pkgconfig(v.2.0.3)
  • shape(v.1.4.6.1)
  • crayon(v.1.5.2)
  • fastmap(v.1.1.1)
  • backports(v.1.4.1)
  • labeling(v.0.4.3)
  • pander(v.0.6.5)
  • utf8(v.1.2.4)
  • rmarkdown(v.2.26)
  • tzdb(v.0.4.0)
  • ragg(v.1.3.0)
  • bit(v.4.0.5)
  • xfun(v.0.43)
  • cachem(v.1.0.8)
  • highr(v.0.10)
  • broom(v.1.0.5)
  • parallel(v.4.3.0)
  • cluster(v.2.1.6)
  • R6(v.2.5.1)
  • stringi(v.1.8.3)
  • bslib(v.0.7.0)
  • RColorBrewer(v.1.1-3)
  • reticulate(v.1.35.0)
  • car(v.3.1-2)
  • jquerylib(v.0.1.4)
  • Rcpp(v.1.0.12)
  • iterators(v.1.0.14)
  • knitr(v.1.46)
  • zoo(v.1.8-12)
  • IRanges(v.2.36.0)
  • timechange(v.0.3.0)
  • Matrix(v.1.6-5)
  • splines(v.4.3.0)
  • tidyselect(v.1.2.1)
  • rstudioapi(v.0.16.0)
  • abind(v.1.4-5)
  • yaml(v.2.3.8)
  • doParallel(v.1.0.17)
  • codetools(v.0.2-20)
  • curl(v.5.2.1)
  • lattice(v.0.22-6)
  • withr(v.3.0.0)
  • askpass(v.1.2.0)
  • evaluate(v.0.23)
  • survival(v.3.5-8)
  • circlize(v.0.4.16)
  • pillar(v.1.9.0)
  • carData(v.3.0-5)
  • foreach(v.1.5.2)
  • stats4(v.4.3.0)
  • generics(v.0.1.3)
  • vroom(v.1.6.5)
  • hms(v.1.1.3)
  • S4Vectors(v.0.40.2)
  • munsell(v.0.5.1)
  • scales(v.1.3.0)
  • gtools(v.3.9.5)
  • glue(v.1.7.0)
  • tools(v.4.3.0)
  • RSpectra(v.0.16-1)
  • ggsignif(v.0.6.4)
  • mvtnorm(v.1.2-4)
  • plotrix(v.3.8-4)
  • colorspace(v.2.1-0)
  • cli(v.3.6.2)
  • textshaping(v.0.3.7)
  • fansi(v.1.0.6)
  • graphql(v.1.5.1)
  • svglite(v.2.1.3)
  • gtable(v.0.3.4)
  • rstatix(v.0.7.2)
  • sass(v.0.4.9)
  • digest(v.0.6.35)
  • BiocGenerics(v.0.48.1)
  • crul(v.1.4.2)
  • TH.data(v.1.1-2)
  • farver(v.2.1.1)
  • rjson(v.0.2.21)
  • htmltools(v.0.5.8.1)
  • lifecycle(v.1.0.4)
  • GlobalOptions(v.0.1.2)
  • bit64(v.4.0.5)
  • openssl(v.2.1.1)