# Last updated on Wed Sep 9 11:33:08 2020.
# If you use the following figures in your own work, please cite:
# Load required packages
library(magrittr)
library(dplyr)
library(purrr)
library(jsonlite)
library(forcats)
library(ggplot2)
library(plotly)
library(here)
library(stringr)
# PDBe search URLs, one per title pattern. Three queries are needed: one for
# uppercase titles (*UCLEOSOM*), one for lowercase titles (*ucleosom*), and
# one for titles containing "NCP". All of them request the same fields for
# released entries only (status:REL) and return JSON.
pdb_queries <- c(
  uppercase = 'https://www.ebi.ac.uk/pdbe/search/pdb/select?q=title:*UCLEOSOM*%20AND%20status:REL&fl=pdb_id,citation_year,title,experimental_method,resolution,organism_scientific_name,molecule_name,molecule_type,number_of_protein_chains&rows=1000000&wt=json',
  lowercase = 'https://www.ebi.ac.uk/pdbe/search/pdb/select?q=title:*ucleosom*%20AND%20status:REL&fl=pdb_id,citation_year,title,experimental_method,resolution,organism_scientific_name,molecule_name,molecule_type,number_of_protein_chains&rows=1000000&wt=json',
  ncp = 'https://www.ebi.ac.uk/pdbe/search/pdb/select?q=title:*NCP*%20AND%20status:REL&fl=pdb_id,citation_year,title,experimental_method,resolution,organism_scientific_name,molecule_name,molecule_type,number_of_protein_chains&rows=1000000&wt=json'
)
# The following PDB entries have the word "nucleosome" in their title, but do
# not actually contain a nucleosome, so we need to exclude them from the
# analysis. Unfortunately, there is no good way to automate this.
# Manually curated PDB IDs (lowercase) that are dropped from the query results
# before any statistics are computed.
non_nucleosome_structures <- c(
  "1hst",
  "2z2r",
  "5x7v",
  "3uv2",
  "3fs3",
  "1wg3",
  "1nw3",
  "5ikf",
  "3gyw",
  "3gyv",
  "1ofc",
  "2ayu",
  "2iw5",
  "3hfd",
  "6uch",
  "5r4k",
  "5r4m",
  "5r4l",
  "5r4o",
  "5r4g",
  "5r4h",
  "5r4j",
  "5r4i",
  "5r4n",
  "1bj6",
  "6qds",
  "2iwj",
  "4dvk",
  "1a6b",
  "6qdu",
  "6ne8",
  "1tsu",
  "1esk",
  "7c4j"
)
# Helper pipeline to extract data from one parsed query result.
# `. %>% ...` builds a magrittr functional sequence, i.e. a one-argument
# function: it takes the list produced by fromJSON(), pulls out
# `response$docs`, and returns it as a tibble.
dig_up_data <- . %>%
  .$response %>%
  .$docs %>%
  as_tibble()
# Helper function to detect the presence of a binding factor in ONE entry.
#
# Arguments:
#   number_of_protein_chains  number of protein chains in the entry (scalar)
#   title                     title of the PDB entry (single string)
# Returns:
#   TRUE if the entry is considered to contain a binding factor, else FALSE.
has_binding_factor_one <- function(number_of_protein_chains, title) {
  # It takes 8 histone chains to make a nucleosome, so if the number of
  # protein chains is not divisible by 8, this means there is a binding factor,
  # unless we're seeing the overlapping dinucleosome (14 protein chains, but
  # they are all histones... one octamer + one hexamer). That entry is
  # recognised by the word "unusual" in its title.
  oldn <- str_detect(title, "unusual")
  # Scalar condition, so use the short-circuiting `&&` rather than `&`.
  if (number_of_protein_chains %% 8 != 0 && !oldn) {
    return(TRUE)
  }

  # But for multiples of 8 chains > 8 (i.e. 16, 24, 32, 40), we can have this
  # number of chains by chance even with binding factors. There is no good
  # way to automatically find these cases, unfortunately, so the known ones
  # are matched by title.
  compass <- str_detect(title, "COMPASS")
  corest <- str_detect(title, "LSD1/CoREST")
  compass || corest
}
# Vectorized version of has_binding_factor_one(): applies it to each pair of
# (number of protein chains, title) and returns a logical vector with one
# element per pair.
has_binding_factor <- function(number_of_protein_chains_vector, title) {
  map2_lgl(number_of_protein_chains_vector, title, has_binding_factor_one)
}
# Query the PDB and clean up data: run every query, stack the results, drop
# the manually excluded entries, flag binding factors, and normalise column
# types.
pdb_data <- pdb_queries %>%
  map(fromJSON) %>%
  map(dig_up_data) %>%
  bind_rows() %>%
  filter(!(pdb_id %in% non_nucleosome_structures)) %>%
  mutate(
    has_binding_factor = has_binding_factor(number_of_protein_chains, title),
    experimental_method = as_factor(as.character(experimental_method)),
    citation_year = as.integer(citation_year),
    molecule_name = as.character(molecule_name),
    molecule_type = as_factor(as.character(molecule_type))
  ) %>%
  # The query results are stacked, so keep a single row per PDB entry.
  distinct(pdb_id, .keep_all = TRUE)
# All figures are interactive (you can zoom in, and hovering over elements will show more information).
# Bar chart: number of entries per publication year, coloured by
# experimental method.
nucleosome_structures_year <- pdb_data %>%
  ggplot() +
  geom_bar(mapping = aes(x = citation_year, fill = experimental_method)) +
  guides(fill = guide_legend(title = "Experimental method")) +
  ggtitle("Structures of nucleosomes by year") +
  xlab("Publication year") +
  ylab("Number of PDB entries") +
  theme_bw()
ggplotly(nucleosome_structures_year)
# Bar chart: number of entries per publication year, coloured by whether the
# entry was flagged as containing a binding factor.
nucleosome_structures_year_binding_factor <- pdb_data %>%
  ggplot() +
  geom_bar(mapping = aes(x = citation_year, fill = has_binding_factor)) +
  guides(fill = guide_legend(title = "Contains a binding factor")) +
  ggtitle("Structures of nucleosomes by year") +
  xlab("Publication year") +
  ylab("Number of PDB entries") +
  theme_bw()
ggplotly(nucleosome_structures_year_binding_factor)
# Bar chart: number of entries per experimental method.
nucleosome_structures_method <- pdb_data %>%
  ggplot() +
  geom_bar(mapping = aes(x = experimental_method, fill = experimental_method)) +
  guides(fill = guide_legend(title = "Experimental method")) +
  ggtitle("Structures of nucleosomes by experimental method") +
  xlab("") +
  ylab("Number of PDB entries") +
  theme_bw()
ggplotly(nucleosome_structures_method)
# Bar chart: number of entries per experimental method, coloured by the
# binding-factor flag.
nucleosome_binding_factors_methods <- pdb_data %>%
  ggplot() +
  geom_bar(mapping = aes(x = experimental_method, fill = has_binding_factor)) +
  guides(fill = guide_legend(title = "Presence of a binding factor")) +
  ggtitle("Structures of nucleosomes by presence of a binding factor") +
  xlab("Experimental method") +
  ylab("Number of PDB entries") +
  theme_bw()
ggplotly(nucleosome_binding_factors_methods)
# Bar chart: number of entries with and without a binding factor.
nucleosome_binding_factors <- pdb_data %>%
  ggplot() +
  geom_bar(mapping = aes(x = has_binding_factor, fill = has_binding_factor)) +
  guides(fill = guide_legend(title = "Presence of a binding factor")) +
  ggtitle("Structures of nucleosomes by presence of a binding factor") +
  xlab("Presence of a binding factor") +
  ylab("Number of PDB entries") +
  theme_bw()
ggplotly(nucleosome_binding_factors)
# Bar chart: number of entries with and without a binding factor, coloured by
# experimental method.
nucleosome_binding_factors_methods_2 <- pdb_data %>%
  ggplot() +
  geom_bar(mapping = aes(x = has_binding_factor, fill = experimental_method)) +
  guides(fill = guide_legend(title = "Experimental method")) +
  ggtitle("Structures of nucleosomes by presence of a binding factor") +
  xlab("Presence of a binding factor") +
  ylab("Number of PDB entries") +
  theme_bw()
ggplotly(nucleosome_binding_factors_methods_2)
# Bar chart: X-ray diffraction entries by organism, restricted to rows whose
# molecule name matches "Histone H".
nucleosome_xtal_species <- pdb_data %>%
  filter(experimental_method == "X-ray diffraction") %>%
  mutate(is_histone = str_detect(molecule_name, pattern = "Histone H")) %>%
  filter(is_histone == TRUE) %>%
  mutate(organism_scientific_name = as_factor(as.character(organism_scientific_name))) %>%
  ggplot() +
  geom_bar(mapping = aes(x = organism_scientific_name,
                         fill = organism_scientific_name)) +
  guides(fill = guide_legend(title = "Histone species")) +
  ggtitle("Crystal structures of nucleosomes by histone species") +
  xlab("") +
  ylab("Number of PDB entries") +
  theme_bw() +
  # Slant the species labels on the x axis.
  theme(axis.text.x = element_text(angle = 30, hjust = 1))
ggplotly(nucleosome_xtal_species)
# Bar chart: electron microscopy entries by organism, restricted to rows whose
# molecule name matches "Histone H".
nucleosome_cryoem_species <- pdb_data %>%
  filter(experimental_method == "Electron Microscopy") %>%
  mutate(is_histone = str_detect(molecule_name, pattern = "Histone H")) %>%
  filter(is_histone == TRUE) %>%
  mutate(organism_scientific_name = as_factor(as.character(organism_scientific_name))) %>%
  ggplot() +
  geom_bar(mapping = aes(x = organism_scientific_name,
                         fill = organism_scientific_name)) +
  guides(fill = guide_legend(title = "Histone species")) +
  ggtitle("Cryo-EM structures of nucleosomes by histone species") +
  xlab("") +
  ylab("Number of PDB entries") +
  theme_bw() +
  # Slant the species labels on the x axis.
  theme(axis.text.x = element_text(angle = 30, hjust = 1))
ggplotly(nucleosome_cryoem_species)
# Summary statistics of the resolution of X-ray diffraction entries.
pdb_data %>%
  dplyr::filter(experimental_method == "X-ray diffraction") %>%
  .$resolution %>%
  summary()
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 1.940 2.500 2.750 2.908 3.000 9.700
# Histogram (0.2-wide bins) of the resolution of X-ray diffraction entries,
# coloured by organism, restricted to rows whose molecule name matches
# "Histone H".
nucleosome_xtal_resolution_species <- pdb_data %>%
  filter(experimental_method == "X-ray diffraction") %>%
  mutate(is_histone = str_detect(molecule_name, pattern = "Histone H")) %>%
  filter(is_histone == TRUE) %>%
  mutate(organism_scientific_name = as_factor(as.character(organism_scientific_name))) %>%
  select(resolution, organism_scientific_name) %>%
  ggplot() +
  geom_histogram(aes(x = resolution, fill = organism_scientific_name),
                 binwidth = 0.2) +
  guides(fill = guide_legend(title = "Histone species")) +
  ggtitle("Resolution of nucleosome crystal structures by histone species") +
  xlab("Resolution (Å)") +
  ylab("Number of PDB entries") +
  theme_bw()
ggplotly(nucleosome_xtal_resolution_species)
# Summary statistics of the resolution of electron microscopy entries.
pdb_data %>%
  dplyr::filter(experimental_method == "Electron Microscopy") %>%
  .$resolution %>%
  summary()
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 2.600 3.600 4.000 4.916 4.800 20.000
# Histogram (0.2-wide bins) of the resolution of electron microscopy entries,
# coloured by organism, restricted to rows whose molecule name matches
# "Histone H".
nucleosome_cryoem_resolution_species <- pdb_data %>%
  filter(experimental_method == "Electron Microscopy") %>%
  mutate(is_histone = str_detect(molecule_name, pattern = "Histone H")) %>%
  filter(is_histone == TRUE) %>%
  mutate(organism_scientific_name = as_factor(as.character(organism_scientific_name))) %>%
  select(resolution, organism_scientific_name) %>%
  ggplot() +
  geom_histogram(aes(x = resolution, fill = organism_scientific_name),
                 binwidth = 0.2) +
  guides(fill = guide_legend(title = "Histone species")) +
  ggtitle("Resolution of nucleosome cryo-EM structures by histone species") +
  xlab("Resolution (Å)") +
  ylab("Number of PDB entries") +
  theme_bw()
ggplotly(nucleosome_cryoem_resolution_species)
# The graphs presented above are derived from the following dataset:
# Format table for display: most recent citation year first, with
# human-readable column names.
pdb_table <- pdb_data %>%
  arrange(desc(citation_year)) %>%
  select(`PDB code` = pdb_id,
         `Citation year` = citation_year,
         `Experimental method` = experimental_method,
         Title = title)
pdb_table