aux$cluster = as.character(aux$cluster)
# points to draw vertical lines
cutpoints = c(metric_ranges[[as.character(k)]] %>% filter(metric==metric_name) %>% pull(min_value), metric_ranges[[as.character(k)]] %>% filter(metric==metric_name) %>% pull(max_value))
plot = ggplot(aux, aes(x = .data[[metric_name]], y = 1, fill = cluster, point_color = cluster)) +
geom_density_ridges(jittered_points = TRUE, size = 0.2, alpha = 0.4, stat = "density_ridges", panel_scaling=F) +
theme_ridges() +
annotate(geom = 'text', label = annotationText, x = -Inf, y = Inf, hjust = 0, vjust = 1) +
ylab('Density') +
geom_vline(xintercept = cutpoints, linetype="dotted") +
scale_x_continuous(labels=cutpoints, breaks=cutpoints, guide = guide_axis(check.overlap = T, angle = 90))
return(plot)
}
# Calculates the k range to be tested in evaluome for the given dataframe.
#
# Starting from the widest range [2, n] (n = number of complete rows), the
# upper bound is shrunk one step at a time until annotateClustersByMetric()
# runs without raising an error.
#
# df:   data frame whose first column is the individual's name/id.
# bs:   bootstrap count forwarded to annotateClustersByMetric().
# seed: RNG seed forwarded to annotateClustersByMetric().
#
# Returns c(min_k, max_k) on success, or a single NA when no valid range exists.
calculateKRange <- function(df, bs, seed) {
  min_k <- 2
  max_k <- nrow(na.omit(df))
  valid_k_range <- FALSE
  # Scalar, short-circuiting && is the correct operator in a while guard
  # (the original used the elementwise &).
  while (!valid_k_range && min_k <= max_k) {
    x <- try(annotateClustersByMetric(df, k.range = c(min_k, max_k), bs = bs, seed = seed),
             silent = TRUE)
    # inherits() is the safe test: class(x) may have length > 1, which makes
    # `class(x) != "try-error"` unreliable (and an error inside if() on R >= 4.2).
    if (!inherits(x, "try-error")) {
      valid_k_range <- TRUE
    } else {
      max_k <- max_k - 1
    }
  }
  if (min_k > max_k) {
    # No k range worked for this metric.
    return(NA)
  }
  c(min_k, max_k)
}
# Computes, for every metric column of `df`, the k range in which evaluome
# can partition that single metric. The first column is assumed to hold the
# individual's name and is carried along with each one-metric slice.
#
# Returns a named list mapping metric name -> c(min_k, max_k), or NA for
# metrics that admit no valid range (see calculateKRange()).
getValidKRangesPerMetric <- function(df, bs, seed) {
  id_col <- colnames(df)[1]
  metric_cols <- colnames(df)[-1]
  ranges <- list()
  for (m in metric_cols) {
    one_metric_df <- df[, c(id_col, m)]
    ranges[[m]] <- calculateKRange(one_metric_df, bs, seed)
  }
  ranges
}
# Receives the original data in long format and a list of metric names, and
# saves one violin + box plot per metric (as <metric>_violin.pdf) into
# output_folder, annotated with the Shapiro-Wilk normality test p-value.
#
# all:           long-format data frame with at least Metric and Value columns.
# metric_names:  character vector of metric names to plot.
# output_folder: directory where the PDF files are written.
plot_metric_violins <- function(all, metric_names, output_folder){
  # (The original's unused `data = spread(all, Metric, Value)` was removed.)
  for (metric_name in metric_names){
    x = filter(all, Metric == metric_name)
    metric_values = x$Value %>% na.exclude()
    # shapiro.test() errors with fewer than 3 non-missing observations.
    if (length(metric_values) < 3) {
      warning(paste('Skipping', metric_name, ': fewer than 3 non-missing values'))
      next
    }
    shapiro_p_value = format(shapiro.test(metric_values)$p.value, digits=4)
    # 10% headroom above the largest value so the annotation fits.
    ymax = 1.1 * max(metric_values)
    plot = ggplot(data = x, aes(x=Metric, y=Value)) +
      geom_violin() +
      geom_boxplot(width=0.2) +
      ylim(0, ymax) +
      labs(x = "") +
      annotate(geom = 'text', label = paste("Shapiro test p-value =", shapiro_p_value),
               size=3, x = -Inf, y = Inf, hjust = 0, vjust = 1)
    filename = paste0(metric_name, "_violin.pdf")
    ggsave(plot = plot, path = output_folder, filename = filename,
           width = 10, height = 15, units="cm", device='pdf', dpi=600)
  }
}
# ---- Console transcript: load the long-format metrics file and find k ranges ----
# NOTE(review): machine/user-specific temp path; parameterize before reuse.
all_long = read.csv2("/tmp/francisco.abad@um.es18180782099001697743/metrics.tsv", header = T, sep = "\t", na.strings = "NaN", stringsAsFactors = F)
# read.csv2 assumes dec = ",", so Value arrives as character; coerce it.
all_long$Value = as.numeric(all_long$Value)
all_long$Ontology = as.character(all_long$Ontology)
all_metrics = unique(all_long$Metric)
# Long -> wide: one column per metric. (spread() is superseded by pivot_wider().)
all = spread(all_long, Metric, Value)
# Drop the file extension from the ontology file names.
all$Ontology = tools::file_path_sans_ext(all$Ontology)
# Evaluome params for searching the optimal k of each metric.
bs = 10
seed = 100
# Calculate, for each metric, the k range in which evaluome does not crash.
kranges_per_metric = getValidKRangesPerMetric(all, bs=bs, seed=seed)
# Select the minimal k range found and remove the metrics that could not be partitioned
k.range = NULL
for (metric_name in names(kranges_per_metric)) {
# calculateKRange() returns a single NA when no valid range exists.
if (length(kranges_per_metric[[metric_name]]) == 1 && is.na(kranges_per_metric[[metric_name]])){
# NOTE(review): cat() prints and returns NULL, so warning() receives no
# message here — this should be warning(paste(...)) instead.
warning(cat('The metric', metric_name, 'cannot be splited into subgroups. Ignoring...\n'))
all = all %>% select(-!!metric_name)
} else if(is.null(k.range) || kranges_per_metric[[metric_name]][2] < k.range[2]) {
# Keep the narrowest upper bound seen so far as the shared k range.
k.range = kranges_per_metric[[metric_name]]
}
}
# Interactive inspection below: results are printed, not assigned.
all
View(all)
all_metrics
# NOTE(review): `x` is the wrong variable on the next line (leftover from a
# previous expression); the corrected form follows immediately after. Neither
# result is assigned back to all_metrics.
all_metrics [!x == "ClassesWithNoNameMetric"]
all_metrics [!all_metrics == "ClassesWithNoNameMetric"]
# Maps a mean stability value (expected in [0, 1]) onto its qualitative
# interpretation band.
#
# x: numeric scalar stability value.
#
# Returns "Unstable" (< 0.60), "Doubtful" (<= 0.75), "Stable" (<= 0.85),
# "Highly stable" (<= 1), or NA_character_ when x is NA (the original raised
# an error on NA input). For x > 1 it returns NULL, as before.
getStabilityInterpretation <- function(x) {
  if (is.na(x)) {
    return(NA_character_)
  }
  if (x < 0.60) {
    "Unstable"
  } else if (x <= 0.75) {
    "Doubtful"
  } else if (x <= 0.85) {
    "Stable"
  } else if (x <= 1) {
    "Highly stable"
  }
}
# Maps an average silhouette width (expected in [0, 1]) onto its qualitative
# cluster-structure interpretation band.
#
# x: numeric scalar quality (silhouette) value.
#
# Returns "No structure" (< 0.25), "Weak structure" (<= 0.50), "Reasonable
# structure" (<= 0.70), "Strong structure" (<= 1), or NA_character_ when x is
# NA (the original raised an error on NA input). For x > 1 it returns NULL,
# as before.
getQualityInterpretation <- function(x) {
  if (is.na(x)) {
    return(NA_character_)
  }
  if (x < 0.25) {
    "No structure"
  } else if (x <= 0.50) {
    "Weak structure"
  } else if (x <= 0.70) {
    "Reasonable structure"
  } else if (x <= 1) {
    "Strong structure"
  }
}
# Receives the data and the name of a metric and builds a ridgeline density
# plot of that metric, drawing each individual as a jittered point coloured
# by its cluster.
printDensityPlotWithPoints <- function(data, metric) {
  # Drop rows with missing values and make the cluster id categorical so it
  # maps to discrete fill/point colours.
  metric_df <- na.omit(data[[metric]])
  metric_df$cluster <- as.character(metric_df$cluster)
  ggplot(metric_df,
         aes(x = .data[[metric]], y = 1, fill = cluster, point_color = cluster)) +
    geom_density_ridges(jittered_points = TRUE, size = 0.2, alpha = 0.4,
                        stat = "density_ridges", panel_scaling = F) +
    theme_ridges() +
    ylab('Density')
}
# Receives the data, the name of a metric, a value of k, the stability data,
# the quality data and the metric ranges, and makes a ridgeline density plot
# annotated with the stability/quality values and their interpretation, plus
# dotted vertical lines at the per-cluster min/max cutpoints.
#
# BUG FIX: the original body referenced the globals `metric_name` and `x`
# instead of the parameters `metric` and `data`, so it only worked when those
# globals happened to hold the same values.
printDensityPlotWithPointsAndInfo <- function(data, metric, k, stability_data, quality_data, metric_ranges){
  stability_row_name = paste('Mean_stability_k_', k, sep='')
  # !!metric forces the *parameter* value; bare `metric` could be captured by
  # a column of the same name under dplyr's data masking.
  stability_k = as.numeric(stability_data %>% filter(Metric == !!metric) %>% pull(!!sym(stability_row_name)))
  quality_col_name = paste('k_', k, sep = '')
  quality_k = as.data.frame(assay(quality_data[[quality_col_name]])) %>%
    filter(Metric == !!metric) %>%
    pull(Avg_Silhouette_Width) %>%
    as.numeric()
  annotationText = paste('Stability = ', format(round(stability_k, 3), nsmall = 3), '-',
                         getStabilityInterpretation(stability_k),
                         '\nQuality = ', format(round(quality_k, 3), nsmall = 3), '-',
                         getQualityInterpretation(quality_k))
  # Clustered values of this metric at the requested k.
  aux = na.omit(data[[metric]][[as.character(k)]])
  aux$cluster = as.character(aux$cluster)
  # Points to draw vertical lines: per-cluster min and max of the metric.
  ranges_k = metric_ranges[[as.character(k)]] %>% filter(metric == !!metric)
  cutpoints = c(ranges_k %>% pull(min_value), ranges_k %>% pull(max_value))
  plot = ggplot(aux, aes(x = .data[[metric]], y = 1, fill = cluster, point_color = cluster)) +
    geom_density_ridges(jittered_points = TRUE, size = 0.2, alpha = 0.4,
                        stat = "density_ridges", panel_scaling = F) +
    theme_ridges() +
    annotate(geom = 'text', label = annotationText, x = -Inf, y = Inf, hjust = 0, vjust = 1) +
    ylab('Density') +
    geom_vline(xintercept = cutpoints, linetype = "dotted") +
    scale_x_continuous(labels = cutpoints, breaks = cutpoints,
                       guide = guide_axis(check.overlap = T, angle = 90))
  return(plot)
}
# Calculates the k range to be tested in evaluome for the given dataframe.
#
# Starting from the widest range [2, n] (n = number of complete rows), the
# upper bound is shrunk one step at a time until annotateClustersByMetric()
# runs without raising an error.
#
# df:   data frame whose first column is the individual's name/id.
# bs:   bootstrap count forwarded to annotateClustersByMetric().
# seed: RNG seed forwarded to annotateClustersByMetric().
#
# Returns c(min_k, max_k) on success, or a single NA when no valid range exists.
calculateKRange <- function(df, bs, seed) {
  min_k <- 2
  max_k <- nrow(na.omit(df))
  valid_k_range <- FALSE
  # Scalar, short-circuiting && is the correct operator in a while guard
  # (the original used the elementwise &).
  while (!valid_k_range && min_k <= max_k) {
    x <- try(annotateClustersByMetric(df, k.range = c(min_k, max_k), bs = bs, seed = seed),
             silent = TRUE)
    # inherits() is the safe test: class(x) may have length > 1, which makes
    # `class(x) != "try-error"` unreliable (and an error inside if() on R >= 4.2).
    if (!inherits(x, "try-error")) {
      valid_k_range <- TRUE
    } else {
      max_k <- max_k - 1
    }
  }
  if (min_k > max_k) {
    # No k range worked for this metric.
    return(NA)
  }
  c(min_k, max_k)
}
# Computes, for every metric column of `df`, the k range in which evaluome
# can partition that single metric. The first column is assumed to hold the
# individual's name and is carried along with each one-metric slice.
#
# Returns a named list mapping metric name -> c(min_k, max_k), or NA for
# metrics that admit no valid range (see calculateKRange()).
getValidKRangesPerMetric <- function(df, bs, seed) {
  id_col <- colnames(df)[1]
  metric_cols <- colnames(df)[-1]
  ranges <- list()
  for (m in metric_cols) {
    one_metric_df <- df[, c(id_col, m)]
    ranges[[m]] <- calculateKRange(one_metric_df, bs, seed)
  }
  ranges
}
# Receives the original data in long format and a list of metric names, and
# saves one violin + box plot per metric (as <metric>_violin.pdf) into
# output_folder, annotated with the Shapiro-Wilk normality test p-value.
#
# all:           long-format data frame with at least Metric and Value columns.
# metric_names:  character vector of metric names to plot.
# output_folder: directory where the PDF files are written.
plot_metric_violins <- function(all, metric_names, output_folder){
  # (The original's unused `data = spread(all, Metric, Value)` was removed.)
  for (metric_name in metric_names){
    x = filter(all, Metric == metric_name)
    metric_values = x$Value %>% na.exclude()
    # shapiro.test() errors with fewer than 3 non-missing observations.
    if (length(metric_values) < 3) {
      warning(paste('Skipping', metric_name, ': fewer than 3 non-missing values'))
      next
    }
    shapiro_p_value = format(shapiro.test(metric_values)$p.value, digits=4)
    # 10% headroom above the largest value so the annotation fits.
    ymax = 1.1 * max(metric_values)
    plot = ggplot(data = x, aes(x=Metric, y=Value)) +
      geom_violin() +
      geom_boxplot(width=0.2) +
      ylim(0, ymax) +
      labs(x = "") +
      annotate(geom = 'text', label = paste("Shapiro test p-value =", shapiro_p_value),
               size=3, x = -Inf, y = Inf, hjust = 0, vjust = 1)
    filename = paste0(metric_name, "_violin.pdf")
    ggsave(plot = plot, path = output_folder, filename = filename,
           width = 10, height = 15, units="cm", device='pdf', dpi=600)
  }
}
# ---- Console transcript: second pass over the metrics file ----
# NOTE(review): machine/user-specific temp path; parameterize before reuse.
all_long = read.csv2("/tmp/francisco.abad@um.es18180782099001697743/metrics.tsv", header = T, sep = "\t", na.strings = "NaN", stringsAsFactors = F)
# read.csv2 assumes dec = ",", so Value arrives as character; coerce it.
all_long$Value = as.numeric(all_long$Value)
all_long$Ontology = as.character(all_long$Ontology)
all_metrics = unique(all_long$Metric)
# Long -> wide: one column per metric. (spread() is superseded by pivot_wider().)
all = spread(all_long, Metric, Value)
all$Ontology = tools::file_path_sans_ext(all$Ontology)
# Evaluome params for searching the optimal k of each metric.
bs = 10
seed = 100
# Calculate, for each metric, the k range in which evaluome does not crash.
kranges_per_metric = getValidKRangesPerMetric(all, bs=bs, seed=seed)
# Select the minimal k range found and remove the metrics that could not be partitioned
k.range = NULL
for (metric_name in names(kranges_per_metric)) {
if (length(kranges_per_metric[[metric_name]]) == 1 && is.na(kranges_per_metric[[metric_name]])){
# NOTE(review): cat() prints and returns NULL, so warning() receives no message.
warning(cat('The metric', metric_name, 'cannot be splited into subgroups. Ignoring...\n'))
all = all %>% select(-!!metric_name)
# NOTE(review): the next line's result is discarded — it is not assigned
# back to all_metrics, so the metric is not actually removed from the vector.
all_metrics [!all_metrics == metric_name]
} else if(is.null(k.range) || kranges_per_metric[[metric_name]][2] < k.range[2]) {
# Keep the narrowest upper bound seen so far as the shared k range.
k.range = kranges_per_metric[[metric_name]]
}
}
# Interactive attempts to rebuild all_metrics from the remaining columns.
all_metrics
colnames(all)
# NOTE(review): `colnames(all)[2:]` is not valid R syntax (incomplete
# sequence) — it fails to parse; the working forms follow below.
colnames(all)[2:]
colnames(all)[2:5]
colnames(all)[2:length(colnames(all))]
all_metrics = colnames(all)[2:length(colnames(all))]
all_metrics
library(ggridges)
library(ggplot2)
library(egg)
library(dplyr)
library(tidyr)
library(evaluomeR)
library(tools)
library(factoextra)
library(corrplot)
library(RColorBrewer)
library(optparse)
# Maps a mean stability value (expected in [0, 1]) onto its qualitative
# interpretation band.
#
# x: numeric scalar stability value.
#
# Returns "Unstable" (< 0.60), "Doubtful" (<= 0.75), "Stable" (<= 0.85),
# "Highly stable" (<= 1), or NA_character_ when x is NA (the original raised
# an error on NA input). For x > 1 it returns NULL, as before.
getStabilityInterpretation <- function(x) {
  if (is.na(x)) {
    return(NA_character_)
  }
  if (x < 0.60) {
    "Unstable"
  } else if (x <= 0.75) {
    "Doubtful"
  } else if (x <= 0.85) {
    "Stable"
  } else if (x <= 1) {
    "Highly stable"
  }
}
# Maps an average silhouette width (expected in [0, 1]) onto its qualitative
# cluster-structure interpretation band.
#
# x: numeric scalar quality (silhouette) value.
#
# Returns "No structure" (< 0.25), "Weak structure" (<= 0.50), "Reasonable
# structure" (<= 0.70), "Strong structure" (<= 1), or NA_character_ when x is
# NA (the original raised an error on NA input). For x > 1 it returns NULL,
# as before.
getQualityInterpretation <- function(x) {
  if (is.na(x)) {
    return(NA_character_)
  }
  if (x < 0.25) {
    "No structure"
  } else if (x <= 0.50) {
    "Weak structure"
  } else if (x <= 0.70) {
    "Reasonable structure"
  } else if (x <= 1) {
    "Strong structure"
  }
}
# Receives the data and the name of a metric and builds a ridgeline density
# plot of that metric, drawing each individual as a jittered point coloured
# by its cluster.
printDensityPlotWithPoints <- function(data, metric) {
  # Drop rows with missing values and make the cluster id categorical so it
  # maps to discrete fill/point colours.
  metric_df <- na.omit(data[[metric]])
  metric_df$cluster <- as.character(metric_df$cluster)
  ggplot(metric_df,
         aes(x = .data[[metric]], y = 1, fill = cluster, point_color = cluster)) +
    geom_density_ridges(jittered_points = TRUE, size = 0.2, alpha = 0.4,
                        stat = "density_ridges", panel_scaling = F) +
    theme_ridges() +
    ylab('Density')
}
# Receives the data, the name of a metric, a value of k, the stability data,
# the quality data and the metric ranges, and makes a ridgeline density plot
# annotated with the stability/quality values and their interpretation, plus
# dotted vertical lines at the per-cluster min/max cutpoints.
#
# BUG FIX: the original body referenced the globals `metric_name` and `x`
# instead of the parameters `metric` and `data`, so it only worked when those
# globals happened to hold the same values.
printDensityPlotWithPointsAndInfo <- function(data, metric, k, stability_data, quality_data, metric_ranges){
  stability_row_name = paste('Mean_stability_k_', k, sep='')
  # !!metric forces the *parameter* value; bare `metric` could be captured by
  # a column of the same name under dplyr's data masking.
  stability_k = as.numeric(stability_data %>% filter(Metric == !!metric) %>% pull(!!sym(stability_row_name)))
  quality_col_name = paste('k_', k, sep = '')
  quality_k = as.data.frame(assay(quality_data[[quality_col_name]])) %>%
    filter(Metric == !!metric) %>%
    pull(Avg_Silhouette_Width) %>%
    as.numeric()
  annotationText = paste('Stability = ', format(round(stability_k, 3), nsmall = 3), '-',
                         getStabilityInterpretation(stability_k),
                         '\nQuality = ', format(round(quality_k, 3), nsmall = 3), '-',
                         getQualityInterpretation(quality_k))
  # Clustered values of this metric at the requested k.
  aux = na.omit(data[[metric]][[as.character(k)]])
  aux$cluster = as.character(aux$cluster)
  # Points to draw vertical lines: per-cluster min and max of the metric.
  ranges_k = metric_ranges[[as.character(k)]] %>% filter(metric == !!metric)
  cutpoints = c(ranges_k %>% pull(min_value), ranges_k %>% pull(max_value))
  plot = ggplot(aux, aes(x = .data[[metric]], y = 1, fill = cluster, point_color = cluster)) +
    geom_density_ridges(jittered_points = TRUE, size = 0.2, alpha = 0.4,
                        stat = "density_ridges", panel_scaling = F) +
    theme_ridges() +
    annotate(geom = 'text', label = annotationText, x = -Inf, y = Inf, hjust = 0, vjust = 1) +
    ylab('Density') +
    geom_vline(xintercept = cutpoints, linetype = "dotted") +
    scale_x_continuous(labels = cutpoints, breaks = cutpoints,
                       guide = guide_axis(check.overlap = T, angle = 90))
  return(plot)
}
# Calculates the k range to be tested in evaluome for the given dataframe.
#
# Starting from the widest range [2, n] (n = number of complete rows), the
# upper bound is shrunk one step at a time until annotateClustersByMetric()
# runs without raising an error.
#
# df:   data frame whose first column is the individual's name/id.
# bs:   bootstrap count forwarded to annotateClustersByMetric().
# seed: RNG seed forwarded to annotateClustersByMetric().
#
# Returns c(min_k, max_k) on success, or a single NA when no valid range exists.
calculateKRange <- function(df, bs, seed) {
  min_k <- 2
  max_k <- nrow(na.omit(df))
  valid_k_range <- FALSE
  # Scalar, short-circuiting && is the correct operator in a while guard
  # (the original used the elementwise &).
  while (!valid_k_range && min_k <= max_k) {
    x <- try(annotateClustersByMetric(df, k.range = c(min_k, max_k), bs = bs, seed = seed),
             silent = TRUE)
    # inherits() is the safe test: class(x) may have length > 1, which makes
    # `class(x) != "try-error"` unreliable (and an error inside if() on R >= 4.2).
    if (!inherits(x, "try-error")) {
      valid_k_range <- TRUE
    } else {
      max_k <- max_k - 1
    }
  }
  if (min_k > max_k) {
    # No k range worked for this metric.
    return(NA)
  }
  c(min_k, max_k)
}
# Computes, for every metric column of `df`, the k range in which evaluome
# can partition that single metric. The first column is assumed to hold the
# individual's name and is carried along with each one-metric slice.
#
# Returns a named list mapping metric name -> c(min_k, max_k), or NA for
# metrics that admit no valid range (see calculateKRange()).
getValidKRangesPerMetric <- function(df, bs, seed) {
  id_col <- colnames(df)[1]
  metric_cols <- colnames(df)[-1]
  ranges <- list()
  for (m in metric_cols) {
    one_metric_df <- df[, c(id_col, m)]
    ranges[[m]] <- calculateKRange(one_metric_df, bs, seed)
  }
  ranges
}
# Receives the original data in long format and a list of metric names, and
# saves one violin + box plot per metric (as <metric>_violin.pdf) into
# output_folder, annotated with the Shapiro-Wilk normality test p-value.
#
# all:           long-format data frame with at least Metric and Value columns.
# metric_names:  character vector of metric names to plot.
# output_folder: directory where the PDF files are written.
plot_metric_violins <- function(all, metric_names, output_folder){
  # (The original's unused `data = spread(all, Metric, Value)` was removed.)
  for (metric_name in metric_names){
    x = filter(all, Metric == metric_name)
    metric_values = x$Value %>% na.exclude()
    # shapiro.test() errors with fewer than 3 non-missing observations.
    if (length(metric_values) < 3) {
      warning(paste('Skipping', metric_name, ': fewer than 3 non-missing values'))
      next
    }
    shapiro_p_value = format(shapiro.test(metric_values)$p.value, digits=4)
    # 10% headroom above the largest value so the annotation fits.
    ymax = 1.1 * max(metric_values)
    plot = ggplot(data = x, aes(x=Metric, y=Value)) +
      geom_violin() +
      geom_boxplot(width=0.2) +
      ylim(0, ymax) +
      labs(x = "") +
      annotate(geom = 'text', label = paste("Shapiro test p-value =", shapiro_p_value),
               size=3, x = -Inf, y = Inf, hjust = 0, vjust = 1)
    filename = paste0(metric_name, "_violin.pdf")
    ggsave(plot = plot, path = output_folder, filename = filename,
           width = 10, height = 15, units="cm", device='pdf', dpi=600)
  }
}
# ---- Console transcript: full pipeline run on the local copy of the file ----
all_long = read.csv2("/home/fabad/Descargas/metrics.tsv", header = T, sep = "\t", na.strings = "NaN", stringsAsFactors = F)
# read.csv2 assumes dec = ",", so Value arrives as character; coerce it.
all_long$Value = as.numeric(all_long$Value)
all_long$Ontology = as.character(all_long$Ontology)
# Long -> wide: one column per metric.
all = spread(all_long, Metric, Value)
all$Ontology = tools::file_path_sans_ext(all$Ontology)
View(all)
# Evaluome params for searching the optimal k of each metric.
bs = 10
seed = 100
# Calculate, for each metric, the k range in which evaluome does not crash.
kranges_per_metric = getValidKRangesPerMetric(all, bs=bs, seed=seed)
# Select the minimal k range found and remove the metrics that could not be partitioned
k.range = NULL
for (metric_name in names(kranges_per_metric)) {
if (length(kranges_per_metric[[metric_name]]) == 1 && is.na(kranges_per_metric[[metric_name]])){
# NOTE(review): cat() prints and returns NULL, so warning() receives no message.
warning(cat('The metric', metric_name, 'cannot be splited into subgroups. Ignoring...\n'))
all = all %>% select(-!!metric_name)
} else if(is.null(k.range) || kranges_per_metric[[metric_name]][2] < k.range[2]) {
k.range = kranges_per_metric[[metric_name]]
}
}
# Rebuild the metric list from the surviving wide columns (first col is the id).
all_metrics = colnames(all)[2:length(colnames(all))]
all_metrics
# Run the clustering over the agreed k range and extract the result tables.
x = annotateClustersByMetric(all, k.range=k.range, bs=bs, seed=seed)
stability_data = as.data.frame(assay(x[['stability_data']]))
quality_data = x[['quality_data']]
kOptTable <- getOptimalKValue(stability_data, quality_data)
metric_ranges = getMetricRangeByCluster(all, k.range=k.range, bs=bs, seed=seed)
# Annotated density plot for the first metric at its optimal k.
i=1
all_metrics
metric_name = all_metrics[i]
optimal_k = kOptTable %>% filter(Metric == all_metrics[i]) %>% pull(Global_optimal_k)
plot = printDensityPlotWithPointsAndInfo(x, metric_name, optimal_k, stability_data, quality_data, metric_ranges)
plot
all_metrics
# NOTE(review): `all` is already wide here, so this spread() has no Metric/Value
# columns and presumably errored; `all` is reset to the long data just below.
data = spread(all, Metric, Value)
all
all_long
all = all_long
data = spread(all, Metric, Value)
# NOTE(review): `metric_names` is not defined at this point (likely an error).
metric_names
# Manual walk-through of one plot_metric_violins() iteration.
metric_name = all_metrics[1]
x = filter(all, Metric == metric_name)
metric_values = x$Value %>% na.exclude()
shapiro_p_value = format(shapiro.test(metric_values)$p.value, digits=4)
ymax=max(metric_values) + (0.1 * max(metric_values))
plot = ggplot(data = x, aes(x=Metric, y=Value)) + geom_violin() + geom_boxplot(width=0.2) + ylim(0, ymax) + labs(x = "") + annotate(geom = 'text', label = paste("Shapiro test p-value =", shapiro_p_value), size=3, x = -Inf, y = Inf, hjust = 0, vjust = 1)
plot
library(optparse)
# ---- Console transcript: filter a sequencing summary by a list of read ids ----
input_reads = '/home/fabad/Descargas/alignments_minimap2/reads.txt'
reads = read.table(input_reads)
View(reads)
colnames(reads) = c('read_id')
View(reads)
seq_summary_path = '/home/fabad/Descargas/alignments_minimap2/sequencing_summary.txt'
# NOTE(review): read.table on the large summary presumably failed or was too
# slow — data.table::fread() is used instead just below.
seq_summary = read.table(seq_summary_path)
library(data.table)
seq_summary = fread(seq_summary_path)
View(seq_summary)
# NOTE(review): input_reads_path is used here before it is defined on the
# next line — this call must have errored; it is retried after the definition.
reads = fread(input_reads_path)
input_reads_path = '/home/fabad/Descargas/alignments_minimap2/reads.txt'
seq_summary_path = '/home/fabad/Descargas/alignments_minimap2/sequencing_summary.txt'
reads = fread(input_reads_path)
View(reads)
# header = F: the reads file has no header row, so the first id is kept as data.
reads = fread(input_reads_path, header = F)
View(reads)
colnames(reads) = c('read_id')
library(tidyr)
library(dplyr)
# NOTE(review): %IN% is a typo (no such operator) — corrected to %in% below.
seq_summary_filtered = seq_summary %>% filter(read_id %IN% reads$read_id)
seq_summary_filtered = seq_summary %>% filter(read_id %in% reads$read_id)
# Interactive experiments with NA/NULL/logical semantics (results printed only).
is.na(NA)
!TRUE
is.na(NA)
!is.na(NA)
is.na(NA) & is.na(21)
!is.na(NA) & is.na(21)
is.na(NA) & !is.na(21)
is.na(NA) | is.na(21)
is.null(NULL)
is.null(NA)
"hola" != ''
"" != ''
library(dplyr)
library(tidyr)
library(ggpubr)
library(rstudioapi)
# Resolve the directory of the script currently open in RStudio and switch to
# its results/ subfolder, so the relative input paths below resolve.
# NOTE(review): rstudioapi only works inside an RStudio session, and setwd()
# mutates global state for the rest of the session.
rootPath = dirname(rstudioapi::getActiveDocumentContext()$path)
working_dir = file.path(rootPath, 'results')
setwd(working_dir)
# Reads a tab-separated text2class result file, coerces the similarity and
# score columns to numeric, replaces infinite scores (-Inf) by 0, and returns
# the unique (Class_IRI, Score) pairs tagged with the given ontology and text
# corpus labels.
read_classes_input_file <- function(filepath, ontology, text_corpus) {
  raw <- read.csv2(filepath, header = T, sep = "\t", na.strings = "NaN", stringsAsFactors = F)
  # These columns arrive as character (read.csv2 assumes dec = ",").
  for (col in c("Lexical_similarity", "Semantic_similarity", "Score")) {
    raw[[col]] <- as.numeric(raw[[col]])
  }
  # Replace -Inf by 0
  raw <- raw %>% mutate_if(is.numeric, function(v) ifelse(is.infinite(v), 0, v))
  result <- unique(select(raw, Class_IRI, Score))
  result$Ontology <- ontology
  result$TextCorpus <- text_corpus
  return(result)
}
# Reads a tab-separated text2class result file, coerces the similarity and
# score columns to numeric, replaces infinite scores (-Inf) by 0, and returns
# the unique (Noun_phrase, Score) pairs tagged with the given ontology and
# text corpus labels.
read_nouns_input_file <- function(filepath, ontology, text_corpus) {
  raw <- read.csv2(filepath, header = T, sep = "\t", na.strings = "NaN", stringsAsFactors = F)
  # These columns arrive as character (read.csv2 assumes dec = ",").
  for (col in c("Lexical_similarity", "Semantic_similarity", "Score")) {
    raw[[col]] <- as.numeric(raw[[col]])
  }
  # Replace -Inf by 0
  raw <- raw %>% mutate_if(is.numeric, function(v) ifelse(is.infinite(v), 0, v))
  result <- unique(select(raw, Noun_phrase, Score))
  result$Ontology <- ontology
  result$TextCorpus <- text_corpus
  return(result)
}
# Overlays the Score density curves of three data frames on a single base-R
# plot (red, green, blue) and adds a matching legend in the top-right corner.
# The y axis is scaled to fit the tallest of the three densities.
plotDensities <- function(a, label_a,b, label_b, c, label_c, title) {
  densities <- list(density(a$Score), density(b$Score), density(c$Score))
  peak <- max(densities[[1]]$y, densities[[2]]$y, densities[[3]]$y)
  plot(densities[[1]], main = title, col = "red", ylim = c(0, ceiling(peak)))
  lines(densities[[2]], col = "green")
  lines(densities[[3]], col = "blue")
  legend(x = "topright", legend = c(label_a, label_b, label_c),
         fill = c("red", "green", "blue"))
}
# ---- Load the 16 ontology x text-corpus noun-phrase score files ----
# Paths are relative to the results/ working directory set earlier in the session.
# Gene Ontology - text about genes
go_genes_nouns_uniq = read_nouns_input_file("go_genetext/text2class.tsv", 'GeneOntology', 'Genes text')
# Gene Ontology - text about law
go_legal_nouns_uniq = read_nouns_input_file("go_legaltext/text2class.tsv", 'GeneOntology', 'Legal text')
# Gene Ontology - text about food
go_food_nouns_uniq = read_nouns_input_file("go_foodtext/text2class.tsv", 'GeneOntology', 'Food text')
# Gene Ontology - text about medicine
go_medicine_nouns_uniq = read_nouns_input_file("go_medicaltext/text2class.tsv", 'GeneOntology', 'Medical text')
# Food Ontology - text about genes
foodon_genes_nouns_uniq = read_nouns_input_file("foodon_genetext/text2class.tsv", 'FoodOn', 'Genes text')
# Food Ontology - text about law
foodon_legal_nouns_uniq = read_nouns_input_file("foodon_legaltext/text2class.tsv", 'FoodOn', 'Legal text')
# Food Ontology - text about food
foodon_food_nouns_uniq = read_nouns_input_file("foodon_foodtext/text2class.tsv", 'FoodOn', 'Food text')
# Food Ontology - text about medicine
foodon_medicine_nouns_uniq = read_nouns_input_file("foodon_medicaltext/text2class.tsv", 'FoodOn', 'Medical text')
# LKIF Ontology - text about genes
lkif_genes_nouns_uniq = read_nouns_input_file("lkif_genetext/text2class.tsv", 'LKIF', 'Genes text')
# LKIF Ontology - text about law
lkif_legal_nouns_uniq = read_nouns_input_file("lkif_legaltext/text2class.tsv", 'LKIF', 'Legal text')
# LKIF Ontology - text about food
lkif_food_nouns_uniq = read_nouns_input_file("lkif_foodtext/text2class.tsv", 'LKIF', 'Food text')
# LKIF Ontology - text about medicine
lkif_medicine_nouns_uniq = read_nouns_input_file("lkif_medicaltext/text2class.tsv", 'LKIF', 'Medical text')
# SNOMED Ontology - text about genes
snomed_genes_nouns_uniq = read_nouns_input_file("snomed_genetext/text2class.tsv", 'SNOMED', 'Genes text')
# SNOMED Ontology - text about law
snomed_legal_nouns_uniq = read_nouns_input_file("snomed_legaltext/text2class.tsv", 'SNOMED', 'Legal text')
# SNOMED Ontology - text about food
snomed_food_nouns_uniq = read_nouns_input_file("snomed_foodtext/text2class.tsv", 'SNOMED', 'Food text')
# SNOMED Ontology - text about medicine
snomed_medicine_nouns_uniq = read_nouns_input_file("snomed_medicaltext/text2class.tsv", 'SNOMED', 'Medical text')
# Complete dataset: stack all 16 combinations into one long table.
experiment_data_nouns = rbind(go_food_nouns_uniq,
go_legal_nouns_uniq,
go_genes_nouns_uniq,
go_medicine_nouns_uniq,
foodon_food_nouns_uniq,
foodon_legal_nouns_uniq,
foodon_genes_nouns_uniq,
foodon_medicine_nouns_uniq,
lkif_food_nouns_uniq,
lkif_legal_nouns_uniq,
lkif_genes_nouns_uniq,
lkif_medicine_nouns_uniq,
snomed_genes_nouns_uniq,
snomed_legal_nouns_uniq,
snomed_food_nouns_uniq,
snomed_medicine_nouns_uniq) %>%
select(Ontology, TextCorpus, Noun_phrase, Score)
# Boxplots Noun phrases
# All pairwise corpus comparisons, tested with Wilcoxon via stat_compare_means().
comparisons <- list( c("Food text", "Legal text"), c("Food text", "Genes text"), c("Food text", "Medical text"), c("Legal text", "Genes text"), c("Legal text", "Medical text"), c("Genes text", "Medical text"))
# One panel per ontology, comparing the four text corpora.
# NOTE(review): ylim(0.3, 1.5) drops data outside that range from the plot
# (and from the computed statistics) — confirm the cutoff is intended.
ggboxplot(experiment_data_nouns, x="TextCorpus", y="Score", color="TextCorpus", facet.by = "Ontology") +
stat_compare_means(comparisons = comparisons, method = "wilcox.test") +
ggtitle('Text corpora comparison for each ontology') +
theme(plot.title = element_text(hjust = 0.5), axis.text.x = element_text(angle = 45, hjust=1)) +
ylim(0.3, 1.5)
# All pairwise ontology comparisons, one panel per text corpus.
comparisons <- list( c("GeneOntology", "FoodOn"), c("GeneOntology", "LKIF"), c("GeneOntology", "SNOMED"), c("FoodOn", "LKIF"),c("FoodOn", "SNOMED"),c("LKIF", "SNOMED"))
ggboxplot(experiment_data_nouns, x="Ontology", y="Score", color="Ontology", facet.by = "TextCorpus") +
stat_compare_means(comparisons = comparisons, method = "wilcox.test") +
ggtitle('Ontology comparison for each text corpus') +
theme(plot.title = element_text(hjust = 0.5), axis.text.x = element_text(angle = 45, hjust=1)) +
ylim(0.3, 1.5)
