library(tidyr)
library(ggplot2)
library(zoo)
library(RColorBrewer)
library(dplyr)
library(lubridate)
#read in the pos data
pos <- read.csv("Table_S4_Concentrations_Identified_Pharmaceuticals_PositiveMode_05052021_v2.csv", header=T, stringsAsFactors = F) #pos_Concentration_Pharmaceuticals_LuxEau_20201231.csv
pos$mode <- "pos"
head(pos)
dim(pos)
#read in the neg data
neg <- read.csv("Table_S3_Concentrations_Identified_Pharmaceuticals_NegativeMode_05052021_v2.csv", header=T, stringsAsFactors = F) #neg_Concentration_Pharmaceuticals_LuxEau_20201231.csv
neg$mode <- "neg"
neg
dim(neg)
#deduplicate if cpd measured in both pos and neg; want to take pos measurements only
dups <- Reduce(intersect, list(pos$Chemical,neg$Chemical))
length(dups)
dups
class(dups)
#remove these rows from neg before further processing
dedup_neg <- neg[!neg$Chemical %in% dups,]
dim(dedup_neg)
dedup_neg
#add asterix to neg compound names for plotting legibility
dedup_neg$Chemical <- paste0(dedup_neg$Chemical,"*")
#merge pos and dedup_neg
all_pos_dedup_neg <- rbind(pos,dedup_neg)
head(all_pos_dedup_neg)
#coerce as type numeric so"below QR" values become NA
all_pos_dedup_neg[,c(4:95)] <- suppressWarnings(sapply(all_pos_dedup_neg[,c(4:95)], as.numeric))
head(all_pos_dedup_neg)
#pivot: wide to long
long_all_pos_dedup_neg <- all_pos_dedup_neg[,c(3,4:95)] %>% pivot_longer(!Chemical, names_to=c("year","month","code"),
names_pattern ="X(.*)_(.*)_(.*)",
values_to="Conc")
head(long_all_pos_dedup_neg)
dim(long_all_pos_dedup_neg)
#translate Loc labels
code <- c(paste0("Loc0",1:9),paste0("Loc",10:13))
tr_locs <- data.frame(code,location=c("Chiers-Rodange","Alzette-Ettelbruck","Sûre-Erpeldange",
"Syr-Mertert","Mess-Noertzange","Mamer-Mersch",
"Eisch-Mersch","Attert-Colmar-Berg","Alzette-Mersch",
"Our-Wallendorf","Ernz-Blanche","Ernz-Noire",
"Gander-Emerange"),stringsAsFactors=FALSE)
head(tr_locs)
head(long_all_pos_dedup_neg)
long_all_pos_dedup_neg <- left_join(long_all_pos_dedup_neg,tr_locs,by="code")
#remove NA rows (below QR)
cc_long_all_pos_dedup_neg <- long_all_pos_dedup_neg[complete.cases(long_all_pos_dedup_neg),]
dim(cc_long_all_pos_dedup_neg)
7912-6211 = 1701 NAs, as expected.
tail(cc_long_all_pos_dedup_neg)
#prepare dates for plotting
cc_long_all_pos_dedup_neg$Time <- as.factor(paste("01",substr(cc_long_all_pos_dedup_neg$month,1,3),cc_long_all_pos_dedup_neg$year))
cc_long_all_pos_dedup_neg$Time <- as.yearmon(as.Date(cc_long_all_pos_dedup_neg$Time, format="%d %b %Y"))
cc_long_all_pos_dedup_neg$Time <- as.factor(cc_long_all_pos_dedup_neg$Time)
perm_locs <- c("Chiers-Rodange","Alzette-Ettelbruck","Sûre-Erpeldange","Syr-Mertert")
#calculate medians first
df_cc_perm_locs_all_pos_dedup_neg_permonth <- cc_long_all_pos_dedup_neg[cc_long_all_pos_dedup_neg$location %in% perm_locs,] %>%
group_by(Chemical,year,month,Time) %>%
summarise(Median_Conc_ngL=median(Conc))
tail(df_cc_perm_locs_all_pos_dedup_neg_permonth)
#range of medians for metformin in 2019 and 2020
metformin_medians <- df_cc_perm_locs_all_pos_dedup_neg_permonth[df_cc_perm_locs_all_pos_dedup_neg_permonth$Chemical=="Metformin",]
metformin_medians
write.csv(metformin_medians,"20210505_metformin_median_conc_perm_locs_2019_2020.csv")
#populate $notzero before taking log
#df_cc_perm_locs_all_pos_dedup_neg_permonth$notzero <- TRUE
#df_cc_perm_locs_all_pos_dedup_neg_permonth$notzero[df_cc_perm_locs_all_pos_dedup_neg_permonth$Median_Conc_ngL == 0] <- FALSE
#df_cc_perm_locs_all_pos_dedup_neg_permonth[df_cc_perm_locs_all_pos_dedup_neg_permonth$notzero==FALSE,]
There are no zero-value concentrations now.
#take log of Median_Conc_ngL
df_cc_perm_locs_all_pos_dedup_neg_permonth$log_Median_Conc_ngL <- log10(df_cc_perm_locs_all_pos_dedup_neg_permonth$Median_Conc_ngL)
#convert -Inf to 0 to find true minimum
#df_cc_perm_locs_all_pos_dedup_neg_permonth$log_Median_Conc_ngL[is.infinite(df_cc_perm_locs_all_pos_dedup_neg_permonth$log_Median_Conc_ngL)] <- 0
#summary(df_cc_perm_locs_all_pos_dedup_neg_permonth$log_Median_Conc_ngL)
# Min. 1st Qu. Median Mean 3rd Qu. Max.
#-2.3010 0.4061 2.6274 2.3261 4.2804 6.1853
df_cc_perm_locs_all_pos_dedup_neg_permonth[is.infinite(df_cc_perm_locs_all_pos_dedup_neg_permonth$log_Median_Conc_ngL),]
#convert -Inf to -99 for plotting (ONLY IF THERE WERE ZERO-VALUES)
#df_cc_perm_locs_all_pos_dedup_neg_permonth$log_Median_Conc_ngL[is.infinite(df_cc_perm_locs_all_pos_dedup_neg_permonth$log_Median_Conc_ngL)] <- -99
df_cc_perm_locs_all_pos_dedup_neg_permonth[df_cc_perm_locs_all_pos_dedup_neg_permonth$Chemical=="Theophylline*",]
df_cc_perm_locs_all_pos_dedup_neg_permonth[df_cc_perm_locs_all_pos_dedup_neg_permonth$Chemical=="Pantoprozole",]
summary(df_cc_perm_locs_all_pos_dedup_neg_permonth$log_Median_Conc_ngL)
###outline the boxes where $notzero is FALSE
#options(repr.plot.width=20, repr.plot.height=12)
#ggplot(df_cc_perm_locs_all_pos_dedup_neg_permonth,
# aes(x=Time, y=reorder(Chemical,log_Median_Conc_ngL),
# fill=log_Median_Conc_ngL)) + #geom_tile() +
#geom_tile(data=df_cc_perm_locs_all_pos_dedup_neg_permonth,
# aes(size=factor(notzero,c(FALSE,TRUE)),
# colour=factor(notzero,c(FALSE,TRUE)))) +
#guides(color = FALSE, size = FALSE) +
#scale_colour_manual("notzero", values=c("black", "white")) +
#scale_size_manual("notzero", values=c(0.2, 0)) +
#xlab("Month-Year, N = 4 permanent sampling points") + ylab("Compound") + theme_classic() +
#theme(axis.text.x = element_text(size=18), axis.text.y = element_text(size=15)) + #tick text
#theme(axis.title.x = element_text(face="bold", size=20), axis.title.y = element_text(face="bold",size=20)) + #axis title
#scale_fill_distiller(palette="RdYlBu",limits=c(-3,7),breaks=c(-3,-1,1,3,5,7),name="log10(median Conc)") + #, limits= c(-4,8),breaks = c(-4,-2,0,2,4,6,8),labels = c(-4,-2,0,2,4,6,8)) +
#theme(legend.title = element_text(size=20, face="bold"), legend.text= element_text(size=15))
###use alpha as 2nd layer for FALSE/TRUE in $notzero
#ggplot(df_cc_perm_locs_all_pos_dedup_neg_permonth,
# aes(x=Time, y=reorder(Chemical,log_Median_Conc_ngL)))+#,
#fill=log_Median_Conc_ngL)) + #geom_tile() +
#geom_tile(data=df_cc_perm_locs_all_pos_dedup_neg_permonth,
# aes(size=factor(iszero,c(TRUE,FALSE)),
# colour=factor(iszero,c(TRUE,FALSE)))) +
#geom_tile(aes(fill=log_Median_Conc_ngL,alpha=notzero), colour='grey20')+
#guides(color = FALSE, size = FALSE) +
#scale_colour_manual("iszero", values=c("black", "white")) +
#scale_fill_manual("iszero", values=c("black","white")) +
#scale_size_manual("iszero", values=c(0.2, 0)) +
#xlab("Month-Year, N = 4 permanent sampling points") + ylab("Compound") + theme_classic() +
#theme(axis.text.x = element_text(size=18), axis.text.y = element_text(size=15)) + #tick text
#theme(axis.title.x = element_text(face="bold", size=20), axis.title.y = element_text(face="bold",size=20)) + #axis title
#scale_fill_distiller(palette="RdYlBu",limits=c(-3,7),breaks=c(-3,-1,1,3,5,7),name="log10(median Conc)") + #, limits= c(-4,8),breaks = c(-4,-2,0,2,4,6,8),labels = c(-4,-2,0,2,4,6,8)) +
#theme(legend.title = element_text(size=20, face="bold"), legend.text= element_text(size=15))
options(repr.plot.width=20, repr.plot.height=12)
ggplot(df_cc_perm_locs_all_pos_dedup_neg_permonth, aes(x=Time, y=reorder(Chemical,log_Median_Conc_ngL), fill=log_Median_Conc_ngL)) + geom_tile() +
#geom_tile(data=df_cc_perm_locs_all_pos_dedup_neg_permonth,aes(size=factor(iszero,c(TRUE,FALSE))),alpha=0,colour="blue") + #scale_size_discrete("Your legend", range=c(3, 0.5)) +
#+ geom_tile(data=df2, aes(size=factor(z, c(TRUE, FALSE))), alpha=0, color="blue") + scale_size_discrete("Your legend", range=c(3, 0.5))
xlab("Month-Year, N = 4 permanent sampling points") + ylab("Compound") + theme_classic() +
theme(axis.text.x = element_text(size=18), axis.text.y = element_text(size=15)) + #tick text
theme(axis.title.x = element_text(face="bold", size=20), axis.title.y = element_text(face="bold",size=20)) + #axis title
scale_fill_distiller(palette="RdYlBu",limits=c(-3,4),name="log10(MedianConc)") + #, limits= c(-4,8),breaks = c(-4,-2,0,2,4,6,8),labels = c(-4,-2,0,2,4,6,8)) +
theme(legend.title = element_text(size=20, face="bold"), legend.text= element_text(size=15))
Figure 4. Temporal heat map showing median concentration values (original units: ng/L) per compound measured per sampling month-year plotted using a base-10 logarithmic scale. Median values were calculated across the concentrations measured at the four permanent sampling locations for the respective compound and month-year. White boxes indicate concentration values that were below the respective quantification range, which were therefore discarded from median calculation. All compounds were measured in positive mode except for those marked with an asterisks, which were measured in negative mode.
#to save as pdf or png
heatmap_permonth <- ggplot(df_cc_perm_locs_all_pos_dedup_neg_permonth, aes(x=Time, y=reorder(Chemical,log_Median_Conc_ngL), fill=log_Median_Conc_ngL)) + geom_tile() +
xlab("Month-Year, N = 4 permanent sampling points") + ylab("Chemical") + theme_classic() +
theme(axis.text.x = element_text(size=10), axis.text.y = element_text(size=7)) + #tick text
theme(axis.title.x = element_text(face="bold", size=15), axis.title.y = element_text(face="bold",size=15)) + #axis title
scale_fill_distiller(palette="RdYlBu",limits=c(-3,4),name="log10(MedianConc)") +
theme(legend.title = element_text(size=12, face="bold"), legend.text= element_text(size=10))
#ggsave("temporal_heatmap_per_month_permanent_sampling_points_20210422.pdf", heatmap_permonth, width = 297, height = 210, units = "mm")
ggsave("temporal_heatmap_per_month_permanent_sampling_points_20210505.png", heatmap_permonth, width = 297, height = 210, units = "mm")
head(cc_long_all_pos_dedup_neg)
#calculate medians for 2019 only across all months
df_cc_2019_long_all_pos_dedup_neg = cc_long_all_pos_dedup_neg[cc_long_all_pos_dedup_neg$year=="2019",] %>%
group_by(Chemical,location) %>%
summarise(Median_Conc_ngL=median(Conc))
tail(df_cc_2019_long_all_pos_dedup_neg)
dim(df_cc_2019_long_all_pos_dedup_neg)
#testing
foo <- cc_long_all_pos_dedup_neg[cc_long_all_pos_dedup_neg$Chemical=="Warfarin"&
cc_long_all_pos_dedup_neg$location=="Syr-Mertert"&
cc_long_all_pos_dedup_neg$year=="2019",
"Conc"]
foo <- as.numeric(unlist(foo))
median(foo)
#populate $notzero before taking log
#df_cc_2019_long_all_pos_dedup_neg$notzero <- TRUE
#df_cc_2019_long_all_pos_dedup_neg$notzero[df_cc_2019_long_all_pos_dedup_neg$Median_Conc_ngL == 0] <- FALSE
#take log of Median_Conc_ngL
#df_cc_2019_long_all_pos_dedup_neg$log_Median_Conc_ngL <- log10(df_cc_2019_long_all_pos_dedup_neg$Median_Conc_ngL)
summary(df_cc_2019_long_all_pos_dedup_neg$log_Median_Conc_ngL)
#convert -Inf to 0 to find true minimum
#df_cc_2019_long_all_pos_dedup_neg$log_Median_Conc_ngL[is.infinite(df_cc_2019_long_all_pos_dedup_neg$log_Median_Conc_ngL)] <- 0
#summary(df_cc_2019_long_all_pos_dedup_neg$log_Median_Conc_ngL)
#Min. 1st Qu. Median Mean 3rd Qu. Max.
#-2.301 0.391 2.768 2.371 4.320 6.072
#convert -Inf to -99 for plotting (ONLY IF THERE WERE ZERO-VALUES)
#df_cc_2019_long_all_pos_dedup_neg$log_Median_Conc_ngL[is.infinite(df_cc_2019_long_all_pos_dedup_neg$log_Median_Conc_ngL)] <- -99
df_cc_2019_long_all_pos_dedup_neg[df_cc_2019_long_all_pos_dedup_neg$Chemical=="Bicalutamide*",]
ggplot(df_cc_2019_long_all_pos_dedup_neg, aes(x=reorder(location,log_Median_Conc_ngL), y=reorder(Chemical,log_Median_Conc_ngL), fill=log_Median_Conc_ngL)) +
geom_tile() +
xlab("2019 Sampling Points, N = 6 months") + ylab("Chemical") + theme_classic() +
theme(axis.text.x = element_text(size=18,angle=10,hjust=1), axis.text.y = element_text(size=15)) + #tick text
theme(axis.title.x = element_text(face="bold", size=30), axis.title.y = element_text(face="bold",size=30)) + #axis title
scale_fill_distiller(palette="RdYlBu",limits=c(-3,4.1),name="log10(MedianConc)") +
theme(legend.title = element_text(size=20, face="bold"), legend.text= element_text(size=15))#legends
Figure 2. Spatial heat map showing median concentration values (original units: ng/L) per compound measured per sampling location over 6 months in 2019, plotted using a base-10 logarithmic scale. Median values were calculated across the concentrations measured over the relevant months of sampling for the respective compound and location. White boxes indicate that there were no concentration values within the quantification range. All compounds were measured in positive mode except for those marked with an asterisks, which were measured in negative mode.
#to save as pdf
heatmap_2019_perloc <- ggplot(df_cc_2019_long_all_pos_dedup_neg, aes(x=reorder(location,log_Median_Conc_ngL), y=reorder(Chemical,log_Median_Conc_ngL), fill=log_Median_Conc_ngL)) +
geom_tile() +
xlab("2019 Sampling Points, N = 6 months") + ylab("Chemical") + theme_classic() +
theme(axis.text.x = element_text(size=10,angle=10,hjust=1), axis.text.y = element_text(size=7)) + #tick text
theme(axis.title.x = element_text(face="bold", size=15), axis.title.y = element_text(face="bold",size=15)) + #axis title
scale_fill_distiller(palette="RdYlBu",limits=c(-3,4.1),name="log10(MedianConc)") +
theme(legend.title = element_text(size=12, face="bold"), legend.text= element_text(size=10))#legends
#ggsave("spatial_heatmap_2019_per_loc_20210422.pdf", heatmap_2019_perloc, width = 297, height = 210, units = "mm")
ggsave("spatial_heatmap_2019_per_loc_20210505.png", heatmap_2019_perloc, width = 297, height = 210, units = "mm")
#recalculate medians for 2020 only across all months
df_cc_2020_long_all_pos_dedup_neg <- cc_long_all_pos_dedup_neg[cc_long_all_pos_dedup_neg$year==2020,] %>%
group_by(Chemical,location) %>%
summarise(Median_Conc_ngL=median(Conc))
head(df_cc_2020_long_all_pos_dedup_neg)
dim(df_cc_2020_long_all_pos_dedup_neg)
#testing
bar <- cc_long_all_pos_dedup_neg[cc_long_all_pos_dedup_neg$Chemical=="3-hydroxycarbamazepine"&
cc_long_all_pos_dedup_neg$location=="Alzette-Ettelbruck"&
cc_long_all_pos_dedup_neg$year=="2020","Conc"]
bar <- as.numeric(unlist(bar))
median(bar)
#populate $notzero before taking log
#df_cc_2020_long_all_pos_dedup_neg$notzero <- TRUE
#df_cc_2020_long_all_pos_dedup_neg$notzero[df_cc_2020_long_all_pos_dedup_neg$Median_Conc_ngL == 0] <- FALSE
#take log of Median_Conc_ngL
#df_cc_2020_long_all_pos_dedup_neg$log_Median_Conc_ngL <- log10(df_cc_2020_long_all_pos_dedup_neg$Median_Conc_ngL)
#convert -Inf to 0 to find true minimum
#df_cc_2020_long_all_pos_dedup_neg$log_Median_Conc_ngL[is.infinite(df_cc_2020_long_all_pos_dedup_neg$log_Median_Conc_ngL)] <- 0
#summary(df_cc_2020_long_all_pos_dedup_neg$log_Median_Conc_ngL)
# Min. 1st Qu. Median Mean 3rd Qu. Max.
#-2.301030 0.006038 1.973128 1.911851 3.556544 5.763277
#convert -Inf to -99 for plotting (ONLY IF THERE WERE ZERO-VALUES)
#df_cc_2020_long_all_pos_dedup_neg$log_Median_Conc_ngL[is.infinite(df_cc_2020_long_all_pos_dedup_neg$log_Median_Conc_ngL)] <- -99
summary(df_cc_2020_long_all_pos_dedup_neg$log_Median_Conc_ngL)
ggplot(df_cc_2020_long_all_pos_dedup_neg, aes(x=reorder(location,log_Median_Conc_ngL), y=reorder(Chemical,log_Median_Conc_ngL), fill=log_Median_Conc_ngL)) +
geom_tile() +
xlab("2020 Sampling Points, N = 5 months") + ylab("Chemical") + theme_classic() +
theme(axis.text.x = element_text(size=18,angle=10,hjust=1), axis.text.y = element_text(size=15)) + #tick text
theme(axis.title.x = element_text(face="bold", size=30), axis.title.y = element_text(face="bold",size=30)) + #axis title
scale_fill_distiller(palette="RdYlBu",limits=c(-3,4),name="log10(MedianConc)") +
theme(legend.title = element_text(size=20, face="bold"), legend.text= element_text(size=15))#legends
Figure 3. Spatial heat map showing median concentration values (original units: ng/L) per compound measured per sampling location over 5 months in 2020, plotted using a base-10 logarithmic scale. Median values were calculated across the concentrations measured over the relevant months of sampling for the respective compound and location. White boxes indicate that there were no concentration values within the quantification range. All compounds were measured in positive mode except for those marked with an asterisks, which were measured in negative mode.
#to save as pdf
heatmap_2020_perloc <- ggplot(df_cc_2020_long_all_pos_dedup_neg, aes(x=reorder(location,log_Median_Conc_ngL), y=reorder(Chemical,log_Median_Conc_ngL), fill=log_Median_Conc_ngL)) +
geom_tile() +
xlab("2020 Sampling Points, N = 5 months") + ylab("Chemical") + theme_classic() +
theme(axis.text.x = element_text(size=10,angle=10,hjust=1), axis.text.y = element_text(size=7)) + #tick text
theme(axis.title.x = element_text(face="bold", size=15), axis.title.y = element_text(face="bold",size=15)) + #axis title
scale_fill_distiller(palette="RdYlBu",limits=c(-3,4),name="log10(MedianConc)") +
theme(legend.title = element_text(size=12, face="bold"), legend.text= element_text(size=10))#legends
#ggsave("spatial_heatmap_2020_per_loc_20210422.pdf", heatmap_2020_perloc, width = 297, height = 210, units = "mm")
ggsave("spatial_heatmap_2020_per_loc_20210505.png", heatmap_2020_perloc, width = 297, height = 210, units = "mm")
Idea would be to show distribution of conc values, not just median.
Zero-value concentrations could indicate point sources, non-zero-value concentrations could up-prioritise that compound for Suspect Screening.
head(cc_long_all_pos_dedup_neg)
#what are the lowest and highest median Conc overall?
overall_median_conc <- cc_long_all_pos_dedup_neg %>% group_by(Chemical) %>% summarise(Median_Conc_ngL=median(Conc))
head(overall_median_conc)
summary(overall_median_conc$Median_Conc_ngL)
##output median concs
#write.csv(overall_median_conc, "20210505_overall_median_conc_all_locs_all_time.csv")
cc_long_all_pos_dedup_neg$Year <- as.factor(cc_long_all_pos_dedup_neg$year)
#calculate logConc
cc_long_all_pos_dedup_neg$logConc <- log10(cc_long_all_pos_dedup_neg$Conc)
head(cc_long_all_pos_dedup_neg)
bp <- ggplot(cc_long_all_pos_dedup_neg, aes(x=reorder(Chemical,logConc, FUN=median),y=logConc)) +
xlab("Chemical") + ylab("log10(Conc)")
#with no jittering
bp + geom_boxplot() + geom_point(alpha=0.3) + #shape="." for very small
theme(axis.text.x = element_text(size=14, angle=65, hjust=1),axis.text.y = element_text(size=18)) +
theme(axis.title.x = element_text(face="bold", size=30), axis.title.y = element_text(face="bold",size=30))
#to save as pdf
bps_sorted_median_nojit <- bp + geom_boxplot() + geom_point(alpha=0.3) + #shape="." for very small
theme(axis.text.x = element_text(size=10, angle=65, hjust=1),axis.text.y = element_text(size=13)) +
theme(axis.title.x = element_text(face="bold", size=16), axis.title.y = element_text(face="bold",size=16))
#ggsave("boxplots_20210116_nojittering.pdf", bps_sorted_median_nojit, width = 297, height = 210, units = "mm")
#with jittering
#bp + geom_boxplot() + geom_jitter(alpha=0.3, position=position_jitter(0.2)) +
#xlab('log Concentration [ng/L]') +
#theme(axis.text.x = element_text(size=14, angle=65, hjust=1),axis.text.y = element_text(size=18)) +
#theme(axis.title.x = element_text(face="bold", size=30), axis.title.y = element_text(face="bold",size=30))
#to save as pdf
#bps_sorted_median <- bp + geom_boxplot() + geom_jitter(alpha=0.3, position=position_jitter(0.2)) +
#theme(axis.text.x = element_text(size=10, angle=65, hjust=1),axis.text.y = element_text(size=13)) +
#theme(axis.title.x = element_text(face="bold", size=16), axis.title.y = element_text(face="bold",size=16))
#ggsave("boxplots_20210113_jittering.pdf", bps_sorted_median, width = 297, height = 210, units = "mm")
#colour code by year
update_geom_defaults("point", list(colour = NULL))
c <- ggplot(cc_long_all_pos_dedup_neg,aes(x=reorder(Chemical,logConc,FUN=median),y=logConc)) +
geom_boxplot(outlier.shape=NA) + xlab('Chemical') + ylab('log10(Conc)')
c + geom_point(aes(color=Year),alpha=0.2) +
theme(axis.text.x = element_text(size=14, angle=65, hjust=1),axis.text.y = element_text(size=18)) +
theme(axis.title.x = element_text(face="bold", size=30), axis.title.y = element_text(face="bold",size=30)) +
theme(legend.position = c(0.95, 0.1)) +
scale_color_manual(values=c("#332288","#009E73")) + #'#6633FF','#66FF33')) + #("#E69F00","#0072B2")) +
theme(legend.title = element_text(size=20), legend.text=element_text(size=15))
#to save as pdf
c_sorted_median_years <- c + geom_point(aes(color=Year),alpha=0.3,size=0.8) +
theme(axis.text.x = element_text(size=8, angle=65, hjust=1),axis.text.y = element_text(size=10)) +
theme(axis.title.x = element_text(face="bold", size=15), axis.title.y = element_text(face="bold",size=15)) +
theme(legend.position = c(0.95, 0.1)) +
scale_color_manual(values=c("#332288","#009E73")) +
theme(legend.title = element_text(size=12), legend.text=element_text(size=10)) + guides(colour = guide_legend(override.aes = list(size=2)))
#ggsave("boxplots_20210116_nojittering_years.pdf", c_sorted_median_years , width = 297, height = 210, units = "mm")
#ggsave("boxplots_20210116_nojittering_years.png", c_sorted_median_years , width = 297, height = 210, units = "mm")
head(cc_long_all_pos_dedup_neg)
#with separate boxes, one for 2019, one for 2020
ggplot(cc_long_all_pos_dedup_neg,aes(x=reorder(Chemical,logConc,FUN=median),y=logConc, color=Year)) +
geom_boxplot(outlier.shape=NA) + xlab('Chemical') + ylab('log10(Conc)') + geom_point(alpha=0.2) +
theme(axis.text.x = element_text(size=14, angle=65, hjust=1),axis.text.y = element_text(size=18)) +
theme(axis.title.x = element_text(face="bold", size=30), axis.title.y = element_text(face="bold",size=30)) +
theme(legend.position = c(0.95, 0.1)) +
scale_color_manual(values=c("#332288","#009E73")) + #'#6633FF','#66FF33')) + #("#E69F00","#0072B2")) +
theme(legend.title = element_text(size=20), legend.text=element_text(size=15))
Initiate plot to subset top-x:
twoboxes_peryear <- ggplot(cc_long_all_pos_dedup_neg,aes(x=reorder(Chemical,logConc,FUN=median),y=logConc, color=Year)) +
geom_boxplot(outlier.shape=NA) + xlab('Chemical') + ylab('log10(Conc)') + geom_point(alpha=0.2) +
theme(axis.text.x = element_text(size=8, angle=65, hjust=1),axis.text.y = element_text(size=10)) +
theme(axis.title.x = element_text(face="bold", size=15), axis.title.y = element_text(face="bold",size=15)) +
theme(legend.position = c(0.95, 0.1)) +
scale_color_manual(values=c("#332288","#009E73")) +
theme(legend.title = element_text(size=12), legend.text=element_text(size=10)) + guides(colour = guide_legend(override.aes = list(size=0.8)))
#save as pdf
#ggsave("two_boxplots_20210119_nojittering_years.pdf", twoboxes_peryear , width = 297, height = 210, units = "mm")
#ggsave("two_boxplots_20210119_nojittering_years.png", twoboxes_peryear , width = 297, height = 210, units = "mm")
#plot only top 50 compounds
#found a better solution
ggbld <- ggplot_build(twoboxes_peryear)
compounds_sorted_ascending_medianlogConc <- ggbld$layout$coord$labels(ggbld$layout$panel_params)[[1]]$x.labels
compounds_sorted_ascending_medianlogConc
length(compounds_sorted_ascending_medianlogConc)
#take the top x compounds (sorted by median)
toplot_compounds_sorted_ascending_medianlogConc <- compounds_sorted_ascending_medianlogConc[37:86]
length(compounds_sorted_ascending_medianlogConc)
tail(toplot_compounds_sorted_ascending_medianlogConc)
top50_medians_percompound_peryear <- cc_long_all_pos_dedup_neg[cc_long_all_pos_dedup_neg$Chemical %in% toplot_compounds_sorted_ascending_medianlogConc, ]
dim(top50_medians_percompound_peryear)
#top10_medians_percompound_peryear <- cc_all_pos_dedup_neg[cc_all_pos_dedup_neg$Compound %in% toplot_compounds_sorted_ascending_medianlogConc, ]
#dim(top10_medians_percompound_peryear)
dim(cc_long_all_pos_dedup_neg)
#replot boxplot
top50_twoboxes_peryear <- ggplot(top50_medians_percompound_peryear,aes(x=reorder(Chemical,logConc,FUN=median),y=logConc, color=Year)) +
#top10_twoboxes_peryear <- ggplot(top10_medians_percompound_peryear,aes(x=reorder(Compound,logConc,FUN=median),y=logConc, color=Year)) +
geom_boxplot(outlier.shape=NA) + xlab('Chemical') + ylab('log10(Conc)') + geom_point(alpha=0.2) +
theme(axis.text.x = element_text(size=14, angle=65, hjust=1),axis.text.y = element_text(size=18)) +
theme(axis.title.x = element_text(face="bold", size=20), axis.title.y = element_text(face="bold",size=15)) +
theme(legend.position = c(0.95, 0.1)) +
scale_color_manual(values=c("#332288","#009E73")) +
theme(legend.title = element_text(size=20), legend.text=element_text(size=15)) + guides(colour = guide_legend(override.aes = list(size=0.8)))
top50_twoboxes_peryear
Figure 5. Boxplots showing the range of concentrations (original units: ng/L) measured for the top-50 highest concentration pharmaceutical chemicals across all months and sampling locations in 2019 and 2020, plotted using a base-10 logarithmic scale. Concentration values that were below the respective quantification ranges were excluded. All chemicals were measured in positive mode.
#save as pdf
#ggsave("top50_twoboxes_peryear_20210505_nojittering_years.pdf", top50_twoboxes_peryear , width = 297, height = 210, units = "mm")
ggsave("top50_twoboxes_peryear_20210505_nojittering_years.png", top50_twoboxes_peryear , width = 297, height = 210, units = "mm")
#options(repr.plot.width=20, repr.plot.height=12)
#with jittering
top50_jitter_twoboxes_peryear <- ggplot(top50_medians_percompound_peryear,aes(x=reorder(Chemical,logConc,FUN=median),y=logConc, color=Year)) +
geom_boxplot() + xlab('Chemical') + ylab('log10(Conc)') + geom_jitter(alpha=0.3, position=position_jitter(0.2)) +
theme(axis.text.x = element_text(size=14, angle=65, hjust=1),axis.text.y = element_text(size=18)) +
theme(axis.title.x = element_text(face="bold", size=20), axis.title.y = element_text(face="bold",size=15)) +
theme(legend.position = c(0.95, 0.1)) +
scale_color_manual(values=c("#332288","#009E73")) +
theme(legend.title = element_text(size=20), legend.text=element_text(size=15)) + guides(colour = guide_legend(override.aes = list(size=0.8)))
top50_jitter_twoboxes_peryear
#save as pdf
#ggsave("top50_jitter_twoboxes_peryear_20210505_years.pdf", top50_jitter_twoboxes_peryear , width = 297, height = 210, units = "mm")
ggsave("top50_jitter_twoboxes_peryear_20210505_2_years.png", top50_jitter_twoboxes_peryear , width = 297, height = 210, units = "mm")
#without points top50
#readjust indexing as necessary
top50_nopoints_twoboxes_peryear <- ggplot(top50_medians_percompound_peryear,aes(x=reorder(Chemical,logConc,FUN=median),y=logConc, color=Year)) +
#top10_twoboxes_peryear <- ggplot(top10_medians_percompound_peryear,aes(x=reorder(Compound,logConc,FUN=median),y=logConc, color=Year)) +
geom_boxplot(outlier.shape=NA) + xlab('Chemical') + ylab('log10(Conc)') +
theme(axis.text.x = element_text(size=14, angle=65, hjust=1),axis.text.y = element_text(size=18)) +
theme(axis.title.x = element_text(face="bold", size=20), axis.title.y = element_text(face="bold",size=15)) +
theme(legend.position = c(0.95, 0.1)) +
scale_color_manual(values=c("#332288","#009E73")) +
theme(legend.title = element_text(size=20), legend.text=element_text(size=15)) + guides(colour = guide_legend(override.aes = list(size=0.8)))
top50_nopoints_twoboxes_peryear
#save as pdf
#ggsave("top50_nopoints_twoboxes_peryear_20210505_years.pdf", top50_nopoints_twoboxes_peryear , width = 297, height = 210, units = "mm")
ggsave("top50_nopoints_twoboxes_peryear_20210505_2_years.png", top50_nopoints_twoboxes_peryear , width = 297, height = 210, units = "mm")
#without points, top30
#readjust indexing as necessary
top30_nopoints_twoboxes_peryear <- ggplot(top50_medians_percompound_peryear,aes(x=reorder(Chemical,logConc,FUN=median),y=logConc, color=Year)) +
#top10_twoboxes_peryear <- ggplot(top10_medians_percompound_peryear,aes(x=reorder(Compound,logConc,FUN=median),y=logConc, color=Year)) +
geom_boxplot(outlier.shape=NA) + xlab('Chemical') + ylab('log10(Conc)') +
theme(axis.text.x = element_text(size=14, angle=65, hjust=1),axis.text.y = element_text(size=18)) +
theme(axis.title.x = element_text(face="bold", size=20), axis.title.y = element_text(face="bold",size=15)) +
theme(legend.position = c(0.95, 0.1)) +
scale_color_manual(values=c("#332288","#009E73")) +
theme(legend.title = element_text(size=20), legend.text=element_text(size=15)) + guides(colour = guide_legend(override.aes = list(size=0.8)))
top30_nopoints_twoboxes_peryear
#save as pdf
#ggsave("top30_nopoints_twoboxes_peryear_20210504_nojittering_years.pdf", top30_nopoints_twoboxes_peryear , width = 297, height = 210, units = "mm")
#ggsave("top30_nopoints_twoboxes_peryear_20210504_nojittering_years.png", top30_nopoints_twoboxes_peryear , width = 297, height = 210, units = "mm")