##############################################################################################
####################### Removal Requirement ##################################################
# Script to visualise removal requirement, purification treatment efficiency 
# and the combined required purification treatment effort
#
# It is assumed that preprocessing of data has been done
# according to the paper: "A water quality index for the removal requirement and the 
# purification treatment effort of micropollutants" https://doi.org/10.2166/ws.2020.289 
#
# Version 23 okt 2020
# autor: Tessa Pronk, KWR Watercycle Research Institute
#
# This script needs: Par_Exceeding_Per_Loc.xlsx 
# This script needs: Par_numbers.xlsx
# This script needs: Epicas_fin 
# These files are provided in the data package 10.5281/zenodo.4165486
#############################################################################################

library(openxlsx)
library(tidyverse)

setwd("")

# Get sheetnames
sh_nameloc<-getSheetNames("Par_Exceeding_Per_Loc.xlsx")
sh_nameloc

# For which location is visualisation done? 
# This is a name in one of the sheets of the Par_Exceeding_Per_Loc.xlsx
Loc<-"Andijk"

# read in a dataset
Dataset<-read.xlsx("Par_Exceeding_Per_Loc.xlsx", sheet=which(sh_nameloc==Loc),rowNames=TRUE)
Dataset<-Dataset[,-ncol(Dataset)] # remove casnumbers, the last column
head(Dataset)

# read a file with total measured parameters
sh_namepar<-getSheetNames("Par_numbers.xlsx")
parset<-read.xlsx("Par_numbers.xlsx", sheet=which(sh_namepar==Loc),rowNames=TRUE)
parnramp<-colSums(Dataset != 0) # exceeding parameters are those that are not zero
parnr<-parset[,'MeasuredPar'] # Total number of reported parameters


#############################################################################################
##################### Part 1: DRAW PLOTS WQI RR Index #######################################

WQI_RR<-colSums(Dataset) # the water quality index for removal requirement 

windows(12,8,rescale="fit")
par(mar = c(4.5, 4, 2, 1))
     

  adj1<-4.2  # Adjustment of size of points in visualization # exceeding chemicals 
  adj2<-54   # Adjustment of size of points in visualization # measured chemicals total
  maxX<-max(WQI_RR)

  bgcol<-rgb(0, 0, 225, max = 255, alpha = 80, names = "blue50")

  # Plot the WQI_RR with sizes for exceeding chemicals and total measured chemicals
  # Mind the ylim, adjust if needed
  plot(WQI_RR,ylim=c(0,1200),col="black", xaxt = 'n',pch=21,cex=(as.numeric(parnr))/adj2,
     bg= bgcol,xlab="Year",ylab="Removal Requirement Index",main= paste("",Loc))
  points(WQI_RR,col="black",pch=16,cex=as.numeric(parnramp)/adj1)

  axis(1,rownames(parset),at=c(1:length(rownames(parset))))

  # Calculate a trendline and plot this in the graph
  fit <- glm(as.numeric(WQI_RR)~c(1:length(WQI_RR)))
  co <- coef(fit)
  abline(fit, col="red", lwd=3)

  # Is the trend significant (p-value)?
  summary(fit)$coefficients[2,4]

  
# Visualise the legend  
windows(12,8,rescale="fit") 
  plot(1, type="n", xlab="", ylab="",xaxt = 'n', yaxt = 'n',xlim=c(0,10),ylim=c(0,10))

  legend(1,10,"Highest and lowest numbers of parameters:",bty="n")

  legend(1,9,c(paste(min(parnramp),"parameters exceeding",sep=" "),
    paste("  ",max(parnramp),"parameters exceeding",sep=" ")),pch=c(16,16),
    pt.cex=c(min(parnramp)/adj1,max(parnramp)/adj1),bty="n",x.intersp=c(3,3),y.intersp=3) 
                  #  
  legend(1,5,c(paste(min(parset),"parameters reported",sep=" "),
    paste("  ",max(parset),"parameters reported",sep=" ")),pch=c(21,21),
    pt.cex=c(min(parset)/adj2,max(parset)/adj2),pt.bg=c(bgcol,bgcol),bty="n",x.intersp=c(3,3),y.intersp=3)  

  
##########################################################################################################
############## Part 2: DRAW STREAMGRAPH PLOTS WQI RR Index ###############################################

# see https://stackoverflow.com/questions/13084998/streamgraphs-in-r/28541089
# this function below is used for the stream plot.
# OR use:
# library(devtools)
# source_url('https://gist.github.com/menugget/7864454/raw/f698da873766347d837865eecfa726cdf52a6c40/plot.stream.4.R')

# First run this function for stream charts before plotting:

plot.stream <- function(
  x, y, 
  order.method = "as.is", frac.rand=0.1, spar=0.2,
  center=TRUE,
  ylab="", xlab="",  
  border = NULL, lwd=1, 
  col=rainbow(length(y[1,])),
  ylim=NULL, 
  ...
){
  
  if(sum(y < 0) > 0) error("y cannot contain negative numbers")
  
  if(is.null(border)) border <- par("fg")
  border <- as.vector(matrix(border, nrow=ncol(y), ncol=1))
  col <- as.vector(matrix(col, nrow=ncol(y), ncol=1))
  lwd <- as.vector(matrix(lwd, nrow=ncol(y), ncol=1))
  
  if(order.method == "max") {
    ord <- order(apply(y, 2, which.max))
    y <- y[, ord]
    col <- col[ord]
    border <- border[ord]
  }
  
  if(order.method == "first") {
    ord <- order(apply(y, 2, function(x) min(which(r>0))))
    y <- y[, ord]
    col <- col[ord]
    border <- border[ord]
  }
  
  bottom.old <- x*0
  top.old <- x*0
  polys <- vector(mode="list", ncol(y))
  for(i in seq(polys)){
    if(i %% 2 == 1){ #if odd
      top.new <- top.old + y[,i]
      polys[[i]] <- list(x=c(x, rev(x)), y=c(top.old, rev(top.new)))
      top.old <- top.new
    }
    if(i %% 2 == 0){ #if even
      bottom.new <- bottom.old - y[,i]
      polys[[i]] <- list(x=c(x, rev(x)), y=c(bottom.old, rev(bottom.new)))
      bottom.old <- bottom.new
    }
  }
  
  ylim.tmp <- range(sapply(polys, function(x) range(x$y, na.rm=TRUE)), na.rm=TRUE)
  outer.lims <- sapply(polys, function(r) rev(r$y[(length(r$y)/2+1):length(r$y)]))
  mid <- apply(outer.lims, 1, function(r) mean(c(max(r, na.rm=TRUE), min(r, na.rm=TRUE)), na.rm=TRUE))
  
  #center and wiggle
  if(center) {
    g0 <- -mid + runif(length(x), min=frac.rand*ylim.tmp[1], max=frac.rand*ylim.tmp[2])
  } else {
    g0 <- runif(length(x), min=frac.rand*ylim.tmp[1], max=frac.rand*ylim.tmp[2])
  }
  
  fit <- smooth.spline(g0 ~ x, spar=spar)
  
  for(i in seq(polys)){
    polys[[i]]$y <- polys[[i]]$y + c(fit$y, rev(fit$y))
  }
  
  if(is.null(ylim)) ylim <- range(sapply(polys, function(x) range(x$y, na.rm=TRUE)), na.rm=TRUE)
  plot(x,y[,1], ylab=ylab, xlab=xlab, ylim=ylim, t="n", ...)
  for(i in seq(polys)){
    polygon(polys[[i]], border=border[i], col=col[i], lwd=lwd[i])
  }
  
}
#######################################################################################################
# Now plot the streamgraph for the dataset

# First transpose the dataset 
Dataset_t<-t(Dataset)
Dataset_t<-Dataset_t[,-which(colSums(Dataset_t)<1)]

# Draw the streamplot
windows(20,7)

set.seed(1)
m <- nrow(Dataset_t) # number of years
n <- ncol(Dataset_t) # number of substances
y <- matrix(0, nrow=m, ncol=n)
colnames(y) <- seq(n)
for(i in 1:n){
  fited <- smooth.spline(Dataset_t[,i],df=10)
  y[,i] <- fited$y
}

Dataset_t1 <- replace(y, y<0.01, 0)

#order by when 1st value occurs
ord <- order(apply(Dataset_t1, 2, function(r) min(which(r>0))))
Dataset_t2 <- Dataset_t1[, ord]
COLS <- rainbow(ncol(Dataset_t2))

plot.stream(seq(nrow(Dataset_t2)),Dataset_t2, axes=FALSE,
            main=Loc,
            ylab="Removal Requirement", xlim=c(0, m+10), xaxs="i",yaxs="i", #ylim=c(-81,81),
            center=TRUE, spar=0.2, frac.rand=0.1, col=COLS, border=1, lwd=0.1)

years<-rownames(parset)

legend("right",colnames(Dataset_t)[ord],pch=c(rep(15,n)),col=COLS,cex=0.7) 
axis(1,years,at=seq(1,length(years),1)) # draw x-axis: years
axis(2,c(0,round(max(Dataset_t2)),0)) # draw the size of the maximum exceedance for reference

# draw only the legend
windows(60,60)

  set.seed(1)
  m <- nrow(Dataset_t) # number of years
  n <- ncol(Dataset_t) # number of substances
  y <- matrix(0, nrow=m, ncol=n)
  colnames(y) <- seq(n)
  for(i in 1:n){
    fited <- smooth.spline(Dataset_t[,i],df=10)
    y[,i] <- fited$y
  }
  
  Dataset_t1 <- replace(y, y<0.01, 0)
  
  #order by when 1st value occurs
  ord <- order(apply(Dataset_t1, 2, function(r) min(which(r>0))))
  Dataset_t2 <- Dataset_t1[, ord]
  COLS <- rainbow(ncol(Dataset_t2))
  
  plot(1, type="n", xlab="", ylab="",xaxt = 'n', yaxt = 'n',xlim=c(0,10),ylim=c(0,10))
   legend(0,10,colnames(Dataset_t)[ord],pch=c(rep(15,n)),col=COLS,cex=1) 
  

###############################################################################################################
####### Part 3: Calculate Removal efficiency per chemcial from biodegradation and log KOW #####################


# Read in EPI-suite results for the compounds in the measurement programme
a1<-scan("epicas_fin",what="character",sep="\n")
a2<-lapply(a1,paste,"XXend") 
a2<-paste(a2,collapse = " ") # plak alle woorden aan elkaar tot een grote text
a3<-unlist(strsplit(a2, fixed=TRUE,"CAS Num: ", perl=TRUE)) # split on 'CAS Num:' to get separate chemicals

a4<-lapply(a3,strsplit,fixed=TRUE,"XXend",perl=TRUE)        # split each chemical into separate lines again
a4<-a4[-1]


EPI_Matrix<-data.frame(matrix(vector(), length(a4), 5,
                  dimnames=list(c(), c("CAS", "CHEM", "LOGKOW","BioDegr","invalid"))),stringsAsFactors=F)

# Extract data from the EpiSuite file, per chemical
for (chem in 1: length(a4)){ 

  EPI_Matrix[chem,1]<- unlist(a4[[chem]])[1] # casnummer
  
  # chemische naam (verwijder alles voor ':')
  EPI_Matrix[chem,2]<- gsub(".*:","",unlist(a4[[chem]])[grep("CHEM ",unlist(a4[[chem]]))] ) 

  # If there is an experimental value for LogKow, take that one. If there are several, take the first. 
  # Otherwise, take the predicted value LogKow
  if(length(grep("Log Kow \\(Exper\\. database match",a4[[chem]]))==0){
    EPI_Matrix[chem,3]<- gsub(".*=","",unlist(a4[[chem]])[grep("KOWWIN ",unlist(a4[[chem]]))] )
  } else {
    EPI_Matrix[chem,3]<- gsub(".*=","",unlist(a4[[chem]])[grep("Log Kow \\(Exper\\. database match",unlist(a4[[chem]]))[1]] )
  }
 
  # Biodegradability (verwijder alles voor ':' en dan alles behalve cijfers)
  EPI_Matrix[chem,4]<- gsub("[^0-9.]", "",gsub(".*:","",unlist(a4[[chem]])[grep("Biowin3",unlist(a4[[chem]]))] ))
  
  # If a chemical falls out of the modeling domain, list as "invalid
  if (length(grep(" WARNING: Inorganic Compound ",unlist(a4[[chem]]))>0)) {   EPI_Matrix[chem,5]<- "invalid"}

} # end for chem


# Make numeric, and remove empty spaces:
EPI_Matrix$LOGKOW<-as.numeric(as.character(EPI_Matrix$LOGKOW))
EPI_Matrix$BioDegr<-as.numeric(as.character(EPI_Matrix$BioDegr))
EPI_Matrix$CHEM<-gsub(" ", "", EPI_Matrix$CHEM)
EPI_Matrix$CAS<-gsub(" ", "", EPI_Matrix$CAS)
head(EPI_Matrix)

# Calculate removal scores
# 0 is all removal. 3 is no removal.
EPI_Matrix$LOGKOW_SCORE<- NA
EPI_Matrix$LOGKOW_SCORE[which(EPI_Matrix$LOGKOW>6)]<-0
EPI_Matrix$LOGKOW_SCORE[which((EPI_Matrix$LOGKOW>3) & (EPI_Matrix$LOGKOW<6))]<-1
EPI_Matrix$LOGKOW_SCORE[which((EPI_Matrix$LOGKOW>0) & (EPI_Matrix$LOGKOW<3))]<-2
EPI_Matrix$LOGKOW_SCORE[which(EPI_Matrix$LOGKOW<0)]<-3

EPI_Matrix$BioDegr_SCORE<- NA
EPI_Matrix$BioDegr_SCORE[which(EPI_Matrix$BioDegr>4.75)]<-1.5
EPI_Matrix$BioDegr_SCORE[which((EPI_Matrix$BioDegr>3.25) & (EPI_Matrix$BioDegr<4.75))]<-2
EPI_Matrix$BioDegr_SCORE[which((EPI_Matrix$BioDegr>2.25) & (EPI_Matrix$BioDegr<3.25))]<-2.5
EPI_Matrix$BioDegr_SCORE[which(EPI_Matrix$BioDegr<2.25)]<-3

MaxRem<-100

# Calculate average removal label (not used)
EPI_Matrix$AvREMLabel<-NA

EPI_Matrix$AvREMLabel<-apply(cbind(EPI_Matrix$LOGKOW_SCORE,EPI_Matrix$BioDegr_SCORE),1,mean,na.rm=TRUE)
EPI_Matrix$AvREMLabel[which(EPI_Matrix$invalid=="invalid")]<-0.33 # higest removal
EPI_Matrix$AvREMLabel_REMOVAL<-(1-EPI_Matrix$AvREMLabel/3)*MaxRem

# Calculate RIWA_REMOVAL (used)
EPI_Matrix$RIWA_REMOVAL<-NA

for (jj in 1:nrow(EPI_Matrix)){
  # 3 = 0% (not removable), 0=100% (complete removal)
  # RIWA_REMOVAL is % removed. 
  EPI_Matrix$RIWA_REMOVAL[jj]<-(1-(EPI_Matrix$LOGKOW_SCORE[jj]/3)*(EPI_Matrix$BioDegr_SCORE[jj]/3))*100 
  # For the invalid chemicals, assign high removal 
  if(!is.na(EPI_Matrix$invalid[jj])){ EPI_Matrix$RIWA_REMOVAL[jj]<-90}
  # For some specific invalid chemicals, assign low removal (based on expert opinion)
  if(EPI_Matrix$CHEM[jj]=="NITRITE"||EPI_Matrix$CHEM[jj]=="CHLORIDEION"||EPI_Matrix$CHEM[jj]=="SULFATE"||
     EPI_Matrix$CHEM[jj]=="FLUORIDE")
    {EPI_Matrix$RIWA_REMOVAL[jj]<-17}
}

head(EPI_Matrix)


###############################################################################################################
####### Part 4: Plot the  PTE purification treatment effort ####################################################

MaxRem<-100
Dataset<-read.xlsx("Par_Exceeding_Per_Loc.xlsx", sheet=which(sh_nameloc==Loc),rowNames=TRUE)

# In EPI_matrix, casnumbers have preceding zero's. Make the casnumbers of Dataset the same.
Dataset$cas.nummer<-str_pad(Dataset$cas.nummer, 11, pad = "0")

# Merge Dataset and EPI_Matrix
Dataset2<-merge(EPI_Matrix,Dataset,by.x='CAS',by.y='cas.nummer',all=FALSE)

# Calculate per year average removal for chemicals with a removal requirement 
WQI_PTE <- vector() # make an empty vector to place results of the loop below
   
   for (ii in grep("[0-9]{4}",colnames(Dataset2))){
        # Take the average of the RIWA_REMOVAL for all chemicals in a year that have a Removal requirment > 0
        WQI_PTE<-c(WQI_PTE,100-mean(Dataset2[which(Dataset2[,ii]>0),'RIWA_REMOVAL']))
   }
   
   pntsz<-colSums(Dataset[,-ncol(Dataset)],na.rm=TRUE)

# Plot WQI_PTE  
windows(8,6,rescale="fit")   
   # the scaling factor is 300, adjust if neccesary.     
   plot(WQI_PTE,pch=15,cex=pntsz/300,xaxt = 'n', ylim=c(0,100),col='orange',ylab="WQI PTE", main=Loc,xlab="")
   
   # calculate the trend in WQI_PTE
   fit <- glm(as.numeric(WQI_PTE)~c(1:length(grep("[0-9]{4}",colnames(Dataset)))))
     paste("Significance PTE slope",Loc,summary(fit)$coefficients[2,4],", slope" , summary(fit)$coefficients[2,1]  )
     abline(fit, col="orange", lwd=3)
     axis(1,years,at=c(1:length(years)))

# plot a legend
windows(8,6,rescale="fit")     

   plot(1, type="n", xlab="", ylab="",xaxt = 'n', yaxt = 'n',xlim=c(0,10),ylim=c(0,10), 
     pch=21,col="red",bg="orange")
   legend(0,10,"Shown WQI Removal Requirement (RR) (square size)",bty="n")
   legend(0,9,"for chemicals that have a calculated Purification Treatement Effort (PTE):",bty="n")

   legend(0,8,c(paste(round(min(pntsz),0),"WQI RR",sep=" "),
             paste(round(max(pntsz),0),"WQI RR",sep=" ")),
     pch=c(15,15),col=c("orange","orange"),
     pt.cex=c(min(pntsz)/300,max(pntsz)/300),x.intersp=c(3,3),y.intersp=3,bty="n") 


###############################################################################
###########  Part 5: Plot the combined WQI RR PTE  ###########################################     
     
ZII<-Dataset
ZOI<-ZII
Nomatch<-vector()
     for (jj in 1: nrow(ZII)){
          remove<-EPI_Matrix$RIWA_REMOVAL[grep(ZII$cas.nummer[jj],EPI_Matrix$CAS)] # what is the removal efficiency (0% = not removable)
        if(length(grep(ZII$cas.nummer[jj],EPI_Matrix$CAS))<1|length(grep(ZII$cas.nummer[jj],EPI_Matrix$CAS))>1){Nomatch<-c(Nomatch,jj) }
        if(length(grep(ZII$cas.nummer[jj],EPI_Matrix$CAS))==1){
                 # How much is removed? 3 = 0% (not removable), 0=100% (complete removal)
          ZII[jj,grep(Loc,colnames(ZII))]<-((as.numeric(ZII[jj,grep(Loc,colnames(ZII))])-remove)/(MaxRem-remove))*MaxRem
          #ZII[jj,which(ZII[jj,]<0)]<-ZOI[jj,which(ZII[jj,]<0)]- remove # this it also include neg. values
        } # end if
     } # end jj rows amplist
  
   # All that is removed enough to below threshold value, has a negative value.
   # This means the remaining removal requirement is zero.
   ZII[ZII<0]<-0
   ZII[which(apply(ZII[,grep(Loc,colnames(ZII))],1,sum,na.rm=TRUE)>1),] # which chemicals have a remaining removal requirment?

   # new situation
   ZIIndex<-apply(ZII[,grep(Loc,colnames(ZII))],2,sum,na.rm=TRUE)
 # old situation
   ZOIndex<-apply(ZOI[,grep(Loc,colnames(ZII))],2,sum,na.rm=TRUE)
       
# For some parameters, the removal effort may not be calculated (no data in EpiSuite) and these are omitted:
   rownames(ZII)[Nomatch]

# Plot the WQI_RR_PTE      
windows(8,6,rescale="fit")        
  my_palette <- colorRampPalette(c("yellow", "orange", "red"))(n = 20)
  scf<-120 # 120 is a scaling factor, adjust if neccesary
  WQI_PTE<-c(70,25,WQI_PTE) # 25 and 75 are scaling factors for the color scheme.
  WQI_PTE
  col=my_palette[as.numeric(cut(as.numeric(WQI_PTE),breaks = 20))]
  
  plot(ZIIndex,col=col[-c(1:2)], pch=16,ylim=c(0,700),cex=ZOIndex/scf,
      ylab="required purification treatment level",xlab="year",main=Loc,xaxt = 'n') 
  axis(1,years,at=c(1:length(years)))
       
   # calculate and plot the trendline
   fit <- glm(as.numeric(ZIIndex)~c(1:length(ZIIndex)))
   co <- coef(fit)
   paste("WQI PTE RR p-value",Loc ,round(summary(fit)$coefficients[2,4],4),"slope" , round(summary(fit)$coefficients[2,1],1))
   abline(fit, col="black", lwd=3) 

# plot the legend   
windows(8,6,rescale="fit")  
   plot(1, type="n", xlab="", ylab="",xaxt = 'n', yaxt = 'n',xlim=c(0,10),ylim=c(0,10), 
     pch=21,col="red",bg="orange",bty="n")

   legend(-0.9,10,"Removal Requirement (RR):",bty="n")

   legend(0,8,c(paste(min(ZOIndex),"WQI RR",sep=" "),
             paste(max(ZOIndex),"WQI RR",sep=" ")),
       pch=c(16,16),col=c("grey","grey"),
       pt.cex=c(min(ZOIndex)/scf,max(ZOIndex)/scf),x.intersp=c(3,3),y.intersp=3,bty="n") 

   legend(4.5,9.8,c("Low PTE", "Medium PTE", "High PTE"),title="Purification Treatment Effort (PTE):",
       col=c("yellow","orange","red"),pch=c(15,15,15),pt.cex=2,y.intersp=3,bty="n")      



#####################################################################################################
########### Part 6: Plot individual chemical  WQI_RR_PTE ###########################################     


PE<-ZII[,-ncol(ZII)]
PEI<-PE[which(apply(PE,1,sum,na.rm=TRUE)>0),] # welke stoffen overschrijden nog
ARI<-  EPI_Matrix$RIWA_REMOVAL[match(ZII$cas.nummer[which(apply(PE,1,sum,na.rm=TRUE)>0)],EPI_Matrix$CAS)]
ARIcol<-ifelse(ARI<35,"red",ifelse(ARI>70,"yellow","orange"))
  
# hier een heatplot met per stof per jaar de residual removal
windows(7,7) 
  par(mfrow = c(1,1),mar = c(0, 4, 0, 0))  
  my_palette <- colorRampPalette(c("lightblue", "darkblue", "black"))(n = 20)

  heatmap(as.matrix(PEI),na.rm=TRUE,Rowv=NA,Colv=NA,
                 scale="none",col=my_palette,symm=F,RowSideColors=ARIcol,margins=c(20,20))
  legend(0,0.25,c("low PTE < 35", "medium PTE", "high PTE > 70"),title="Purification Treatement Effort (PTE)",col=c("yellow","orange","red"),pch=c(15,15,15),cex=c(0.8))      
  legend(0.5,0.25,c("high RR PTE", "medium RR PTE", "low RR PTE"),title="Required purification treatment level",col=c("black","darkblue","lightblue"),pch=c(15,15,15),cex=c(0.8))      
  


############################ END OF SCRIPT ######################################################################