#' Quality-control ICCAT marker tag data
#' 
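#' \code{qc_iccat_yearmonth} cleans and formats ICCAT tag data. Rows are removed only when they fail a quality filter (unknown release date, a QC error flag, missing date or coordinates, or a location on land when \code{bathy_file} is supplied) or when they duplicate another observation in the same year-month and 0.01-degree latitude/longitude bin.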
#' 
#' @param unrar_name is filename of XLS file output from \code{get_iccat}
#' @param bathy_file is path to file containing bathymetry data. File must be read-able by raster::raster().
#' @return a list of two dataframes: one of cleaned ICCAT marker tag data (ctag) and one of basic qc metrics (qc)
#' 
#' @export

qc_iccat_yearmonth <- function(unrar_name, bathy_file = NULL){
  ## Reads the ICCAT tagging spreadsheet, reshapes releases and recoveries into
  ## one long table (one row per observation), applies the quality filters and
  ## keeps a single observation per year-month / 0.01-degree lat-lon bin.
  ##
  ## unrar_name: path to the XLSX file to read (sheet 1 is used).
  ## bathy_file: optional path to a raster readable by raster::raster(); when
  ##             given, only locations with a value <= -1 are kept.
  ## Returns list(ctag = cleaned observations, qc = data.frame of var/val
  ## metrics, both columns character).

  ## ---- local helpers -------------------------------------------------------

  ## Convert a date column to POSIXct. Values that are numeric (or numeric
  ## stored as text) are treated as Excel serial days; anything else that is not
  ## missing or 'Unk' is parsed as 'Ymd' / 'mdy' text. as.numeric() only warns
  ## on text (it does not raise an error), so the text fallback has to be
  ## selected explicitly per element rather than through try().
  parse_ctag_date <- function(x){
    txt <- as.character(x)
    serial <- suppressWarnings(as.numeric(x))
    out <- lubridate::as_datetime((serial - 2) * 3600 * 24, origin = '1900-01-01')
    is_text <- is.na(serial) & !is.na(txt) & txt != 'Unk'
    if (any(is_text)){
      parsed <- tryCatch(
        lubridate::parse_date_time(txt[is_text], orders = c('Ymd', 'mdy')),
        error = function(e) stop('Unable to parse ctag dates as numeric or character', call. = FALSE)
      )
      out[is_text] <- parsed
    }
    out
  }

  ## Append one metric to the qc table; values are stored as character.
  add_qc <- function(qc, var, val){
    qc[nrow(qc) + 1L, ] <- list(var, as.character(val))
    qc
  }

  ## ---- read the extracted file ---------------------------------------------
  ## The header row is not at a fixed position, so look for 'TagGrpID' in the
  ## first 10 rows and start reading from there.
  header_probe <- openxlsx::read.xlsx(unrar_name, sheet = 1, startRow = 1, colNames = FALSE, rows = c(1:10))
  start_row <- which(header_probe == "TagGrpID", arr.ind = TRUE)[1]
  if (is.na(start_row)){
    stop("Could not find a 'TagGrpID' header in the first 10 rows of ", unrar_name, call. = FALSE)
  }
  ctag <- openxlsx::read.xlsx(unrar_name, sheet = 1, startRow = start_row, colNames = TRUE)
  ## the probe's row index can be one short of the sheet row, so retry one row down
  if (!('TagGrpID' %in% names(ctag))){
    ctag <- openxlsx::read.xlsx(unrar_name, sheet = 1, startRow = start_row + 1, colNames = TRUE)
  }

  ## setup a qc dataframe (character columns so later rows can be appended safely)
  qc <- data.frame(var = 'rows_start', val = as.character(nrow(ctag)), stringsAsFactors = FALSE)

  ## ---- format dates ----------------------------------------------------------
  ## drop rows whose release date is 'Unk' or missing
  ctag <- ctag[which(ctag$ReDate != 'Unk'), , drop = FALSE]
  rownames(ctag) <- NULL

  ctag$ReDate <- parse_ctag_date(ctag$ReDate)
  ctag$RcDate <- parse_ctag_date(ctag$RcDate)

  ## some dates are sometimes read incorrectly such as thought to be in the wrong century.
  ## Compare against a POSIXct "now": comparing POSIXct with a Date is not a
  ## well-defined operation in R.
  now <- Sys.time()
  century <- difftime('2010-01-01', '1910-01-01')
  future_re <- which(ctag$ReDate > now)
  ctag$ReDate[future_re] <- ctag$ReDate[future_re] - century
  future_rc <- which(ctag$RcDate > now)
  ctag$RcDate[future_rc] <- ctag$RcDate[future_rc] - century
  ctag$ReYear <- lubridate::year(ctag$ReDate)
  ctag$RcYear <- lubridate::year(ctag$RcDate)

  ## ---- ids and column clean-up ----------------------------------------------
  ## one uid per distinct SpecimenID / strTags1 pair, numbered in order of first
  ## appearance; repeated pairs get the uid of their first occurrence
  id_cols <- ctag[, c('SpecimenID', 'strTags1')]
  id_key <- paste(id_cols[[1]], id_cols[[2]], sep = '\r')
  ctag$uid <- match(id_key, unique(id_key))

  ## drop unnecessary cols (logical mask: safe when none of them are present)
  unused_cols <- c('InProcID', 'RC', 'RCStageCode', 'strTags2', 'strTags3', 'strTags4', 'Rc-ReYear', 'GrpYrsRec')
  ctag <- ctag[, !(names(ctag) %in% unused_cols), drop = FALSE]

  ## some files label the release length columns without the 'Re' prefix
  if ('LenType' %in% names(ctag) && 'RcLenType' %in% names(ctag) && !('ReLenType' %in% names(ctag))){
    names(ctag)[names(ctag) == 'LenType'] <- 'ReLenType'
  }
  if ('LenMethod' %in% names(ctag) && 'RcLenMethod' %in% names(ctag) && !('ReLenMethod' %in% names(ctag))){
    names(ctag)[names(ctag) == 'LenMethod'] <- 'ReLenMethod'
  }

  ## ---- tidy ctag df by splitting then re-combining releases and recaptures ----
  ctag.re <- ctag[which(!is.na(ctag$ReLatY)), , drop = FALSE]
  ctag.re <- ctag.re[, !grepl('^Rc', names(ctag.re)), drop = FALSE]

  ctag.rc <- ctag[which(!is.na(ctag$RcLatY)), , drop = FALSE]
  ctag.rc <- ctag.rc[, !grepl('^Re', names(ctag.rc)), drop = FALSE]

  ctag.re$ObsType <- 'release'
  ctag.rc$ObsType <- 'recovery'
  qc <- add_qc(qc, 'rows_release', nrow(ctag.re))
  qc <- add_qc(qc, 'rows_recovery', nrow(ctag.rc))

  ## give recovery columns the release names so both tables stack, then strip
  ## the shared 'Re' prefix from the combined table
  names(ctag.rc) <- sub('^Rc', 'Re', names(ctag.rc))
  ctag.new <- rbind(ctag.re, ctag.rc)
  names(ctag.new) <- sub('^Re', '', names(ctag.new))

  ## move uid and ObsType to the front
  ctag.new <- ctag.new[, c('uid', 'ObsType', setdiff(names(ctag.new), c('uid', 'ObsType')))]
  ctag.new <- ctag.new[which(ctag.new$QcErrorID == '' | is.na(ctag.new$QcErrorID)),] # only keep those without error comments
  ctag.new <- ctag.new[order(ctag.new$uid, ctag.new$Date),]
  ctag.new <- ctag.new[which(!is.na(ctag.new$Date)),]
  ctag.new$LatY <- as.numeric(ctag.new$LatY)
  ctag.new$LonX <- as.numeric(ctag.new$LonX)
  ctag.new <- ctag.new[which(!is.na(ctag.new$LatY)),]
  ctag.new <- ctag.new[which(!is.na(ctag.new$LonX)),]
  ctag.new$ObsType <- factor(ctag.new$ObsType, levels = c('release', 'recovery'))

  ## remove spurious tags west of 100W, this is the Atlantic after all, and within 80N to 80S which are HYCOM bounds
  #ctag.new <- ctag.new[which(ctag.new$LonX >= -100 & ctag.new$LatY <= 80 & ctag.new$LatY >= -80),]

  ## ---- bathy filter to remove all ctag locations on land ----------------------
  if (!is.null(bathy_file)){
    bathy <- raster::raster(bathy_file)
    ctag.new$bathy <- raster::extract(bathy, cbind(ctag.new$LonX, ctag.new$LatY))
    ctag.new <- ctag.new[which(ctag.new$bathy <= -1),]
  } else{
    message('No bathymetric filter was applied as input bathy_file was missing.')
    ctag.new$bathy <- NA
  }

  ## ---- select cols of interest -------------------------------------------------
  ctag.new <- ctag.new[,c('uid','ObsType','TagGrpID','SpecimenID','strTags1','SpeciesCode','Sex','FleetCode',
                          'GearCode','Date','LatY','LonX','LenCM','Len','LenUnit','LenType','LenMethod',
                          'WgtKG','Wgt','WgtUnit','WgtType','WgtMethod','QcErrorID')]
  qc <- add_qc(qc, 'rows_after_qc', nrow(ctag.new))

  ## ---- remove duplicates -------------------------------------------------------
  ## keep the first observation per calendar year-month and 0.01-degree
  ## (roughly 1 km) lat/lon bin -> hence round(x, 2)
  ctag.new$LatBin <- round(ctag.new$LatY, 2)
  ctag.new$LonBin <- round(ctag.new$LonX, 2)
  ctag.new$YearMonth <- paste0(lubridate::year(ctag.new$Date), lubridate::month(ctag.new$Date))
  ctag.new <- ctag.new[!duplicated(ctag.new[,c('YearMonth','LatBin','LonBin')]),]
  qc <- add_qc(qc, 'rows_after_summarise', nrow(ctag.new))
  ## YearMonth was only a de-duplication key; LatBin / LonBin stay in the output
  ctag.new$YearMonth <- NULL

  ## ---- change date/lon/lat names to standardize ---------------------------------
  names(ctag.new)[names(ctag.new) == 'LonX'] <- 'lon'
  names(ctag.new)[names(ctag.new) == 'LatY'] <- 'lat'
  names(ctag.new)[names(ctag.new) == 'Date'] <- 'date'

  qc <- add_qc(qc, 'start_date', format(min(ctag.new$date), '%Y-%m-%d'))
  qc <- add_qc(qc, 'end_date', format(max(ctag.new$date), '%Y-%m-%d'))
  qc <- add_qc(qc, 'x_min', round(min(ctag.new$lon), 2))
  qc <- add_qc(qc, 'x_max', round(max(ctag.new$lon), 2))
  qc <- add_qc(qc, 'y_min', round(min(ctag.new$lat), 2))
  qc <- add_qc(qc, 'y_max', round(max(ctag.new$lat), 2))

  list(ctag = ctag.new, qc = qc)
}
