Introduction

This R Markdown document is designed to transform data that is not in CWP format into CWP format. Initially, it changes the format of the data; subsequently, it maps the data to adhere to CWP standards. This markdown is automatically generated from the function at https://raw.githubusercontent.com/eblondel/geoflow-tunaatlas/master/R/tunaatlas_scripts/pre-harmonization/southern_hemisphere_oceans_catch_1deg_1m_tunaatlasccsbt_level0__surface.R; the documentation keeps the format of the roxygen2 skeleton.

A summary of the mapping process is provided. The path to the dataset is specified. The first lines of each dataset are available in this same GitHub repository. The datasets are named after the historical names given by the tRFMOs at export time, and these names may change. The information provided in this Rmd makes it possible to identify which dataset should be used with this markdown.

Additional operations are performed next to verify other aspects of the data, such as the consistency of the geolocation, the values, and the reported catches in numbers and tons.

If you are interested in further details, the results and codes are available for review.

Each .Rmd script requires the user to knit the dataset at the beginning of the script in order to execute the harmonization process correctly. It is also possible to run the code chunk by chunk but be sure to be in the correct working directory (i.e., the one of the .Rmd).

# Location of the raw CCSBT surface catch workbook, built with here::here().
path_to_raw_dataset <- here::here(
  "R/tunaatlas_scripts/pre-harmonization",
  "ccsbt",
  "catch",
  "data",
  "CEData_Surface.xlsx"
)

Harmonize CCSBT Surface Catch Datasets

This function harmonizes the CCSBT Surface catch datasets, preparing them for integration into the Tuna Atlas database, according to specified format requirements.

@return None; the function outputs files directly, including harmonized datasets, optional metadata, and code lists for integration within the Tuna Atlas database.

@details This function modifies the dataset to ensure compliance with the standardized format, including renaming, reordering, and recalculating specific fields as necessary. Metadata integration is contingent on the intended use within the Tuna Atlas database.

@import dplyr @import readxl @importFrom stringr str_replace @seealso @export @keywords data harmonization, fisheries, CCSBT, tuna @author Paul Taconet, IRD @author Bastien Grasset, IRD

Input data sample (after importing as data.frame in R):

  # YEAR MONTH COUNTRY_CODE GEAR_CODE CCSBT_STATISTICAL_AREA LATITUDE LONGITUDE NUMBER_OF_HOURS_SEARCHED WEIGHT_(Kg)_OF_SBT_RETAINED
  # 1975    12           AU        PS                      4      -37       150                       17                       59800
  # 1976     8           AU        BB                      4      -36       150                        8                           0
  # 1976     9           AU        BB                      4      -37       150                       14                       36200
  # 1976     9           AU        BB                      4      -35       150                        4                           0
  # 1976     9           AU        BB                      4      -34       151                       21                           0
  # 1976    10           AU        BB                      4      -37       150                       51                       41000
  
  
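  # Catch: final data sample (legacy layout: the code below now renames the columns to
  # fishing_fleet, gear_type, ..., measurement_value and uses the codes UNK, SBF, RC and t):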
  # FishingFleet Gear time_start   time_end AreaName School Species CatchType CatchUnits Catch
  #   AU   BB 1976-09-01 1976-10-01  6337150    ALL     SBT       ALL         MT  36.2
  #   AU   BB 1976-10-01 1976-11-01  6337150    ALL     SBT       ALL         MT  41.0
  #   AU   BB 1976-12-01 1977-01-01  6332132    ALL     SBT       ALL         MT 167.5
  #   AU   BB 1976-12-01 1977-01-01  6333134    ALL     SBT       ALL         MT  35.6
  #   AU   BB 1976-12-01 1977-01-01  6334134    ALL     SBT       ALL         MT  56.5
  #   AU   BB 1976-12-01 1977-01-01  6334135    ALL     SBT       ALL         MT  37.0

  # Load the three shared harmonization helpers (time, spatial and DB time
  # format) from the firms-gta/geoflow-tunaatlas repository, in the same order.
  invisible(lapply(
    c(
      "https://raw.githubusercontent.com/firms-gta/geoflow-tunaatlas/master/R/sardara_functions/harmo_time_2.R",
      "https://raw.githubusercontent.com/firms-gta/geoflow-tunaatlas/master/R/sardara_functions/harmo_spatial_5.R",
      "https://raw.githubusercontent.com/firms-gta/geoflow-tunaatlas/master/R/sardara_functions/format_time_db_format.R"
    ),
    source
  ))
  #packages

  

# Install readxl on first use, then attach it. `library()` replaces the second
# `require()` so that a failed installation stops the script with an error
# instead of carrying on after a silent FALSE.
if (!requireNamespace("readxl", quietly = TRUE)) {
  install.packages("readxl")
}
library(readxl)

Historical name for the dataset at source CEData_Surface.xlsx

# Snapshot the current options, then set the `encoding` option to UTF-8.
# NOTE(review): `opts` is never used to restore the options in the visible
# part of the script; confirm whether that is intended.
opts <- options()
options(encoding = "UTF-8")

# Read the "CEData_Surface" sheet; empty cells become NA.
RFMO_CE <- readxl::read_excel(
  path_to_raw_dataset,
  sheet = "CEData_Surface",
  col_names = TRUE,
  col_types = NULL,
  na = ""
)

# Normalise the headers: line breaks and spaces become underscores, then the
# parentheses are removed, so the weight column can be addressed as
# WEIGHT_Kg_OF_SBT_RETAINED further down.
names(RFMO_CE) <- gsub(
  "\\)", "",
  gsub(
    "\\(", "",
    gsub(" ", "_", gsub("\r\n", "_", names(RFMO_CE)))
  )
)

# Continue with a plain data.frame rather than the tibble returned by readxl.
RFMO_CE <- as.data.frame(RFMO_CE)

Remove rows that are read from the Excel sheet but contain no actual data (rows whose YEAR is missing)

# Keep only the rows that carry a YEAR value.
RFMO_CE <- RFMO_CE[!is.na(RFMO_CE[["YEAR"]]), ]

# Force the retained weight and the search effort to numeric.
RFMO_CE[["WEIGHT_Kg_OF_SBT_RETAINED"]] <- as.numeric(RFMO_CE[["WEIGHT_Kg_OF_SBT_RETAINED"]])
RFMO_CE[["NUMBER_OF_HOURS_SEARCHED"]] <- as.numeric(RFMO_CE[["NUMBER_OF_HOURS_SEARCHED"]])

FishingFleet

# The fishing fleet is the reporting country code, copied unchanged.
RFMO_CE[["FishingFleet"]] <- RFMO_CE[["COUNTRY_CODE"]]

Gear

# Start from the raw gear code; the known codes are relabelled afterwards.
RFMO_CE[["Gear"]] <- RFMO_CE[["GEAR_CODE"]]

Replace PS with Purse Seine and BB with Pole and Line

# Spell out the two gear codes; these labels are the source codes that appear
# in the gear_ccsbt mapping summary printed at the end of the document.
RFMO_CE$Gear[RFMO_CE$Gear %in% "PS"] <- "Purse Seine"
RFMO_CE$Gear[RFMO_CE$Gear %in% "BB"] <- "Pole and Line"

Year and period

# Harmonize the time dimension from the YEAR and MONTH columns using the
# sourced helper harmo_time_2.
# NOTE(review): the exact columns this helper adds are not visible here; confirm against its source.
RFMO_CE<-harmo_time_2(RFMO_CE, "YEAR", "MONTH")

Format inputDataset time to have the time format of the DB, which is one column time_start and one time_end

# Convert the time information to the database layout; the time_start and
# time_end columns selected later in this script are presumably created here (TODO confirm).
RFMO_CE<-format_time_db_format(RFMO_CE)

Area

# Harmonize the spatial dimension from LATITUDE/LONGITUDE using the sourced
# helper harmo_spatial_5; presumably this is what adds the AreaName column used later.
# NOTE(review): the trailing 1 and 5 look like the grid resolution in degrees
# and a CWP grid size code; confirm against the helper's signature.
RFMO_CE<-harmo_spatial_5(RFMO_CE,"LATITUDE","LONGITUDE",1,5)

School

# Every row gets the constant school-type code "UNK".
RFMO_CE[["School"]] <- "UNK"

Species

# Single-species dataset: every row gets the species code "SBF".
RFMO_CE[["Species"]] <- "SBF"

CatchType

# "RC" stands for retained catches.
RFMO_CE[["CatchType"]] <- "RC"

Catch

# The source reports retained weight in kilograms; convert it to tonnes and
# record the unit code next to it.
RFMO_CE[["Catch"]] <- RFMO_CE[["WEIGHT_Kg_OF_SBT_RETAINED"]] / 1000
RFMO_CE[["CatchUnits"]] <- "t"

# Keep only the harmonized dimensions plus the catch value, in this order.
colToKeep_captures <- c(
  "FishingFleet", "Gear",
  "time_start", "time_end",
  "AreaName",
  "School", "Species",
  "CatchType", "CatchUnits", "Catch"
)
catches <- RFMO_CE[colToKeep_captures]

Remove trailing whitespace from columns that should not contain any

# Strip trailing spaces from the identifier columns. Each column is processed
# with lapply() rather than apply(): apply() coerces the data frame to a matrix
# and collapses a one-row data frame to a plain vector, which breaks the
# assignment back into the two columns.
# NOTE(review): assumes both columns are character; confirm against harmo_spatial_5.
catches[c("AreaName", "FishingFleet")] <- lapply(
  catches[c("AreaName", "FishingFleet")],
  function(x) gsub(" *$", "", x)
)

remove 0 and NA values

# Keep only rows with a reported, non-zero catch.
catches <- catches[!is.na(catches$Catch) & catches$Catch != 0, ]


# Sum the catch over all nine dimensions so that each combination of
# fleet, gear, period, area, school, species, catch type and unit appears once.
# aggregate() names the summed column "x".
catches <- aggregate(
  catches$Catch,
  by = as.list(
    catches[c(
      "FishingFleet", "Gear", "time_start", "time_end", "AreaName",
      "School", "Species", "CatchType", "CatchUnits"
    )]
  ),
  FUN = sum
)

# Switch to the CWP column names (same column order as the aggregated table).
names(catches) <- c(
  "fishing_fleet", "gear_type",
  "time_start", "time_end",
  "geographic_identifier", "fishing_mode", "species",
  "measurement_type", "measurement_unit", "measurement_value"
)

# Make sure both period bounds are Date objects.
catches$time_start <- as.Date(catches$time_start)
catches$time_end <- as.Date(catches$time_end)

# Constant descriptors for this dataset.
catches$source_authority <- "CCSBT"
catches$measurement_type <- "RC" # Retained catches
catches$measurement <- "catch"
catches$measurement_processing_level <- "unknown"

# Temporal extent expressed in whole calendar years: 1 January of the earliest
# start year to 31 December of the latest end year, joined by "/".
dataset_temporal_extent <- paste(
  c(
    paste0(format(min(catches$time_start), "%Y"), "-01-01"),
    paste0(format(max(catches$time_end), "%Y"), "-12-31")
  ),
  collapse = "/"
)

output in same folder as path_to_raw_dataset

# Destination of the harmonized table: the same data folder as the raw workbook.
output_name_dataset <- here::here(
  "R/tunaatlas_scripts/pre-harmonization",
  "ccsbt",
  "catch",
  "data",
  "CEData_Surface_harmonized.csv"
)

# Keep the harmonized table under the name used by the mapping step, and
# write it to disk without row names.
georef_dataset <- catches
write.csv(x = catches, file = output_name_dataset, row.names = FALSE)

Load pre-harmonization scripts and apply mappings

# Fetch the code-list mapping helper into the working directory and load it.
download.file(
  url = "https://raw.githubusercontent.com/firms-gta/geoflow-tunaatlas/master/R/tunaatlas_scripts/pre-harmonization/map_codelists_no_DB.R",
  destfile = "local_map_codelists_no_DB.R"
)
source(file = "local_map_codelists_no_DB.R")

# Map the RFMO-specific codes of the harmonized catch table to the global code
# lists; the source codes are not kept and a mapping summary is requested.
fact <- "catch"
mapping_codelist <- map_codelists_no_DB(
  fact,
  mapping_dataset = "https://raw.githubusercontent.com/fdiwg/fdi-mappings/main/global/firms/gta/codelist_mapping_rfmos_to_global.csv",
  dataset_to_map = georef_dataset,
  mapping_keep_src_code = FALSE,
  summary_mapping = TRUE,
  source_authority_to_map = c("IATTC", "CCSBT", "WCPFC")
)
## 
##  mapping dimension gear_type with code list mapping
## 
##  mapping dimension species with code list mapping
## 
##  mapping dimension fishing_fleet with code list mapping
## 
##  mapping dimension fishing_mode with code list mapping

Handle unmapped values and save the results

# Recode values left as 'UNK' after the mapping: fleet becomes 'NEI' and gear
# becomes '99.9'. dplyr::mutate() is called directly instead of through `%>%`:
# the visible script never attaches magrittr or dplyr itself, so the pipe
# operator would only exist if a sourced helper happened to attach it.
georef_dataset <- dplyr::mutate(
  mapping_codelist$dataset_mapped,
  fishing_fleet = ifelse(fishing_fleet == 'UNK', 'NEI', fishing_fleet),
  gear_type = ifelse(gear_type == 'UNK', '99.9', gear_type)
)

# Persist the mapping summary, the codes that could not be mapped, and the
# final CWP-formatted dataset next to the raw workbook.
data.table::fwrite(mapping_codelist$recap_mapping, here::here('R/tunaatlas_scripts/pre-harmonization', 'ccsbt', 'catch', 'data', 'CEData_Surface_recap_mapping.csv'))
data.table::fwrite(mapping_codelist$not_mapped_total, here::here('R/tunaatlas_scripts/pre-harmonization', 'ccsbt', 'catch', 'data', 'CEData_Surface_not_mapped_total.csv'))
data.table::fwrite(georef_dataset, here::here('R/tunaatlas_scripts/pre-harmonization', 'ccsbt', 'catch', 'data', 'CEData_Surface_CWP_dataset.csv'))

Display the first few rows of the mapping summaries

# Show the first rows of the source-to-target code mapping summary.
print(utils::head(mapping_codelist[["recap_mapping"]]))
## # A tibble: 5 × 5
##   src_code      trg_code src_codingsystem trg_codingsystem   source_authority
##   <chr>         <chr>    <chr>            <chr>              <chr>           
## 1 UNK           UNK      schooltype_ccsbt schooltype_rfmos   CCSBT           
## 2 AU            AUS      flag_ccsbt       fishingfleet_firms CCSBT           
## 3 SBF           SBF      species_ccsbt    species_asfis      CCSBT           
## 4 Pole and Line 09.1     gear_ccsbt       isscfg_revision_1  CCSBT           
## 5 Purse Seine   01.1     gear_ccsbt       isscfg_revision_1  CCSBT