Read in the catch data, then clean, filter and process it. Unlike the BT data, the FM data comes in several data files, one for each tab of the spreadsheet FM09_alla_grundutrdag_combinedbyAG_190916. The files have different layouts, so each one is cleaned separately.

Load libraries

library(tidyverse)
#> ── Attaching packages ─────────────────────────────────────── tidyverse 1.3.2 ──
#> ✔ ggplot2 3.3.6      ✔ purrr   0.3.4 
#> ✔ tibble  3.1.8      ✔ dplyr   1.0.10
#> ✔ tidyr   1.2.0      ✔ stringr 1.4.1 
#> ✔ readr   2.1.1      ✔ forcats 0.5.1
#> Warning: package 'tidyr' was built under R version 4.0.5
#> ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
#> ✖ dplyr::filter() masks stats::filter()
#> ✖ dplyr::lag()    masks stats::lag()
library(tidylog)
#> 
#> Attaching package: 'tidylog'
#> 
#> The following objects are masked from 'package:dplyr':
#> 
#>     add_count, add_tally, anti_join, count, distinct, distinct_all,
#>     distinct_at, distinct_if, filter, filter_all, filter_at, filter_if,
#>     full_join, group_by, group_by_all, group_by_at, group_by_if,
#>     inner_join, left_join, mutate, mutate_all, mutate_at, mutate_if,
#>     relocate, rename, rename_all, rename_at, rename_if, rename_with,
#>     right_join, sample_frac, sample_n, select, select_all, select_at,
#>     select_if, semi_join, slice, slice_head, slice_max, slice_min,
#>     slice_sample, slice_tail, summarise, summarise_all, summarise_at,
#>     summarise_if, summarize, summarize_all, summarize_at, summarize_if,
#>     tally, top_frac, top_n, transmute, transmute_all, transmute_at,
#>     transmute_if, ungroup
#> 
#> The following objects are masked from 'package:tidyr':
#> 
#>     drop_na, fill, gather, pivot_longer, pivot_wider, replace_na,
#>     spread, uncount
#> 
#> The following object is masked from 'package:stats':
#> 
#>     filter
library(RColorBrewer)
#> Warning: package 'RColorBrewer' was built under R version 4.0.5
library(patchwork)
sessionInfo() 
#> R version 4.0.2 (2020-06-22)
#> Platform: x86_64-apple-darwin17.0 (64-bit)
#> Running under: macOS  10.16
#> 
#> Matrix products: default
#> BLAS:   /Library/Frameworks/R.framework/Versions/4.0/Resources/lib/libRblas.dylib
#> LAPACK: /Library/Frameworks/R.framework/Versions/4.0/Resources/lib/libRlapack.dylib
#> 
#> locale:
#> [1] en_US.UTF-8/en_US.UTF-8/en_US.UTF-8/C/en_US.UTF-8/en_US.UTF-8
#> 
#> attached base packages:
#> [1] stats     graphics  grDevices utils     datasets  methods   base     
#> 
#> other attached packages:
#>  [1] patchwork_1.1.1    RColorBrewer_1.1-3 tidylog_1.0.2      forcats_0.5.1     
#>  [5] stringr_1.4.1      dplyr_1.0.10       purrr_0.3.4        readr_2.1.1       
#>  [9] tidyr_1.2.0        tibble_3.1.8       ggplot2_3.3.6      tidyverse_1.3.2   
#> 
#> loaded via a namespace (and not attached):
#>  [1] Rcpp_1.0.8          lubridate_1.8.0     clisymbols_1.2.0   
#>  [4] assertthat_0.2.1    digest_0.6.30       utf8_1.2.2         
#>  [7] R6_2.5.1            cellranger_1.1.0    backports_1.3.0    
#> [10] reprex_2.0.1        evaluate_0.16       httr_1.4.4         
#> [13] pillar_1.8.1        rlang_1.0.6         googlesheets4_1.0.0
#> [16] readxl_1.3.1        rstudioapi_0.14     jquerylib_0.1.4    
#> [19] rmarkdown_2.16      googledrive_2.0.0   munsell_0.5.0      
#> [22] broom_1.0.1         compiler_4.0.2      modelr_0.1.8       
#> [25] xfun_0.33           pkgconfig_2.0.3     htmltools_0.5.3    
#> [28] tidyselect_1.1.2    fansi_1.0.3         crayon_1.4.2       
#> [31] tzdb_0.2.0          dbplyr_2.1.1        withr_2.5.0        
#> [34] grid_4.0.2          jsonlite_1.8.0      gtable_0.3.1       
#> [37] lifecycle_1.0.3     DBI_1.1.1           magrittr_2.0.3     
#> [40] scales_1.2.1        cli_3.4.1           stringi_1.7.8      
#> [43] cachem_1.0.6        fs_1.5.2            xml2_1.3.3         
#> [46] bslib_0.4.0         ellipsis_0.3.2      generics_0.1.2     
#> [49] vctrs_0.5.0         tools_4.0.2         glue_1.6.2         
#> [52] hms_1.1.1           fastmap_1.1.0       yaml_2.3.5         
#> [55] colorspace_2.0-3    gargle_1.2.0        rvest_1.0.3        
#> [58] knitr_1.40          haven_2.5.1         sass_0.4.2

Forsmark

1983-1986

# Need to set fileEncoding here, else error: "invalid multibyte string 18"
df83 <- read.csv("data/raw/Catch_data_FM09__1983-86_190916.csv", sep = ";", fileEncoding = "latin1")

# Tidy data: keep rows with species code "ABBO" from before 2004, rename the
# Swedish column names that are used later to English, and drop the
# environmental columns that are not used in this script.
# The year column is called Årtal in the raw file (same name as in rename()).
df83 <- df83 %>%
  filter(Art == "ABBO", Årtal < 2004) %>%
  rename(year = Årtal, week = Vecka, day = Dag, effort = Ansträngning, species = Art,
         weight = Vikt, n = Antal) %>%
  select(-c(Vtn.stånd, VindRiktn.I, VindSt.I, Vind_upp_rikn, VindSt.Upp, Ström_I_rikn,
            Ström_upp_rikn, Salthalt_I_yta, Salthalt_I_botten, Salthalt_upp_yta,
            Salthalt_upp_botten, Drift_i, Drift_u, Drift_dim, Siktdjup, Lufttryck_i,
            Lufttryck_upp, Sjuk_kontroll, X..))
#> filter: removed 987 rows (74%), 345 rows remaining
#> rename: renamed 7 variables (year, week, day, effort, species, …)
#> select: dropped 19 variables (Vtn.stånd, VindRiktn.I, VindSt.I, Vind_upp_rikn, VindSt.Upp, …)

# The counts per length class are spread over the columns X1..X14. Reshape to
# long format: the old column name goes into `length`, the count into `n2`,
# and length classes without a count (NA) are dropped.
dat83 <- df83 %>%
  gather(length, n2, c(X1, X2, X3, X4, X5, X6, X7, X8, X9, X10, X11, X12, X13, X14),
         na.rm = TRUE)
#> gather: reorganized (X1, X2, X3, X4, X5, …) into (length, n2) [was 345x38, now 952x26]

head(df83)
head(dat83) # Now the data is in a long, tidy format. 

# Test it is correct:
subset(df83, year == 1983 & week == 30 & day == 2 & Stations_namn == "Asphällan")
subset(dat83, year == 1983 & week == 30 & day == 2 & Stations_namn == "Asphällan")

# Create a numeric length_group column by dropping the leading "X" from the old column name
dat83$length_group <- as.numeric(substring(dat83$length, 2))

head(dat83)                            
str(dat83)
#> 'data.frame':    952 obs. of  27 variables:
#>  $ Provfiske    : chr  "FM09S" "FM09S" "FM09S" "FM09S" ...
#>  $ Area         : chr  "FM" "FM" "FM" "FM" ...
#>  $ Areanamn     : chr  "Forsmark" "Forsmark" "Forsmark" "Forsmark" ...
#>  $ Sektion      : int  1 1 1 1 1 1 1 1 1 1 ...
#>  $ Sektion_namn : chr  "Syd Biotestsjön" "Syd Biotestsjön" "Syd Biotestsjön" "Syd Biotestsjön" ...
#>  $ Station      : int  31 31 31 31 31 31 31 31 31 31 ...
#>  $ Stations_namn: chr  "Asphällan" "Asphällan" "Asphällan" "Asphällan" ...
#>  $ Position_N   : chr  "60 24,59" "60 24,59" "60 24,59" "60 24,59" ...
#>  $ Position_E   : chr  "18 11,88" "18 11,88" "18 11,88" "18 11,88" ...
#>  $ Redskap      : int  9 9 9 9 9 9 9 9 9 9 ...
#>  $ year         : int  1983 1983 1983 1983 1983 1983 1983 1983 1983 1983 ...
#>  $ week         : int  30 30 30 31 31 31 31 32 32 32 ...
#>  $ day          : int  2 2 5 3 3 5 5 2 2 4 ...
#>  $ YtTmp.I      : logi  NA NA NA NA NA NA ...
#>  $ YtTmp.Upp    : logi  NA NA NA NA NA NA ...
#>  $ effort       : int  1 1 1 1 1 1 1 1 1 1 ...
#>  $ Nr           : int  1 2 2 1 2 1 2 1 2 1 ...
#>  $ Störning     : int  0 0 0 0 0 0 0 0 0 0 ...
#>  $ TmpBtnI      : chr  "17,50" "17,80" "18,80" "17,60" ...
#>  $ TmpBtnU      : chr  "17,30" "17,30" "18,30" "17,30" ...
#>  $ species      : chr  "ABBO" "ABBO" "ABBO" "ABBO" ...
#>  $ weight       : logi  NA NA NA NA NA NA ...
#>  $ Längdgr_std  : int  1 1 1 1 1 1 1 1 1 1 ...
#>  $ n            : int  8 53 19 2 50 1 46 19 84 10 ...
#>  $ length       : chr  "X2" "X2" "X2" "X2" ...
#>  $ n2           : int  1 16 1 2 17 1 20 4 41 5 ...
#>  $ length_group : num  2 2 2 2 2 2 2 2 2 2 ...

# The n2 column tells us how many fish are in that size-group. We want one row
# per fish, so every row is repeated n2 times.
# seq_len() (rather than seq()) keeps this correct for a zero-row data frame.
dat83 <- dat83[rep(seq_len(nrow(dat83)), times = dat83$n2), ]
head(dat83, 50)

# Test it is correct:
df83 %>%
  filter(year == 1983 & week == 30 & day == 2 & Stations_namn == "Asphällan") %>% 
  arrange(n) %>% 
  as.data.frame()
#> filter: removed 343 rows (99%), 2 rows remaining

dat83 %>%
  filter(year == 1983 & week == 30 & day == 2 & Stations_namn == "Asphällan") %>% 
  arrange(n) %>% 
  dplyr::select(-YtTmp.I, -YtTmp.Upp, -Störning, -TmpBtnU, -Sektion_namn, -Sektion,
                -Areanamn, -Position_N, -Position_E, -Provfiske, -Redskap) %>% 
  as.data.frame()
#> filter: removed 9,413 rows (99%), 61 rows remaining

1987-1990

df87 <- read.csv("data/raw/Catch_data_FM09__1987-90_190916.csv", sep = ";", fileEncoding = "latin1")

# Tidy data: keep rows with species code "ABBO" from before 2004, rename the
# Swedish column names that are used later to English, and drop the
# environmental columns that are not used in this script.
# The year column is called Årtal in the raw file (same name as in rename()).
df87 <- df87 %>%
  filter(Art == "ABBO", Årtal < 2004) %>%
  rename(year = Årtal, week = Vecka, day = Dag, effort = Ansträngning, species = Art,
         weight = Vikt, n = Antal) %>%
  select(-c(Vtn.stånd, VindRiktn.I, VindSt.I, Vind_upp_rikn, VindSt.Upp, Ström_I_rikn,
            Ström_upp_rikn, Salthalt_I_yta, Salthalt_I_botten, Salthalt_upp_yta,
            Salthalt_upp_botten, Drift_i, Drift_u, Drift_dim, Siktdjup, Lufttryck_i,
            Lufttryck_upp, Sjuk_kontroll, X..))
#> filter: removed 1,114 rows (75%), 366 rows remaining
#> rename: renamed 7 variables (year, week, day, effort, species, …)
#> select: dropped 19 variables (Vtn.stånd, VindRiktn.I, VindSt.I, Vind_upp_rikn, VindSt.Upp, …)

# Convert to a long data frame: every listed X-column becomes rows, with the
# old column name stored in `length` and the old cell value stored in `n2`.
# Length classes without a count (NA) are dropped.
dat87 <- df87 %>%
  gather(length, n2, c(X6, X9, X11, X14, X16, X19, X21, X24, X26, X29, X31, X34, X36, X39,
                       X41, X44, X46, X49, X51, X56, X59, X61, X64, X71), na.rm = TRUE)
#> gather: reorganized (X6, X9, X11, X14, X16, …) into (length, n2) [was 366x48, now 1384x26]

# Test it is correct:
subset(df87, year == 1987 & week == 32 & day == 2 & Stations_namn == "Asphällan")
subset(dat87, year == 1987 & week == 32 & day == 2 & Stations_namn == "Asphällan")

# Create a numeric length_group column by dropping the leading "X" from the old column name
dat87$length_group <- as.numeric(substring(dat87$length, 2))

# The n2 column tells us how many fish are in that size-group. We want one row
# per fish, so every row is repeated n2 times.
# seq_len() (rather than seq()) keeps this correct for a zero-row data frame.
dat87 <- dat87[rep(seq_len(nrow(dat87)), times = dat87$n2), ]
head(dat87, 50)

# Test it is correct:
df87 %>%
  filter(year == 1987 & week == 32 & day == 2 & Stations_namn == "Asphällan") %>% 
  arrange(n) %>% 
  as.data.frame()
#> filter: removed 364 rows (99%), 2 rows remaining

dat87 %>%
  filter(year == 1987 & week == 32 & day == 2 & Stations_namn == "Asphällan") %>% 
  arrange(n) %>% 
  dplyr::select(-YtTmp.I, -YtTmp.Upp, -Störning, -TmpBtnU, -Sektion_namn, -Sektion,
                -Areanamn, -Position_N, -Position_E, -Provfiske, -Redskap) %>% 
  as.data.frame()
#> filter: removed 8,174 rows (99%), 52 rows remaining

1991-2000

df91 <- read.csv("data/raw/Catch_data_FM09__1991-00_190916.csv", sep = ";", fileEncoding = "latin1")

# Tidy data: keep rows with species code "ABBO" from before 2004, rename the
# Swedish column names that are used later to English, and drop the
# environmental columns that are not used in this script.
# The year column is called Årtal in the raw file (same name as in rename()).
df91 <- df91 %>%
  filter(Art == "ABBO", Årtal < 2004) %>%
  rename(year = Årtal, week = Vecka, day = Dag, effort = Ansträngning, species = Art,
         weight = Vikt, n = Antal) %>%
  select(-c(Vtn.stånd, VindRiktn.I, VindSt.I, Vind_upp_rikn, VindSt.Upp, Ström_I_rikn,
            Ström_upp_rikn, Salthalt_I_yta, Salthalt_I_botten, Salthalt_upp_yta,
            Salthalt_upp_botten, Drift_i, Drift_u, Drift_dim, Siktdjup, Lufttryck_i,
            Lufttryck_upp, Sjuk_kontroll))
#> filter: removed 2,032 rows (81%), 478 rows remaining
#> rename: renamed 7 variables (year, week, day, effort, species, …)
#> select: dropped 18 variables (Vtn.stånd, VindRiktn.I, VindSt.I, Vind_upp_rikn, VindSt.Upp, …)

# Go from wide to long format: the old column name goes into `length`, the
# count into `n2`, and length classes without a count (NA) are dropped.
dat91 <- df91 %>%
  gather(length, n2, c(X9, X11, X14, X16, X19, X21, X24, X26, X29, X31, X34, X36, X39, X41, X44,
                       X46, X49, X51, X54, X56, X59, X61, X66, X69, X71, X76, X81, X101),
         na.rm = TRUE)
#> gather: reorganized (X9, X11, X14, X16, X19, …) into (length, n2) [was 478x52, now 3185x26]

# Test it is correct:
subset(df91, year == 1992 & week == 32 & day == 2 & Stations_namn == "Asphällan")
subset(dat91, year == 1992 & week == 32 & day == 2 & Stations_namn == "Asphällan")

# Create a numeric length_group column by dropping the leading "X" from the old column name
dat91$length_group <- as.numeric(substring(dat91$length, 2))

# The n2 column tells us how many fish are in that size-group. We want one row
# per fish, so every row is repeated n2 times.
# seq_len() (rather than seq()) keeps this correct for a zero-row data frame.
dat91 <- dat91[rep(seq_len(nrow(dat91)), times = dat91$n2), ]
head(dat91, 50)

# Test it is correct:
df91 %>%
  filter(year == 1992 & week == 32 & day == 2 & Stations_namn == "Asphällan") %>% 
  arrange(n) %>% 
  as.data.frame()
#> filter: removed 477 rows (>99%), one row remaining

dat91 %>%
  filter(year == 1992 & week == 32 & day == 2 & Stations_namn == "Asphällan") %>% 
  arrange(n) %>% 
  dplyr::select(-YtTmp.I, -YtTmp.Upp, -Störning, -TmpBtnU, -Sektion_namn, -Sektion,
                -Areanamn, -Position_N, -Position_E, -Provfiske, -Redskap) %>% 
  as.data.frame()
#> filter: removed 31,725 rows (>99%), 99 rows remaining

2001-2003 (from the 2001-06 file; only years before 2004 are kept)

df01 <- read.csv("data/raw/Catch_data_FM09__2001-06_190916.csv", sep = ";", fileEncoding = "latin1")

# Tidy data: keep rows with species code "ABBO" from before 2004, rename the
# Swedish column names that are used later to English, and drop the
# environmental columns that are not used in this script.
# The year column is called Årtal in the raw file (same name as in rename()).
df01 <- df01 %>%
  filter(Art == "ABBO", Årtal < 2004) %>%
  rename(year = Årtal, week = Vecka, day = Dag, effort = Ansträngning, species = Art,
         weight = Vikt, n = Antal) %>%
  select(-c(Vtn.stånd, VindRiktn.I, VindSt.I, Vind_upp_rikn, VindSt.Upp, Ström_I_rikn,
            Ström_upp_rikn, Salthalt_I_yta, Salthalt_I_botten, Salthalt_upp_yta,
            Salthalt_upp_botten, Drift_i, Drift_u, Drift_dim, Siktdjup, Lufttryck_i,
            Lufttryck_upp, Sjuk_kontroll, X..))
#> filter: removed 2,293 rows (79%), 601 rows remaining
#> rename: renamed 7 variables (year, week, day, effort, species, …)
#> select: dropped 19 variables (Vtn.stånd, VindRiktn.I, VindSt.I, Vind_upp_rikn, VindSt.Upp, …)


# Go from wide to long format: the old column name goes into `length`, the
# count into `n2`, and length classes without a count (NA) are dropped.
dat01 <- df01 %>%
  gather(length, n2, c(X7, X9, X10, X11, X12, X13, X14, X15, X16, X17, X18, X19, X20, X21, X22, X23, X24, X25,
                       X26, X27, X28, X29, X30, X31, X32, X33, X34, X35, X36, X37, X38, X39, X40, X41, X42, X43,
                       X44, X45, X46, X47, X48, X49, X50, X51, X52, X55, X62, X65, X80, X83, X90),
         na.rm = TRUE)
#> gather: reorganized (X7, X9, X10, X11, X12, …) into (length, n2) [was 601x75, now 2189x26]

# Test it is correct:
subset(df01, year == 2002 & week == 32 & day == 2 & Stations_namn == "Asphällan")
subset(dat01, year == 2002 & week == 32 & day == 2 & Stations_namn == "Asphällan")

# Create a numeric length_group column by dropping the leading "X" from the old column name
dat01$length_group <- as.numeric(substring(dat01$length, 2))

# The n2 column tells us how many fish are in that size-group. We want one row
# per fish, so every row is repeated n2 times.
# seq_len() (rather than seq()) keeps this correct for a zero-row data frame.
dat01 <- dat01[rep(seq_len(nrow(dat01)), times = dat01$n2), ]
head(dat01, 50)

# Test it is correct:
df01 %>%
  filter(year == 2002 & week == 32 & day == 2 & Stations_namn == "Asphällan") %>% 
  arrange(n) %>% 
  as.data.frame()
#> filter: removed 596 rows (99%), 5 rows remaining

dat01 %>%
  filter(year == 2002 & week == 32 & day == 2 & Stations_namn == "Asphällan") %>% 
  arrange(n) %>% 
  dplyr::select(-YtTmp.I, -YtTmp.Upp, -Störning, -TmpBtnU, -Sektion_namn, -Sektion,
                -Areanamn, -Position_N, -Position_E, -Provfiske, -Redskap) %>% 
  as.data.frame()
#> filter: removed 5,732 rows (>99%), 24 rows remaining

Combine all FM (reference) data

# Stack the four FM periods on top of each other.
catch_FM <- rbind(
  dat83,
  dat87,
  dat91,
  dat01
)

# Keep only years after 1982 and before 2004.
catch_FM <- filter(catch_FM, year > 1982, year < 2004)
#> filter: no rows removed

#** Apply further filters ============================================================
# Remove disturbance
catch_FM <- catch_FM %>% filter(Störning == 0)
#> filter: removed 31 rows (<1%), 55,249 rows remaining

# Full data: number of rows (fish) per week, coloured by day, one panel per
# year with its own y-axis.
catch_FM %>%
  ggplot(aes(x = factor(week), fill = factor(day))) +
  geom_bar() +
  facet_wrap(vars(year), scales = "free_y") +
  scale_fill_brewer(name = "Weekday", palette = "Set1") +
  labs(title = "Reference area") +
  theme_classic(base_size = 12)


# Filter autumn fishing?
sort(unique(catch_FM$week))
#> [1] 30 31 32 33 34

# This is not necessary because the data only contains autumn fishing! 

# The BT data is filtered to year > 1986, so the same filter has to be applied here as well.
catch_FM <- catch_FM %>% filter(year > 1986)
#> filter: removed 9,455 rows (17%), 45,794 rows remaining

# All nets have the same effort
unique(catch_FM$effort)
#> [1] 1
unique(catch_FM$week)
#> [1] 32 30 31 33 34

Biotest (warm area)

# Need to set fileEncoding here, else error: "invalid multibyte string 18"
df <- read.csv("data/raw/Catch_data_BT09_140507.csv", sep = ";", fileEncoding = "latin1")

# Tidy data: keep rows with species code "ABBO" from before 2004, rename the
# Swedish column names that are used later to English, and drop the
# environmental columns that are not used in this script.
# The year column is called Årtal in the raw file (same name as in rename()).
df <- df %>%
  filter(Art == "ABBO", Årtal < 2004) %>%
  dplyr::rename(year = Årtal, week = Vecka, day = Dag, effort = Ansträngning,
                species = Art, weight = Vikt, n = Antal) %>%
  select(-c(Vtn.stånd, VindRiktn.I, VindSt.I, Vind_upp_rikn, VindSt.Upp, Ström_I_rikn,
            Ström_upp_rikn, Salthalt_I_yta, Salthalt_I_botten, Salthalt_upp_yta,
            Salthalt_upp_botten, Drift_i, Drift_u, Drift_dim, Siktdjup, Lufttryck_i,
            Lufttryck_upp, Sjuk_kontroll))
#> filter: removed 6,470 rows (81%), 1,543 rows remaining
#> select: dropped 18 variables (Vtn.stånd, VindRiktn.I, VindSt.I, Vind_upp_rikn, VindSt.Upp, …)

# Slightly cleaner but we have a problem: X1, X2... X9 are length.
# We would much rather prefer to have a column for each length, so that 1 row = observation.
# Luckily there is a super neat function in tidyr (part of the tidyverse) called "gather".
# What I want to do is:

# 1. Give the new column names I want to create, e.g. "length" 
# 2. Specify which columns are moved and shuffled in these columns.
# These will be the current columns for unique lengths. So check the str() and the column number using names() of the data again 

# I will move columns 13:121 = the ones starting at x8 to x116

# Every listed X-column becomes rows: the old column name is stored in
# `length` and the old cell value in `n2`. Length classes without a count (NA)
# are dropped. DOUBLE CHECK the result against the wide data!
dat <- df %>%
  gather(length, n2, c(X8, X9, X10, X11, X12, X13, X14, X15, X16, X17, X18, X19, X20,
                       X21, X22, X23, X24, X25, X26, X27, X28, X29, X30, X31, X32, X33,
                       X34, X35, X36, X37, X38, X39, X40, X41, X42, X43, X44, X45, X46,
                       X47, X48, X49, X50, X51, X52, X53, X54, X55, X56, X57, X58, X59,
                       X60, X61, X62, X63, X64, X65, X66, X67, X68, X69, X70, X71, X72,
                       X73, X74, X75, X76, X77, X78, X79, X80, X81, X82, X84, X85, X86,
                       X87, X88, X89, X90, X91, X93, X94, X96, X97, X98, X99, X101, X103,
                       X104, X106, X109, X111, X114, X116), na.rm = TRUE)
#> gather: reorganized (X8, X9, X10, X11, X12, …) into (length, n2) [was 1543x122, now 5427x27]

head(dat) # Now the data is in a long, tidy format. 
head(df)

# Test it is correct:
subset(df, year == 2003 & week == 19 & day == 4 & Stations_namn == "Malören")
subset(dat, year == 2003 & week == 19 & day == 4 & Stations_namn == "Malören")

# "length" is a character column (e.g. "X12"), not numeric. Create a numeric length_group column by dropping the leading "X"
dat$length_group <- as.numeric(substring(dat$length, 2))

# The n2 column tells us how many fish are in that size-group. We want one row
# per fish, so every row is repeated n2 times.
# seq_len() (rather than seq()) keeps this correct for a zero-row data frame.
dat <- dat[rep(seq_len(nrow(dat)), times = dat$n2), ]
head(dat, 50)

# Test it is correct:
df %>%
  filter(year == 2003 & week == 19 & day == 4 & Stations_namn == "Malören") %>% 
  arrange(n) %>% 
  as.data.frame()
#> filter: removed 1,539 rows (>99%), 4 rows remaining

dat %>%
  filter(year == 2003 & week == 19 & day == 4 & Stations_namn == "Malören") %>% 
  arrange(n) %>% 
  dplyr::select(-YtTmp.I, -YtTmp.Upp, -Störning, -TmpBtnU, -Sektion_namn, -Sektion,
                -Areanamn, -Position_N, -Position_E, -Provfiske, -Redskap) %>% 
  as.data.frame()
#> filter: removed 16,023 rows (>99%), 13 rows remaining

# Remove disturbance
# Disturbance codes:
# 2: Seal damage
# 3: Strong algal growth on the gears
# 4: Clogging by drifting algae.
# 9: Other reason. (Damage by boat traffic, other human interference etc.)

# Plot disturbance
dat %>% 
  filter(Störning > 1) %>% 
  ggplot(., aes(x = Störning)) +
  geom_histogram(bins = 10) +
  scale_x_continuous(breaks = c(2, 3, 4, 9))
#> filter: removed 14,768 rows (92%), 1,268 rows remaining


dat <- dat %>% filter(Störning == 0)
#> filter: removed 1,268 rows (8%), 14,768 rows remaining

# Overfishing effect:
# They put nets out many days in a row to get an overfishing effect.
# Catches decline after a few days. I don't want this effect!
# Here we plot which days are fished (fill) over the weeks that have been fished (x-axis). An "overfishing" effect
# could happen if a year is fished on many consecutive days. It's not clear here that this is the case.

# Full data
ggplot(dat, aes(factor(week), fill = factor(day))) +
  facet_wrap(~year, scales = "free") +
  geom_bar() 


# Filter autumn
dat %>% 
  filter(week > 35) %>% 
  ggplot(., aes(factor(week), fill = factor(day))) +
  facet_wrap(~year, scales = "free_y") +
  geom_bar()
#> filter: removed 3,696 rows (25%), 11,072 rows remaining


# Ok, 1984 is heavily fished all year actually.
# We'll remove it since it's very different from the rest (the filter below keeps
# only years after 1986). When that is removed, we
# can use Malin's for loop to select fishing days after week 40 (or any other week).
# The other two years with fishing before week 40 are 1996 & 2003, but that's week 30 so we
# don't believe it has a big effect.

dat <- dat %>% filter(year > 1986)
#> filter: removed 1,208 rows (8%), 13,560 rows remaining

# In some years they have been fishing almost every week. To not have an
# overfishing-effect, we use only the first part of the fishing that season (first
# day and first week)

# Filter autumn fishing
# By filtering week > 40 & week < 49 I know that I include all October fishing trips
dat_oct <- dat %>%
  filter(week > 40 & week < 49) # OCTOBER
#> filter: removed 5,324 rows (39%), 8,236 rows remaining

sort(unique(dat_oct$year))
#>  [1] 1987 1988 1989 1990 1991 1992 1993 1994 1995 1996 1997 1998 2001 2002 2003
sort(unique(dat_oct$week)) 
#> [1] 41 42 43 44 45 46 47
# Since this gives me v. 41 42 43 44 45 46 47 
# year -99 -00 is lost due to disturbance

# Look at the filtered data set: number of rows (fish) per week, coloured by
# day, one panel per year.
dat_oct %>%
  ggplot(aes(x = factor(week), fill = factor(day))) +
  geom_bar() +
  facet_wrap(vars(year)) +
  scale_fill_brewer(name = "Weekday", palette = "Set1") +
  labs(title = "Biotest lake") +
  theme_classic(base_size = 12)


# Rename data
catch_BT <- dat_oct

Merge areas and standardize length codes!

# Clean data to make it more readable: keep the same columns, in the same
# order, for both areas so the two data frames can be stacked with rbind().
# (The original call listed Station and length_group twice; each column is
# only selected once, so listing them once gives the same result.)
keep_cols <- c("Area", "Sektion", "Station", "year", "week", "day",
               "length_group", "Längdgr_std")

catch_BT <- catch_BT %>%
  select(all_of(keep_cols)) %>%
  as.data.frame()
#> select: dropped 20 variables (Provfiske, Areanamn, Sektion_namn, Stations_namn, Position_N, …)

catch_FM <- catch_FM %>%
  select(all_of(keep_cols)) %>%
  as.data.frame()
#> select: dropped 19 variables (Provfiske, Areanamn, Sektion_namn, Stations_namn, Position_N, …)

# Stack the two areas.
catch <- rbind(catch_BT, catch_FM)

# First split the data into two separate data frames, one with std 2 and one with std 3.
# Call them dat_std2 and dat_std3
dat_std2 <- catch %>%
  filter(Längdgr_std == 2)
#> filter: removed 6,670 rows (12%), 47,360 rows remaining

dat_std3 <- catch %>%
  filter(Längdgr_std == 3)
#> filter: removed 47,360 rows (88%), 6,670 rows remaining

# Convert std 3 to std 2
sort(unique(dat_std3$length_group))
#>  [1]  7  9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32
#> [26] 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47
# 7 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35
# 36 37 38 39 40 41 42 43 44 45 46 47

# Insert a new column, length_group2, holding the std 2 length code that
# corresponds to each std 3 length code in length_group.
# std3_code[i] is converted to std2_code[i]; each line of std2_code below is
# one std 2 class. Codes that are not in the table (and NA) become NA.
std3_code <- c(7, 9:48)
std2_code <- c(
  6,           # 7
  9, 9,        # 9-10
  11, 11, 11,  # 11-13
  14, 14,      # 14-15
  16, 16, 16,  # 16-18
  19, 19,      # 19-20
  21, 21, 21,  # 21-23
  24, 24,      # 24-25
  26, 26, 26,  # 26-28
  29, 29,      # 29-30
  31, 31, 31,  # 31-33
  34, 34,      # 34-35
  36, 36, 36,  # 36-38
  39, 39,      # 39-40
  41, 41, 41,  # 41-43
  44, 44,      # 44-45
  46, 46, 46   # 46-48
)
# Guard against the two vectors getting out of step when the table is edited.
stopifnot(length(std3_code) == length(std2_code))

dat_std3$length_group2 <- std2_code[match(dat_std3$length_group, std3_code)]

# Now compare actual values - Looks ok!
ggplot(dat_std3, aes(factor(length_group), factor(length_group2))) +
  geom_point(size = 2) 


# Remove length_group so that only std 2 is included
dat_std3 <- dat_std3 %>%
  select(-c(length_group))
#> select: dropped one variable (length_group)

# Rename length_group2 to length_group
dat_std3 <- dat_std3 %>%
  dplyr::rename(length_group = length_group2)

# Merge data frames
catch_full <- rbind(dat_std2, dat_std3)

# In order to get even length classes (with respect to the code) I gave a new code to std 2
# which represents the starting length in each interval, as opposed to the original
# which was the integer in the middle of 2.5 cm classes...
# E.g. if the interval is 37.6 - 40, the new code becomes 37.6.

sort(unique(catch_full$length_group))
#>  [1]  6  9 11 14 16 19 21 24 26 29 31 34 36 39 41 44 46
# 6  9 11 14 16 19 21 24 26 29 31 34 36 39 41 44 46

# Insert a new column, new_length_group, that replaces each std 2 length code
# with the value assigned to it in the table below (std2_code[i] becomes
# class_start[i]). Codes that are not in the table (and NA) become NA.
std2_code <- c(6, 9, 11, 14, 16, 19, 21, 24, 26,
               29, 31, 34, 36, 39, 41, 44, 46)
class_start <- c(5.1, 7.6, 10.1, 12.6, 15.1, 17.6, 20.1, 22.6, 25.1,
                 27.6, 30.1, 32.6, 35.1, 37.6, 40.1, 42.6, 45.1)
# Guard against the two vectors getting out of step when the table is edited.
stopifnot(length(std2_code) == length(class_start))

catch_full$new_length_group <- class_start[match(catch_full$length_group, std2_code)]

# Compare values
ggplot(catch_full, aes(length_group, new_length_group)) +
  geom_point(size = 2) +
  geom_abline(slope = 1, intercept = 0, color = "red")


# Remove length_group so that only new_length_group is included
catch_full <- catch_full %>%
  select(-c(length_group))
#> select: dropped one variable (length_group)

# Rename new_length_group to length_group
catch_full <- catch_full %>%
  dplyr::rename(length_group = new_length_group)

# Insert netID
sort(unique(catch_full$Station))
#>  [1]  1  2  3  4  5  6  7  8  9 10 11 12 13 14 15 16 17 18 19 21 22 23 24 25 26
#> [26] 27 28 29 30 31 32 33 34 35 36 37 38 39 40 42 43 44 45 46 47 48 49 50


t <- catch_full %>% 
  dplyr::select(Area, year, week, day, Sektion, Station)

unique(is.na(t))
#>       Area  year  week   day Sektion Station
#> 2857 FALSE FALSE FALSE FALSE   FALSE   FALSE

# Build two identifiers by joining columns with ".":
# netID  = Area, year and Station;
# netID2 = Area, year, week, day, Sektion and Station.
catch_full$netID <- with(catch_full, paste(Area, year, Station, sep = "."))
catch_full$netID2 <- with(
  catch_full,
  paste(Area, year, week, day, Sektion, Station, sep = ".")
)

length(unique(catch_full$netID))
#> [1] 256

Save data

write.csv(catch_full, "data/cleaned/catch_BT_FM_1987-2003.csv", row.names = FALSE)