Load libraries

# Load libraries (install first if needed)
library(tidyverse); theme_set(theme_classic(base_size = 12))
library(brms)
#> Warning: package 'Rcpp' was built under R version 4.0.5
library(nlstools)
library(viridis)
library(bayesplot)
library(tidylog)
library(tidybayes)
library(RColorBrewer)
library(patchwork)
library(modelr)

# Print package versions
sessionInfo()
#> R version 4.0.2 (2020-06-22)
#> Platform: x86_64-apple-darwin17.0 (64-bit)
#> Running under: macOS  10.16
#> 
#> Matrix products: default
#> BLAS:   /Library/Frameworks/R.framework/Versions/4.0/Resources/lib/libRblas.dylib
#> LAPACK: /Library/Frameworks/R.framework/Versions/4.0/Resources/lib/libRlapack.dylib
#> 
#> locale:
#> [1] en_US.UTF-8/en_US.UTF-8/en_US.UTF-8/C/en_US.UTF-8/en_US.UTF-8
#> 
#> attached base packages:
#> [1] stats     graphics  grDevices utils     datasets  methods   base     
#> 
#> other attached packages:
#>  [1] modelr_0.1.8       patchwork_1.1.1    RColorBrewer_1.1-2 tidybayes_3.0.1   
#>  [5] tidylog_1.0.2      bayesplot_1.7.2    viridis_0.5.1      viridisLite_0.4.0 
#>  [9] nlstools_1.0-2     brms_2.17.0        Rcpp_1.0.8         forcats_0.5.1     
#> [13] stringr_1.4.0      dplyr_1.0.7        purrr_0.3.4        readr_2.1.1       
#> [17] tidyr_1.1.4        tibble_3.1.5       ggplot2_3.3.5      tidyverse_1.3.1   
#> 
#> loaded via a namespace (and not attached):
#>   [1] readxl_1.3.1         backports_1.3.0      plyr_1.8.6          
#>   [4] igraph_1.2.5         svUnit_1.0.6         splines_4.0.2       
#>   [7] crosstalk_1.1.0.1    rstantools_2.1.1     inline_0.3.15       
#>  [10] digest_0.6.28        htmltools_0.5.2      rsconnect_0.8.16    
#>  [13] fansi_0.5.0          magrittr_2.0.1       checkmate_2.0.0     
#>  [16] tzdb_0.2.0           RcppParallel_5.1.4   matrixStats_0.61.0  
#>  [19] xts_0.12-0           prettyunits_1.1.1    colorspace_2.0-2    
#>  [22] rvest_1.0.2          ggdist_3.0.0         haven_2.3.1         
#>  [25] xfun_0.27            callr_3.7.0          crayon_1.4.2        
#>  [28] jsonlite_1.7.2       lme4_1.1-26          zoo_1.8-8           
#>  [31] glue_1.6.2           gtable_0.3.0         emmeans_1.5.0       
#>  [34] V8_3.2.0             distributional_0.2.2 pkgbuild_1.2.0      
#>  [37] rstan_2.21.2         abind_1.4-5          scales_1.1.1        
#>  [40] mvtnorm_1.1-3        DBI_1.1.1            miniUI_0.1.1.1      
#>  [43] xtable_1.8-4         clisymbols_1.2.0     stats4_4.0.2        
#>  [46] StanHeaders_2.21.0-7 DT_0.15              htmlwidgets_1.5.1   
#>  [49] httr_1.4.2           threejs_0.3.3        arrayhelpers_1.1-0  
#>  [52] posterior_1.1.0      ellipsis_0.3.2       pkgconfig_2.0.3     
#>  [55] loo_2.3.1            farver_2.1.0         sass_0.3.1          
#>  [58] dbplyr_2.1.1         utf8_1.2.2           tidyselect_1.1.1    
#>  [61] rlang_1.0.2          reshape2_1.4.4       later_1.3.0         
#>  [64] munsell_0.5.0        cellranger_1.1.0     tools_4.0.2         
#>  [67] cli_3.1.0            generics_0.1.2       broom_0.7.10        
#>  [70] ggridges_0.5.2       evaluate_0.14        fastmap_1.1.0       
#>  [73] yaml_2.2.1           processx_3.5.2       knitr_1.36          
#>  [76] fs_1.5.0             nlme_3.1-148         mime_0.12           
#>  [79] projpred_2.0.2       xml2_1.3.2           compiler_4.0.2      
#>  [82] shinythemes_1.1.2    rstudioapi_0.13      curl_4.3.2          
#>  [85] gamm4_0.2-6          reprex_2.0.1         statmod_1.4.36      
#>  [88] bslib_0.2.4          stringi_1.7.5        ps_1.5.0            
#>  [91] Brobdingnag_1.2-6    lattice_0.20-41      Matrix_1.3-4        
#>  [94] nloptr_1.2.2.2       markdown_1.1         shinyjs_1.1         
#>  [97] tensorA_0.36.2       vctrs_0.3.8          pillar_1.6.4        
#> [100] lifecycle_1.0.1      jquerylib_0.1.4      bridgesampling_1.0-0
#> [103] estimability_1.3     httpuv_1.5.5         R6_2.5.1            
#> [106] promises_1.1.1       gridExtra_2.3        codetools_0.2-16    
#> [109] boot_1.3-25          colourpicker_1.0     MASS_7.3-51.6       
#> [112] gtools_3.8.2         assertthat_0.2.1     withr_2.4.2         
#> [115] shinystan_2.5.0      mgcv_1.8-31          parallel_4.0.2      
#> [118] hms_1.1.1            grid_4.0.2           coda_0.19-4         
#> [121] minqa_1.2.4          rmarkdown_2.11       shiny_1.6.0         
#> [124] lubridate_1.8.0      base64enc_0.1-3      dygraphs_1.1.1.6

Read and add final touches to data

df <- read.csv("data/aged_catch_BT_FM_1987-2003.csv")

head(df)

# 1996 is a special year because stations changed. But since we don't do anything with station 
# we simply ignore that
# df %>%
#   group_by(year, Area, Station) %>% 
#   summarise(n = n()) %>% 
#   ggplot(., aes(year, n, fill = factor(Station))) +
#   geom_bar(stat = "identity") +
#   facet_wrap(~Area)

# df %>%
#   filter(Area == "BT") %>%
#   group_by(year, netID) %>%
#   summarise(n = n()) %>%
#   ggplot(., aes(year, n)) + geom_bar(stat="identity")

# df %>% group_by(Area) %>% summarise(mean_age = mean(age), sd_age = sd(age))
# df %>% filter(age > 2) %>% group_by(Area) %>% summarise(mean_age = mean(age), sd_age = sd(age))
# ggplot(df, aes(Area, age)) + geom_boxplot()
# df %>% filter(age > 2) %>% ggplot(., aes(Area, age)) + geom_boxplot()
# summary(lm(age~Area, data = df))
# summary(lm(age~Area, data = filter(df, age > 2)))

# Currently the data is in format 1 row, 1 ind. We want a column that has the CPUE
# by length class.

# How many nets in total per area and year? (For scaling with effort later)
df <- df %>% group_by(Area, year) %>%
  mutate(n_nets_year_test = length(unique(netID))) %>%
  mutate(n_nets_year = length(unique(netID2))) %>%
  ungroup() %>%
  as.data.frame()
#> group_by: 2 grouping variables (Area, year)
#> mutate (grouped): new variable 'n_nets_year_test' (integer) with 4 unique values and 0% NA
#> mutate (grouped): new variable 'n_nets_year' (integer) with 10 unique values and 0% NA
#> ungroup: no grouping variables

length(unique(filter(df, Area == "BT")$netID))
#> filter: removed 45,794 rows (85%), 8,236 rows remaining
#> [1] 118
length(unique(filter(df, Area == "FM")$netID))
#> filter: removed 8,236 rows (15%), 45,794 rows remaining
#> [1] 138

length(unique(filter(df, Area == "BT")$netID2))
#> filter: removed 45,794 rows (85%), 8,236 rows remaining
#> [1] 500
length(unique(filter(df, Area == "FM")$netID2))
#> filter: removed 8,236 rows (15%), 45,794 rows remaining
#> [1] 825

# We want the data as follows:
# Year, LngtClass, Age, Number

# Group by year, Area and age, summarize and get n() per length, year and area...
df2 <- df %>%
  group_by(year, Area, age) %>% 
  summarise(catch_n = n()) %>% # Get catch as a column, this is now the number of rows a given length occurs each year and area
  ungroup() %>% 
  mutate(effort_id = paste(year, Area, sep = ".")) %>% # For calculating the total effort that gave that catch (total for year and area) 
  as.data.frame()
#> group_by: 3 grouping variables (year, Area, age)
#> summarise: now 278 rows and 4 columns, 2 group variables remaining (year, Area)
#> ungroup: no grouping variables
#> mutate: new variable 'effort_id' (character) with 32 unique values and 0% NA

# Now we need to get the effort back in there
df_effort <- df %>%
  mutate(effort_id = paste(year, Area, sep = ".")) %>% 
  select(effort_id, n_nets_year_test, n_nets_year) %>% 
  distinct(effort_id, .keep_all = TRUE) %>% 
  as.data.frame()
#> mutate: new variable 'effort_id' (character) with 32 unique values and 0% NA
#> select: dropped 11 variables (Area, Sektion, Station, year, week, …)
#> distinct: removed 53,998 rows (>99%), 32 rows remaining

# Now do a left_join
df3 <- left_join(df2, df_effort, by = "effort_id") %>% as.data.frame()
#> left_join: added 2 columns (n_nets_year_test, n_nets_year)
#>            > rows only in x     0
#>            > rows only in y  (  0)
#>            > matched rows     278
#>            >                 =====
#>            > rows total       278

# Looks Ok!
df %>% filter(year == 1987 & Area == "FM")
#> filter: removed 52,160 rows (97%), 1,870 rows remaining
df %>% filter(year == 1987 & Area == "FM") %>% distinct(n_nets_year, .keep_all = TRUE)
#> filter: removed 52,160 rows (97%), 1,870 rows remaining
#> distinct: removed 1,869 rows (>99%), one row remaining
df2 %>% filter(effort_id == "1987.FM")
#> filter: removed 270 rows (97%), 8 rows remaining
df3 %>% filter(effort_id == "1987.FM")
#> filter: removed 270 rows (97%), 8 rows remaining

# Go from total catch to CPUE
df4 <- df3 %>% 
  ungroup() %>% 
  rename("area" = "Area") %>% 
  mutate(cpue_numbers_test = catch_n/n_nets_year_test,
         cpue_numbers = catch_n/n_nets_year) # Get numbers CPUE, divide by the previously create n_nets, which is # of unique net ID's in each area and year
#> ungroup: no grouping variables
#> rename: renamed one variable (area)
#> mutate: new variable 'cpue_numbers_test' (double) with 197 unique values and 0% NA
#>         new variable 'cpue_numbers' (double) with 225 unique values and 0% NA
   
head(df3, 30)
head(df4)

p1 <- df4 %>% 
  group_by(area, year) %>% 
  summarise(sum_cpue = sum(cpue_numbers)) %>% 
  ggplot(aes(year, sum_cpue, color = area)) +
  geom_point() + 
  stat_smooth() + 
  ggtitle("long ID")
#> group_by: 2 grouping variables (area, year)
#> summarise: now 32 rows and 3 columns, one group variable remaining (area)

p2 <- df4 %>% 
  group_by(area, year) %>% 
  summarise(sum_cpue = sum(cpue_numbers_test)) %>% 
  ggplot(aes(year, sum_cpue, color = area)) +
  geom_point() + 
  stat_smooth() +
  ggtitle("short ID")
#> group_by: 2 grouping variables (area, year)
#> summarise: now 32 rows and 3 columns, one group variable remaining (area)

p1/p2
#> `geom_smooth()` using method = 'loess' and formula 'y ~ x'
#> `geom_smooth()` using method = 'loess' and formula 'y ~ x'

      
# Calculate average cpue by year and area
df4

p1 <- df4 %>%
  group_by(year, area, age) %>% 
  summarise(mean_cpue = mean(cpue_numbers)) %>% 
  ggplot(aes(year, mean_cpue, fill = factor(age))) +
  geom_area() + 
  facet_wrap(~area) + 
  scale_fill_brewer(palette = "Set1") + 
  coord_cartesian(expand = 0) + 
  ggtitle("New ID")
#> group_by: 3 grouping variables (year, area, age)
#> summarise: now 278 rows and 4 columns, 2 group variables remaining (year, area)

p2 <- df4 %>%
  group_by(year, area, age) %>% 
  summarise(mean_cpue = mean(cpue_numbers_test)) %>% 
  ggplot(aes(year, mean_cpue, fill = factor(age))) +
  geom_area() + 
  facet_wrap(~area) + 
  scale_fill_brewer(palette = "Set1") + 
  coord_cartesian(expand = 0) +
  ggtitle("Old ID")
#> group_by: 3 grouping variables (year, area, age)
#> summarise: now 278 rows and 4 columns, 2 group variables remaining (year, area)

p1/p2


# Test I get 1 unique row per age, year and area
df4 %>%
  group_by(year, area, age) %>%
  summarise(n = n()) %>% 
  ungroup() %>% 
  distinct(n)
#> group_by: 3 grouping variables (year, area, age)
#> summarise: now 278 rows and 4 columns, 2 group variables remaining (year, area)
#> ungroup: no grouping variables
#> distinct: removed 277 rows (>99%), one row remaining

# Plot the log cpue as function of age to find the ages that correspond to the descending limb:
ggplot(df4, aes(factor(age), log(cpue_numbers), color = factor(year))) + 
  geom_point() +
  facet_wrap(~ area) +
  scale_color_viridis(discrete = TRUE) + 
  coord_cartesian(expand = 0) + 
  theme(aspect.ratio = 1)


# Let's use age 3 and older
ggplot(df4, aes(factor(age), log(cpue_numbers), color = factor(year))) + 
  geom_point() +
  facet_wrap(~ area) +
  scale_color_viridis(discrete = TRUE) + 
  coord_cartesian(expand = 0) + 
  theme(aspect.ratio = 1)


# Plot total catch
df2 %>%
  group_by(Area, year) %>%
  summarize(tot_catch = sum(catch_n)) %>% 
  ggplot(., aes(year, tot_catch)) + geom_bar(stat = "identity") +
  facet_wrap(~ Area)
#> group_by: 2 grouping variables (Area, year)
#> summarize: now 32 rows and 3 columns, one group variable remaining (Area)


# Plot cpue
df4 %>%
  group_by(area, year) %>%
  ggplot(., aes(year, cpue_numbers)) + geom_bar(stat = "identity") +
  facet_wrap(~ area)
#> group_by: 2 grouping variables (area, year)


# Edit variables
d <- df4 %>%
  mutate(log_cpue = log(cpue_numbers),
         area2 = ifelse(area == "BT", "Warm", "Cold"),
         cohort = year - age) 
#> mutate: new variable 'log_cpue' (double) with 225 unique values and 0% NA
#>         new variable 'area2' (character) with 2 unique values and 0% NA
#>         new variable 'cohort' (integer) with 25 unique values and 0% NA

# Remove cohorts before 1981
d <- d %>% filter(cohort > 1980, age > 2)
#> filter: removed 70 rows (25%), 208 rows remaining

sort(unique(d$cohort))
#>  [1] 1981 1982 1983 1984 1985 1986 1987 1988 1989 1990 1991 1992 1993 1994 1995
#> [16] 1996 1997 1998 1999 2000

max(d$year)
#> [1] 2003

Explore data

# Plot data
d %>% filter(area == "BT") %>% 
  ggplot(aes(age, log_cpue)) + 
  geom_point() + 
  stat_smooth(method = "lm", se = FALSE) + 
  facet_wrap(~ cohort)
#> filter: removed 113 rows (54%), 95 rows remaining
#> `geom_smooth()` using formula 'y ~ x'


d %>% filter(area == "FM") %>% 
  ggplot(aes(age, log_cpue)) + 
  geom_point() + 
  stat_smooth(method = "lm", se = FALSE) + 
  facet_wrap(~ cohort)
#> filter: removed 95 rows (46%), 113 rows remaining
#> `geom_smooth()` using formula 'y ~ x'


d %>% 
  ggplot(aes(age, cpue_numbers)) + 
  geom_point() + 
  stat_smooth(method = "lm", se = FALSE) + 
  facet_wrap(~ area)
#> `geom_smooth()` using formula 'y ~ x'


# Make age proportion data frame (not by year)
tot_cpue_area <- d %>%
  filter(age > 2) %>% 
  mutate(Area2 = ifelse(area == "BT", "Warm", "Cold")) %>% 
  group_by(Area2) %>% 
  summarise(sum_cpue = sum(cpue_numbers))
#> filter: no rows removed
#> mutate: new variable 'Area2' (character) with 2 unique values and 0% NA
#> group_by: one grouping variable (Area2)
#> summarise: now 2 rows and 2 columns, ungrouped

d_prop_age <- d %>%
  filter(age > 2) %>% 
  mutate(Area2 = ifelse(area == "BT", "Warm", "Cold")) %>% 
  group_by(Area2) %>% 
  mutate(sum_cpue = sum(cpue_numbers)) %>% 
  ungroup() %>% 
  group_by(Area2, age) %>% 
  summarise(sum_cpue_age = sum(cpue_numbers)) %>% 
  ungroup() 
#> filter: no rows removed
#> mutate: new variable 'Area2' (character) with 2 unique values and 0% NA
#> group_by: one grouping variable (Area2)
#> mutate (grouped): new variable 'sum_cpue' (double) with 2 unique values and 0% NA
#> ungroup: no grouping variables
#> group_by: 2 grouping variables (Area2, age)
#> summarise: now 14 rows and 3 columns, one group variable remaining (Area2)
#> ungroup: no grouping variables

d_prop_age %>% as.data.frame()

d_prop_age <- left_join(d_prop_age, tot_cpue_area) %>% 
  mutate(prop = sum_cpue_age / sum_cpue)
#> Joining, by = "Area2"
#> left_join: added one column (sum_cpue)
#>            > rows only in x    0
#>            > rows only in y  ( 0)
#>            > matched rows     14
#>            >                 ====
#>            > rows total       14
#> mutate: new variable 'prop' (double) with 14 unique values and 0% NA

pal <- rev(brewer.pal(n = 6, name = "Paired")[c(2, 6)])

p <- d_prop_age %>% 
  ggplot(aes(factor(age), prop, color = Area2)) +
  geom_point(stat = "identity") +
  scale_color_manual(values = rev(pal)) +
  theme_light() + 
  NULL

# Plot proportion by age

pp <- df %>% 
  filter(age > 2) %>% 
  mutate(Area2 = ifelse(Area == "BT", "Warm", "Cold")) %>% 
  group_by(Area2, age) %>% 
  summarise(n = n()) %>%
  mutate(freq = n / sum(n)) %>% 
  ggplot(aes(x = factor(age), y = freq, color = Area2, group = interaction(age, Area2))) +
  geom_point() +
  scale_color_manual(values = rev(pal)) +
  theme_light() + 
  labs(x = "Age", y = "Proportion") +
  guides(fill = "none") +
  theme(text = element_text(size = 12))
#> filter: removed 15,771 rows (29%), 38,259 rows remaining
#> mutate: new variable 'Area2' (character) with 2 unique values and 0% NA
#> group_by: 2 grouping variables (Area2, age)
#> summarise: now 14 rows and 3 columns, one group variable remaining (Area2)
#> mutate (grouped): new variable 'freq' (double) with 14 unique values and 0% NA

p / pp


df %>% 
  filter(age > 2) %>% 
  mutate(Area2 = ifelse(Area == "BT", "Warm", "Cold")) %>% 
  group_by(Area2, year, age) %>% 
  summarise(n = n()) %>%
  mutate(freq = n / sum(n)) %>% 
  ggplot(aes(x = factor(age), y = freq, color = Area2, group = interaction(age, Area2))) +
  geom_point(position = position_dodge(width = 0.8)) +
  geom_boxplot(position = position_dodge(width = 0.8), fill = NA) +
  scale_color_manual(values = rev(pal)) +
  theme_light() + 
  labs(x = "Age", y = "Proportion") +
  guides(fill = "none") +
  theme(text = element_text(size = 12))
#> filter: removed 15,771 rows (29%), 38,259 rows remaining
#> mutate: new variable 'Area2' (character) with 2 unique values and 0% NA
#> group_by: 3 grouping variables (Area2, year, age)
#> summarise: now 219 rows and 4 columns, 2 group variables remaining (Area2, year)
#> mutate (grouped): new variable 'freq' (double) with 214 unique values and 0% NA


ggsave("figures/supp/age_prop_diff.png", width = 3.5, height = 6.5, dpi = 600)

Save data

write.csv(d, "data/for_fitting/catch_curve_dat.csv")