Load libraries

# Load libraries (install first if needed)
library(tidyverse); theme_set(theme_classic(base_size = 12))
#> Warning: package 'tidyr' was built under R version 4.0.5
library(brms)
#> Warning: package 'Rcpp' was built under R version 4.0.5
library(nlstools)
library(viridis)
library(bayesplot)
library(tidylog)
library(tidybayes)
library(RColorBrewer)
#> Warning: package 'RColorBrewer' was built under R version 4.0.5
library(patchwork)
library(modelr)
library(janitor)

# Print package versions
sessionInfo() 
#> R version 4.0.2 (2020-06-22)
#> Platform: x86_64-apple-darwin17.0 (64-bit)
#> Running under: macOS  10.16
#> 
#> Matrix products: default
#> BLAS:   /Library/Frameworks/R.framework/Versions/4.0/Resources/lib/libRblas.dylib
#> LAPACK: /Library/Frameworks/R.framework/Versions/4.0/Resources/lib/libRlapack.dylib
#> 
#> locale:
#> [1] en_US.UTF-8/en_US.UTF-8/en_US.UTF-8/C/en_US.UTF-8/en_US.UTF-8
#> 
#> attached base packages:
#> [1] stats     graphics  grDevices utils     datasets  methods   base     
#> 
#> other attached packages:
#>  [1] janitor_2.0.1      modelr_0.1.8       patchwork_1.1.1    RColorBrewer_1.1-3
#>  [5] tidybayes_3.0.1    tidylog_1.0.2      bayesplot_1.7.2    viridis_0.5.1     
#>  [9] viridisLite_0.4.1  nlstools_1.0-2     brms_2.17.0        Rcpp_1.0.8        
#> [13] forcats_0.5.1      stringr_1.4.1      dplyr_1.0.10       purrr_0.3.4       
#> [17] readr_2.1.1        tidyr_1.2.0        tibble_3.1.8       ggplot2_3.3.6     
#> [21] tidyverse_1.3.2   
#> 
#> loaded via a namespace (and not attached):
#>   [1] readxl_1.3.1         backports_1.3.0      plyr_1.8.6          
#>   [4] igraph_1.2.5         svUnit_1.0.6         splines_4.0.2       
#>   [7] crosstalk_1.1.0.1    TH.data_1.1-1        rstantools_2.1.1    
#>  [10] inline_0.3.15        digest_0.6.30        htmltools_0.5.3     
#>  [13] rsconnect_0.8.16     fansi_1.0.3          magrittr_2.0.3      
#>  [16] checkmate_2.0.0      googlesheets4_1.0.0  tzdb_0.2.0          
#>  [19] RcppParallel_5.1.4   matrixStats_0.61.0   xts_0.12-0          
#>  [22] sandwich_3.0-2       prettyunits_1.1.1    colorspace_2.0-3    
#>  [25] rvest_1.0.3          ggdist_3.0.0         haven_2.5.1         
#>  [28] xfun_0.33            callr_3.7.2          crayon_1.4.2        
#>  [31] jsonlite_1.8.0       lme4_1.1-26          survival_3.1-12     
#>  [34] zoo_1.8-8            glue_1.6.2           gtable_0.3.1        
#>  [37] gargle_1.2.0         emmeans_1.5.0        V8_3.2.0            
#>  [40] distributional_0.2.2 pkgbuild_1.3.1       rstan_2.21.2        
#>  [43] abind_1.4-5          scales_1.2.1         mvtnorm_1.1-3       
#>  [46] DBI_1.1.1            miniUI_0.1.1.1       xtable_1.8-4        
#>  [49] clisymbols_1.2.0     stats4_4.0.2         StanHeaders_2.21.0-7
#>  [52] DT_0.15              htmlwidgets_1.5.4    httr_1.4.4          
#>  [55] threejs_0.3.3        arrayhelpers_1.1-0   posterior_1.1.0     
#>  [58] ellipsis_0.3.2       pkgconfig_2.0.3      loo_2.3.1           
#>  [61] farver_2.1.1         sass_0.4.2           dbplyr_2.1.1        
#>  [64] utf8_1.2.2           tidyselect_1.1.2     rlang_1.0.6         
#>  [67] reshape2_1.4.4       later_1.3.0          munsell_0.5.0       
#>  [70] cellranger_1.1.0     tools_4.0.2          cachem_1.0.6        
#>  [73] cli_3.4.1            generics_0.1.2       broom_1.0.1         
#>  [76] ggridges_0.5.2       evaluate_0.16        fastmap_1.1.0       
#>  [79] yaml_2.3.5           processx_3.7.0       knitr_1.40          
#>  [82] fs_1.5.2             nlme_3.1-148         projpred_2.0.2      
#>  [85] mime_0.12            xml2_1.3.3           compiler_4.0.2      
#>  [88] shinythemes_1.1.2    rstudioapi_0.14      gamm4_0.2-6         
#>  [91] curl_4.3.2           reprex_2.0.1         statmod_1.4.36      
#>  [94] bslib_0.4.0          stringi_1.7.8        ps_1.7.1            
#>  [97] Brobdingnag_1.2-6    lattice_0.20-41      Matrix_1.3-4        
#> [100] nloptr_1.2.2.2       markdown_1.1         shinyjs_1.1         
#> [103] tensorA_0.36.2       vctrs_0.5.0          pillar_1.8.1        
#> [106] lifecycle_1.0.3      jquerylib_0.1.4      bridgesampling_1.0-0
#> [109] estimability_1.3     httpuv_1.5.5         R6_2.5.1            
#> [112] promises_1.1.1       gridExtra_2.3        codetools_0.2-16    
#> [115] boot_1.3-25          colourpicker_1.0     MASS_7.3-51.6       
#> [118] gtools_3.8.2         assertthat_0.2.1     withr_2.5.0         
#> [121] shinystan_2.5.0      multcomp_1.4-20      mgcv_1.8-31         
#> [124] parallel_4.0.2       hms_1.1.1            grid_4.0.2          
#> [127] minqa_1.2.4          coda_0.19-4          snakecase_0.11.0    
#> [130] rmarkdown_2.16       googledrive_2.0.0    shiny_1.6.0         
#> [133] lubridate_1.8.0      base64enc_0.1-3      dygraphs_1.1.1.6

Read and add final touches to data

df <- read.csv("data/cleaned/size_at_age_BT_FM_1970-2004.csv", sep = ";")

# test
df %>% arrange(length)
df %>% arrange(desc(length))

max(df$length)
#> [1] 482.7

# Check out how many ("length") unique ID observations there are
length(unique(df$ID)) 
#> [1] 10188

# Remove gear 32, see vbge script
df <- df %>% filter(!gear == 32)
#> filter: removed 292 rows (1%), 52,143 rows remaining

# I need to make the ID completely unique (since I should include ID information in my
# mixed model), otherwise they may have the same ID in different areas.
# We solve this by writing ID with area + ID. The paste function pastes things together
df$ID <- paste(df$ID, df$area, sep = "")

# Check catch age corresponds to number of read length-at-ages!
df <- df %>%
  group_by(ID) %>%
  mutate(n = n()) %>%
  mutate(length = length/10) %>%
  rename("back_calc_age" = "age") %>% 
  ungroup()
#> group_by: one grouping variable (ID)
#> mutate (grouped): new variable 'n' (integer) with 9 unique values and 0% NA
#> mutate (grouped): changed 52,143 values (100%) of 'length' (0 new NA)
#> rename: renamed one variable (back_calc_age)
#> ungroup: no grouping variables

ggplot(df, aes(factor(catch_age), factor(n))) + geom_point()


# Remove individuals with the same ID (bug)
df <- df %>% filter(n <= catch_age)
#> filter: removed 3 rows (<1%), 52,140 rows remaining

# Now every individual has the same number of rows as it's age in years
ggplot(df, aes(factor(catch_age), factor(n))) + geom_point()


# Now I create a "wide" data frame, so that I can easily create new columns by taking
# one column minus another
df_wide <- df %>%
  filter(birth_year > 1980) %>%
  spread(back_calc_age, length) %>% # Here I convert values in the "age" column to my new columns!
  # The value in these columns is taken from the "length" column
  arrange(ID)
#> filter: removed 18,932 rows (36%), 33,208 rows remaining
#> spread: reorganized (back_calc_age, length) into (1, 2, 3, 4, 5, …) [was 33208x9, now 9145x16]

# Check a single ID
df %>% filter(ID == "1983127BT")
#> filter: removed 52,138 rows (>99%), 2 rows remaining
df_wide %>% filter(ID == "1983127BT")
#> filter: removed 9,144 rows (>99%), one row remaining

# Calculation of growth and geometric length for all ages
df_growth <- data.frame(df_wide %>%
                          mutate(G1 = 100 * (log(`2`) - log(`1`)),
                                 G2 = 100 * (log(`3`) - log(`2`)),
                                 G3 = 100 * (log(`4`) - log(`3`)),
                                 G4 = 100 * (log(`5`) - log(`4`)),
                                 G5 = 100 * (log(`6`) - log(`5`)),
                                 G6 = 100 * (log(`7`) - log(`6`)),
                                 G7 = 100 * (log(`8`) - log(`7`)),
                                 G8 = 100 * (log(`9`) - log(`8`)), # 9 is the max age in the data!
                                 L1 = `1`,
                                 L2 = `2`,
                                 L3 = `3`,
                                 L4 = `4`,
                                 L5 = `5`,
                                 L6 = `6`,
                                 L7 = `7`,
                                 L8 = `8`))
#> mutate: new variable 'G1' (double) with 2,898 unique values and 10% NA
#>         new variable 'G2' (double) with 2,971 unique values and 34% NA
#>         new variable 'G3' (double) with 2,399 unique values and 56% NA
#>         new variable 'G4' (double) with 1,749 unique values and 72% NA
#>         new variable 'G5' (double) with 1,229 unique values and 83% NA
#>         new variable 'G6' (double) with 782 unique values and 90% NA
#>         new variable 'G7' (double) with 485 unique values and 94% NA
#>         new variable 'G8' (double) with 192 unique values and 98% NA
#>         new variable 'L1' (double) with 778 unique values and 0% NA
#>         new variable 'L2' (double) with 716 unique values and 10% NA
#>         new variable 'L3' (double) with 564 unique values and 34% NA
#>         new variable 'L4' (double) with 417 unique values and 56% NA
#>         new variable 'L5' (double) with 313 unique values and 72% NA
#>         new variable 'L6' (double) with 278 unique values and 83% NA
#>         new variable 'L7' (double) with 231 unique values and 90% NA
#>         new variable 'L8' (double) with 184 unique values and 94% NA

# Check it went OK
df_wide %>% filter(ID == "1983127BT")
#> filter: removed 9,144 rows (>99%), one row remaining
df_growth %>% filter(ID == "1983127BT")
#> filter: removed 9,144 rows (>99%), one row remaining

# Now make the data "long" again (each observation is a row),
# so that we can plot growth ~ length
# We need to do it separately (length and growth) to avoid duplicates

# First subset the important columns... length and growth data separately
df_g <- df_growth %>% select(c(area, ID, catch_age, G1, G2, G3, G4, G5, G6, G7, G8))
#> select: dropped 21 variables (catch_year, birth_year, gear, n, X1, …)

df_l <- df_growth %>% select(c(area, ID, catch_age, L1, L2, L3, L4, L5, L6, L7, L8))
#> select: dropped 21 variables (catch_year, birth_year, gear, n, X1, …)

# Now we have two wide data frames... let's make them long using the gather function
# - separately - and then merge them
df_g_l <- df_g %>% gather(g_age, growth, 4:11) # columns 4-11 are gathered
#> gather: reorganized (G1, G2, G3, G4, G5, …) into (g_age, growth) [was 9145x11, now 73160x5]
df_l_l <- df_l %>% gather(g_length, length, 4:11)
#> gather: reorganized (L1, L2, L3, L4, L5, …) into (g_length, length) [was 9145x11, now 73160x5]

# Now join datasets. Before I do that I need to add a new common column ("age"), so that
# R knows which length to go with each growth. Check data frames again...
head(arrange(subset(df_g_l, catch_age == 3), ID), 5)
head(arrange(subset(df_l_l, catch_age == 3), ID), 5)

# Here I'm splitting the L1, G1 stuff so that I get G and 1, because then the numbers
# will match in the two datasets, and I make sure the correct length is matched with growth
df_g_l <- df_g_l %>% separate(g_age, c("g", "back_calc_age"), sep = 1) %>% arrange(ID)
df_l_l <- df_l_l %>% separate(g_length, c("g", "back_calc_age"), sep = 1) %>% arrange(ID)

# Now I'll do a left_join to add in length to growth data. Match by ID and age
df_l_l_subset <- df_l_l %>% select(ID, back_calc_age, length)
#> select: dropped 3 variables (area, catch_age, g)
df_all <- left_join(df_g_l, df_l_l_subset, by = c("ID", "back_calc_age"))
#> left_join: added one column (length)
#>            > rows only in x        0
#>            > rows only in y  (     0)
#>            > matched rows     73,160
#>            >                 ========
#>            > rows total       73,160

# Check it went OK
df_growth %>% filter(ID == "1983127BT")
#> filter: removed 9,144 rows (>99%), one row remaining
df_all %>% filter(ID == "1983127BT")
#> filter: removed 73,152 rows (>99%), 8 rows remaining

# Remove NA growth and length individuals from this analysis. Calculate other things as well
# centering and squaring etc
dfm <- df_all %>%
  drop_na(growth) %>%
  filter(growth > 0) %>%
  drop_na(length) %>% 
  mutate(log_length = log(length),
         log_growth = log(growth),
         log_length_ct = log_length - mean(log_length),
         log_length_ct_sq = log_length_ct*log_length_ct) %>% 
  separate(ID, c("catch_year", "ID2"), sep = 4) %>% 
  mutate_at(c("catch_year", "catch_age", "back_calc_age"), as.numeric) %>% 
  mutate(birth_year = catch_year - catch_age,
         ID = paste(catch_year, ID2, sep = "")) %>% 
  select(-g) %>% 
  filter(birth_year < 1998) %>%
  filter(catch_year < 2003) %>% 
  mutate_at(c("ID", "ID2", "area"), as.factor)
#> drop_na: removed 49,097 rows (67%), 24,063 rows remaining
#> filter: removed one row (<1%), 24,062 rows remaining
#> drop_na: no rows removed
#> mutate: new variable 'log_length' (double) with 1,743 unique values and 0% NA
#>         new variable 'log_growth' (double) with 9,183 unique values and 0% NA
#>         new variable 'log_length_ct' (double) with 1,743 unique values and 0% NA
#>         new variable 'log_length_ct_sq' (double) with 1,743 unique values and 0% NA
#> mutate_at: converted 'catch_year' from character to double (0 new NA)
#>            converted 'catch_age' from integer to double (0 new NA)
#>            converted 'back_calc_age' from character to double (0 new NA)
#> mutate: new variable 'birth_year' (double) with 22 unique values and 0% NA
#>         new variable 'ID' (character) with 8,243 unique values and 0% NA
#> select: dropped one variable (g)
#> filter: removed 1,841 rows (8%), 22,221 rows remaining
#> filter: removed 545 rows (2%), 21,676 rows remaining
#> mutate_at: converted 'area' from character to factor (0 new NA)
#>            converted 'ID2' from character to factor (0 new NA)
#>            converted 'ID' from character to factor (0 new NA)

Exploratory plots

# Plot distribution of catch years
dfm %>% 
  group_by(catch_age, catch_year, area) %>% 
  summarise(n = n()) %>% 
  ungroup() %>% 
  ggplot(., aes(factor(catch_age), n, fill = area)) +
  geom_bar(stat = "identity") + 
  facet_wrap(~catch_year, scales = "free")
#> group_by: 3 grouping variables (catch_age, catch_year, area)
#> summarise: now 237 rows and 4 columns, 2 group variables remaining (catch_age, catch_year)
#> ungroup: no grouping variables


# Filter to have at least 4 data points per individual
dfm_3 <- dfm %>% filter(catch_age > 3)
#> filter: removed 5,028 rows (23%), 16,648 rows remaining
dfm_4 <- dfm %>% filter(catch_age > 4)
#> filter: removed 9,015 rows (42%), 12,661 rows remaining

# Plot distribution of data
p3 <- ggplot(dfm_3, aes(x = log_growth)) + geom_density()
p4 <- ggplot(dfm_4, aes(x = log_growth)) + geom_density()

p3 + p4


dfm <- dfm %>% filter(catch_age > 4)
#> filter: removed 9,015 rows (42%), 12,661 rows remaining

dfm %>% 
  group_by(catch_age, catch_year, area) %>% 
  summarise(n = n()) %>% 
  ungroup() %>% 
  ggplot(., aes(factor(catch_age), n, fill = area)) +
  geom_bar(stat = "identity") + 
  facet_wrap(~catch_year, scales = "free")
#> group_by: 3 grouping variables (catch_age, catch_year, area)
#> summarise: now 140 rows and 4 columns, 2 group variables remaining (catch_age, catch_year)
#> ungroup: no grouping variables


# Plot sample size per individual
min(dfm$catch_age)
#> [1] 5

dfm %>% 
  group_by(ID) %>% 
  mutate(n = n()) %>% 
  ungroup() %>% 
  ggplot(., aes(n)) +
  geom_histogram() 
#> group_by: one grouping variable (ID)
#> mutate (grouped): new variable 'n' (integer) with 6 unique values and 0% NA
#> ungroup: no grouping variables
#> `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.


# Check relationship between catch age and # of back-calculated ages
dfm %>% 
  group_by(ID) %>%
  mutate(n = n()) %>%
  ggplot(., aes(n, catch_age)) +
  geom_point() +
  geom_abline(intercept = 0, slope = 1) +
  geom_abline(intercept = 1, slope = 1, col = "red")
#> group_by: one grouping variable (ID)
#> mutate (grouped): new variable 'n' (integer) with 6 unique values and 0% NA


# Some data without growth, hence log 0
dfm <- dfm %>% group_by(ID) %>% mutate(n = n()) %>% mutate(test = catch_age - n) %>% ungroup()
#> group_by: one grouping variable (ID)
#> mutate (grouped): new variable 'n' (integer) with 6 unique values and 0% NA
#> mutate (grouped): new variable 'test' (double) with 2 unique values and 0% NA
#> ungroup: no grouping variables

# Remove these
dfm <- dfm %>% filter(test == 1)
#> filter: removed 3 rows (<1%), 12,658 rows remaining

# Create area 2 variable
dfm <- dfm %>%
  mutate(area2 = ifelse(area == "BT", "Warm", "Cold"))
#> mutate: new variable 'area2' (character) with 2 unique values and 0% NA

# Plot again
dfm %>% 
  group_by(catch_age, catch_year, area) %>% 
  summarise(n = n()) %>% 
  ungroup() %>% 
  ggplot(., aes(factor(catch_age), n, fill = area)) +
  geom_bar(stat = "identity") + 
  facet_wrap(~catch_year, scales = "free")
#> group_by: 3 grouping variables (catch_age, catch_year, area)
#> summarise: now 140 rows and 4 columns, 2 group variables remaining (catch_age, catch_year)
#> ungroup: no grouping variables


# And now again but with sample size per age by catch year
# No catch data in FM in 1986
dfm %>% 
  group_by(back_calc_age, catch_year, area) %>% 
  summarise(n = n()) %>% 
  ungroup() %>% 
  ggplot(., aes(factor(back_calc_age), n, fill = area)) +
  geom_bar(stat = "identity") + 
  facet_wrap(~catch_year)
#> group_by: 3 grouping variables (back_calc_age, catch_year, area)
#> summarise: now 239 rows and 4 columns, 2 group variables remaining (back_calc_age, catch_year)
#> ungroup: no grouping variables


# Same but with birth year
dfm %>% 
  group_by(back_calc_age, birth_year, area) %>% 
  summarise(n = n()) %>% 
  ungroup() %>% 
  ggplot(., aes(factor(back_calc_age), n, fill = area)) +
  geom_bar(stat = "identity") + 
  facet_wrap(~birth_year)
#> group_by: 3 grouping variables (back_calc_age, birth_year, area)
#> summarise: now 244 rows and 4 columns, 2 group variables remaining (back_calc_age, birth_year)
#> ungroup: no grouping variables


# Calculate samples sizes
dfm %>% group_by(area) %>% summarise(n = n())
#> group_by: one grouping variable (area)
#> summarise: now 2 rows and 2 columns, ungrouped
dfm %>% group_by(area) %>% distinct(ID) %>% summarise(n = n())
#> group_by: one grouping variable (area)
#> distinct (grouped): removed 10,232 rows (81%), 2,426 rows remaining
#> summarise: now 2 rows and 2 columns, ungrouped

# Average number of data points per individual?
dfm %>% group_by(ID) %>% summarise(n = n()) %>% ungroup() %>% summarize(mean_n = mean(n))
#> group_by: one grouping variable (ID)
#> summarise: now 2,426 rows and 2 columns, ungrouped
#> ungroup: no grouping variables
#> summarize: now one row and one column, ungrouped

# Now plot full data.
dfm %>% 
  ggplot(., aes(x = log_length, y = log_growth, color = ID)) +
  facet_wrap(~area) +
  geom_point(size = 1, alpha = 0.2) +
  geom_line(size = 1, alpha = 0.2) +
  scale_color_viridis(discrete = T, direction = -1) + 
  guides(color = "none") +
  NULL


# Check unique years
dfm %>% group_by(area, catch_year) %>% summarise(catch_year = (unique(catch_year))) %>% data.frame()
#> group_by: 2 grouping variables (area, catch_year)
#> summarise: now 33 rows and 2 columns, one group variable remaining (area)

# Check sample size:
nrow(dfm)
#> [1] 12658
length(unique(dfm$ID))
#> [1] 2426

nrow(dfm) / length(unique(dfm$ID))
#> [1] 5.217642

max(dfm$catch_year)
#> [1] 2002

# Add dummy variable}
# Follow the method in VBGE and use dummy coding
bt <- filter(dfm, area == "BT")
#> filter: removed 10,320 rows (82%), 2,338 rows remaining
fm <- filter(dfm, area == "FM")
#> filter: removed 2,338 rows (18%), 10,320 rows remaining

dfm_dummy <- data.frame(rbind(cbind(bt, areaW=1, areaC=0), cbind(fm, areaW=0, areaC=1)))

# Plot non-linear relationship
ggplot(dfm_dummy, aes(length, growth, color = area2)) +
  geom_point()

Save data

 write.csv(dfm_dummy, "data/for_fitting/growth_scaling_dat.csv")