Load libraries
# Load libraries (install first if needed)
library(tidyverse); theme_set(theme_classic(base_size = 12))
#> Warning: package 'tidyr' was built under R version 4.0.5
library(brms)
#> Warning: package 'Rcpp' was built under R version 4.0.5
library(nlstools)
library(viridis)
library(bayesplot)
library(tidylog)
library(tidybayes)
library(RColorBrewer)
#> Warning: package 'RColorBrewer' was built under R version 4.0.5
library(patchwork)
library(modelr)
library(janitor)
# Print package versions
sessionInfo()
#> R version 4.0.2 (2020-06-22)
#> Platform: x86_64-apple-darwin17.0 (64-bit)
#> Running under: macOS 10.16
#>
#> Matrix products: default
#> BLAS: /Library/Frameworks/R.framework/Versions/4.0/Resources/lib/libRblas.dylib
#> LAPACK: /Library/Frameworks/R.framework/Versions/4.0/Resources/lib/libRlapack.dylib
#>
#> locale:
#> [1] en_US.UTF-8/en_US.UTF-8/en_US.UTF-8/C/en_US.UTF-8/en_US.UTF-8
#>
#> attached base packages:
#> [1] stats graphics grDevices utils datasets methods base
#>
#> other attached packages:
#> [1] janitor_2.0.1 modelr_0.1.8 patchwork_1.1.1 RColorBrewer_1.1-3
#> [5] tidybayes_3.0.1 tidylog_1.0.2 bayesplot_1.7.2 viridis_0.5.1
#> [9] viridisLite_0.4.1 nlstools_1.0-2 brms_2.17.0 Rcpp_1.0.8
#> [13] forcats_0.5.1 stringr_1.4.1 dplyr_1.0.10 purrr_0.3.4
#> [17] readr_2.1.1 tidyr_1.2.0 tibble_3.1.8 ggplot2_3.3.6
#> [21] tidyverse_1.3.2
#>
#> loaded via a namespace (and not attached):
#> [1] readxl_1.3.1 backports_1.3.0 plyr_1.8.6
#> [4] igraph_1.2.5 svUnit_1.0.6 splines_4.0.2
#> [7] crosstalk_1.1.0.1 TH.data_1.1-1 rstantools_2.1.1
#> [10] inline_0.3.15 digest_0.6.30 htmltools_0.5.3
#> [13] rsconnect_0.8.16 fansi_1.0.3 magrittr_2.0.3
#> [16] checkmate_2.0.0 googlesheets4_1.0.0 tzdb_0.2.0
#> [19] RcppParallel_5.1.4 matrixStats_0.61.0 xts_0.12-0
#> [22] sandwich_3.0-2 prettyunits_1.1.1 colorspace_2.0-3
#> [25] rvest_1.0.3 ggdist_3.0.0 haven_2.5.1
#> [28] xfun_0.33 callr_3.7.2 crayon_1.4.2
#> [31] jsonlite_1.8.0 lme4_1.1-26 survival_3.1-12
#> [34] zoo_1.8-8 glue_1.6.2 gtable_0.3.1
#> [37] gargle_1.2.0 emmeans_1.5.0 V8_3.2.0
#> [40] distributional_0.2.2 pkgbuild_1.3.1 rstan_2.21.2
#> [43] abind_1.4-5 scales_1.2.1 mvtnorm_1.1-3
#> [46] DBI_1.1.1 miniUI_0.1.1.1 xtable_1.8-4
#> [49] clisymbols_1.2.0 stats4_4.0.2 StanHeaders_2.21.0-7
#> [52] DT_0.15 htmlwidgets_1.5.4 httr_1.4.4
#> [55] threejs_0.3.3 arrayhelpers_1.1-0 posterior_1.1.0
#> [58] ellipsis_0.3.2 pkgconfig_2.0.3 loo_2.3.1
#> [61] farver_2.1.1 sass_0.4.2 dbplyr_2.1.1
#> [64] utf8_1.2.2 tidyselect_1.1.2 rlang_1.0.6
#> [67] reshape2_1.4.4 later_1.3.0 munsell_0.5.0
#> [70] cellranger_1.1.0 tools_4.0.2 cachem_1.0.6
#> [73] cli_3.4.1 generics_0.1.2 broom_1.0.1
#> [76] ggridges_0.5.2 evaluate_0.16 fastmap_1.1.0
#> [79] yaml_2.3.5 processx_3.7.0 knitr_1.40
#> [82] fs_1.5.2 nlme_3.1-148 projpred_2.0.2
#> [85] mime_0.12 xml2_1.3.3 compiler_4.0.2
#> [88] shinythemes_1.1.2 rstudioapi_0.14 gamm4_0.2-6
#> [91] curl_4.3.2 reprex_2.0.1 statmod_1.4.36
#> [94] bslib_0.4.0 stringi_1.7.8 ps_1.7.1
#> [97] Brobdingnag_1.2-6 lattice_0.20-41 Matrix_1.3-4
#> [100] nloptr_1.2.2.2 markdown_1.1 shinyjs_1.1
#> [103] tensorA_0.36.2 vctrs_0.5.0 pillar_1.8.1
#> [106] lifecycle_1.0.3 jquerylib_0.1.4 bridgesampling_1.0-0
#> [109] estimability_1.3 httpuv_1.5.5 R6_2.5.1
#> [112] promises_1.1.1 gridExtra_2.3 codetools_0.2-16
#> [115] boot_1.3-25 colourpicker_1.0 MASS_7.3-51.6
#> [118] gtools_3.8.2 assertthat_0.2.1 withr_2.5.0
#> [121] shinystan_2.5.0 multcomp_1.4-20 mgcv_1.8-31
#> [124] parallel_4.0.2 hms_1.1.1 grid_4.0.2
#> [127] minqa_1.2.4 coda_0.19-4 snakecase_0.11.0
#> [130] rmarkdown_2.16 googledrive_2.0.0 shiny_1.6.0
#> [133] lubridate_1.8.0 base64enc_0.1-3 dygraphs_1.1.1.6
Read and add final touches to data
df <- read.csv("data/cleaned/size_at_age_BT_FM_1970-2004.csv", sep = ";")
# test
df %>% arrange(length)
df %>% arrange(desc(length))
max(df$length)
#> [1] 482.7
# Check out how many ("length") unique ID observations there are
length(unique(df$ID))
#> [1] 10188
# Remove gear 32, see vbge script
df <- df %>% filter(!gear == 32)
#> filter: removed 292 rows (1%), 52,143 rows remaining
# I need to make the ID completely unique (since I should include ID information in my
# mixed model), otherwise they may have the same ID in different areas.
# We solve this by writing ID with area + ID. The paste function pastes things together
df$ID <- paste(df$ID, df$area, sep = "")
# Check catch age corresponds to number of read length-at-ages!
df <- df %>%
group_by(ID) %>%
mutate(n = n()) %>%
mutate(length = length/10) %>%
rename("back_calc_age" = "age") %>%
ungroup()
#> group_by: one grouping variable (ID)
#> mutate (grouped): new variable 'n' (integer) with 9 unique values and 0% NA
#> mutate (grouped): changed 52,143 values (100%) of 'length' (0 new NA)
#> rename: renamed one variable (back_calc_age)
#> ungroup: no grouping variables
ggplot(df, aes(factor(catch_age), factor(n))) + geom_point()

# Remove individuals with the same ID (bug)
df <- df %>% filter(n <= catch_age)
#> filter: removed 3 rows (<1%), 52,140 rows remaining
# Now every individual has the same number of rows as it's age in years
ggplot(df, aes(factor(catch_age), factor(n))) + geom_point()

# Now I create a "wide" data frame, so that I can easily create new columns by taking
# one column minus another
df_wide <- df %>%
filter(birth_year > 1980) %>%
spread(back_calc_age, length) %>% # Here I convert values in the "age" column to my new columns!
# The value in these columns is taken from the "length" column
arrange(ID)
#> filter: removed 18,932 rows (36%), 33,208 rows remaining
#> spread: reorganized (back_calc_age, length) into (1, 2, 3, 4, 5, …) [was 33208x9, now 9145x16]
# Check a single ID
df %>% filter(ID == "1983127BT")
#> filter: removed 52,138 rows (>99%), 2 rows remaining
df_wide %>% filter(ID == "1983127BT")
#> filter: removed 9,144 rows (>99%), one row remaining
# Calculation of growth and geometric length for all ages
df_growth <- data.frame(df_wide %>%
mutate(G1 = 100 * (log(`2`) - log(`1`)),
G2 = 100 * (log(`3`) - log(`2`)),
G3 = 100 * (log(`4`) - log(`3`)),
G4 = 100 * (log(`5`) - log(`4`)),
G5 = 100 * (log(`6`) - log(`5`)),
G6 = 100 * (log(`7`) - log(`6`)),
G7 = 100 * (log(`8`) - log(`7`)),
G8 = 100 * (log(`9`) - log(`8`)), # 9 is the max age in the data!
L1 = `1`,
L2 = `2`,
L3 = `3`,
L4 = `4`,
L5 = `5`,
L6 = `6`,
L7 = `7`,
L8 = `8`))
#> mutate: new variable 'G1' (double) with 2,898 unique values and 10% NA
#> new variable 'G2' (double) with 2,971 unique values and 34% NA
#> new variable 'G3' (double) with 2,399 unique values and 56% NA
#> new variable 'G4' (double) with 1,749 unique values and 72% NA
#> new variable 'G5' (double) with 1,229 unique values and 83% NA
#> new variable 'G6' (double) with 782 unique values and 90% NA
#> new variable 'G7' (double) with 485 unique values and 94% NA
#> new variable 'G8' (double) with 192 unique values and 98% NA
#> new variable 'L1' (double) with 778 unique values and 0% NA
#> new variable 'L2' (double) with 716 unique values and 10% NA
#> new variable 'L3' (double) with 564 unique values and 34% NA
#> new variable 'L4' (double) with 417 unique values and 56% NA
#> new variable 'L5' (double) with 313 unique values and 72% NA
#> new variable 'L6' (double) with 278 unique values and 83% NA
#> new variable 'L7' (double) with 231 unique values and 90% NA
#> new variable 'L8' (double) with 184 unique values and 94% NA
# Check it went OK
df_wide %>% filter(ID == "1983127BT")
#> filter: removed 9,144 rows (>99%), one row remaining
df_growth %>% filter(ID == "1983127BT")
#> filter: removed 9,144 rows (>99%), one row remaining
# Now make the data "long" again (each observation is a row),
# so that we can plot growth ~ length
# We need to do it separately (length and growth) to avoid duplicates
# First subset the important columns... length and growth data separately
df_g <- df_growth %>% select(c(area, ID, catch_age, G1, G2, G3, G4, G5, G6, G7, G8))
#> select: dropped 21 variables (catch_year, birth_year, gear, n, X1, …)
df_l <- df_growth %>% select(c(area, ID, catch_age, L1, L2, L3, L4, L5, L6, L7, L8))
#> select: dropped 21 variables (catch_year, birth_year, gear, n, X1, …)
# Now we have two wide data frames... let's make them long using the gather function
# - separately - and then merge them
df_g_l <- df_g %>% gather(g_age, growth, 4:11) # columns 4-11 are gathered
#> gather: reorganized (G1, G2, G3, G4, G5, …) into (g_age, growth) [was 9145x11, now 73160x5]
df_l_l <- df_l %>% gather(g_length, length, 4:11)
#> gather: reorganized (L1, L2, L3, L4, L5, …) into (g_length, length) [was 9145x11, now 73160x5]
# Now join datasets. Before I do that I need to add a new common column ("age"), so that
# R knows which length to go with each growth. Check data frames again...
head(arrange(subset(df_g_l, catch_age == 3), ID), 5)
head(arrange(subset(df_l_l, catch_age == 3), ID), 5)
# Here I'm splitting the L1, G1 stuff so that I get G and 1, because then the numbers
# will match in the two datasets, and I make sure the correct length is matched with growth
df_g_l <- df_g_l %>% separate(g_age, c("g", "back_calc_age"), sep = 1) %>% arrange(ID)
df_l_l <- df_l_l %>% separate(g_length, c("g", "back_calc_age"), sep = 1) %>% arrange(ID)
# Now I'll do a left_join to add in length to growth data. Match by ID and age
df_l_l_subset <- df_l_l %>% select(ID, back_calc_age, length)
#> select: dropped 3 variables (area, catch_age, g)
df_all <- left_join(df_g_l, df_l_l_subset, by = c("ID", "back_calc_age"))
#> left_join: added one column (length)
#> > rows only in x 0
#> > rows only in y ( 0)
#> > matched rows 73,160
#> > ========
#> > rows total 73,160
# Check it went OK
df_growth %>% filter(ID == "1983127BT")
#> filter: removed 9,144 rows (>99%), one row remaining
df_all %>% filter(ID == "1983127BT")
#> filter: removed 73,152 rows (>99%), 8 rows remaining
# Remove NA growth and length individuals from this analysis. Calculate other things as well
# centering and squaring etc
dfm <- df_all %>%
drop_na(growth) %>%
filter(growth > 0) %>%
drop_na(length) %>%
mutate(log_length = log(length),
log_growth = log(growth),
log_length_ct = log_length - mean(log_length),
log_length_ct_sq = log_length_ct*log_length_ct) %>%
separate(ID, c("catch_year", "ID2"), sep = 4) %>%
mutate_at(c("catch_year", "catch_age", "back_calc_age"), as.numeric) %>%
mutate(birth_year = catch_year - catch_age,
ID = paste(catch_year, ID2, sep = "")) %>%
select(-g) %>%
filter(birth_year < 1998) %>%
filter(catch_year < 2003) %>%
mutate_at(c("ID", "ID2", "area"), as.factor)
#> drop_na: removed 49,097 rows (67%), 24,063 rows remaining
#> filter: removed one row (<1%), 24,062 rows remaining
#> drop_na: no rows removed
#> mutate: new variable 'log_length' (double) with 1,743 unique values and 0% NA
#> new variable 'log_growth' (double) with 9,183 unique values and 0% NA
#> new variable 'log_length_ct' (double) with 1,743 unique values and 0% NA
#> new variable 'log_length_ct_sq' (double) with 1,743 unique values and 0% NA
#> mutate_at: converted 'catch_year' from character to double (0 new NA)
#> converted 'catch_age' from integer to double (0 new NA)
#> converted 'back_calc_age' from character to double (0 new NA)
#> mutate: new variable 'birth_year' (double) with 22 unique values and 0% NA
#> new variable 'ID' (character) with 8,243 unique values and 0% NA
#> select: dropped one variable (g)
#> filter: removed 1,841 rows (8%), 22,221 rows remaining
#> filter: removed 545 rows (2%), 21,676 rows remaining
#> mutate_at: converted 'area' from character to factor (0 new NA)
#> converted 'ID2' from character to factor (0 new NA)
#> converted 'ID' from character to factor (0 new NA)
Exploratory plots
# Plot distribution of catch years
dfm %>%
group_by(catch_age, catch_year, area) %>%
summarise(n = n()) %>%
ungroup() %>%
ggplot(., aes(factor(catch_age), n, fill = area)) +
geom_bar(stat = "identity") +
facet_wrap(~catch_year, scales = "free")
#> group_by: 3 grouping variables (catch_age, catch_year, area)
#> summarise: now 237 rows and 4 columns, 2 group variables remaining (catch_age, catch_year)
#> ungroup: no grouping variables

# Filter to have at least 4 data points per individual
dfm_3 <- dfm %>% filter(catch_age > 3)
#> filter: removed 5,028 rows (23%), 16,648 rows remaining
dfm_4 <- dfm %>% filter(catch_age > 4)
#> filter: removed 9,015 rows (42%), 12,661 rows remaining
# Plot distribution of data
p3 <- ggplot(dfm_3, aes(x = log_growth)) + geom_density()
p4 <- ggplot(dfm_4, aes(x = log_growth)) + geom_density()
p3 + p4

dfm <- dfm %>% filter(catch_age > 4)
#> filter: removed 9,015 rows (42%), 12,661 rows remaining
dfm %>%
group_by(catch_age, catch_year, area) %>%
summarise(n = n()) %>%
ungroup() %>%
ggplot(., aes(factor(catch_age), n, fill = area)) +
geom_bar(stat = "identity") +
facet_wrap(~catch_year, scales = "free")
#> group_by: 3 grouping variables (catch_age, catch_year, area)
#> summarise: now 140 rows and 4 columns, 2 group variables remaining (catch_age, catch_year)
#> ungroup: no grouping variables

# Plot sample size per individual
min(dfm$catch_age)
#> [1] 5
dfm %>%
group_by(ID) %>%
mutate(n = n()) %>%
ungroup() %>%
ggplot(., aes(n)) +
geom_histogram()
#> group_by: one grouping variable (ID)
#> mutate (grouped): new variable 'n' (integer) with 6 unique values and 0% NA
#> ungroup: no grouping variables
#> `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.

# Check relationship between catch age and # of back-calculated ages
dfm %>%
group_by(ID) %>%
mutate(n = n()) %>%
ggplot(., aes(n, catch_age)) +
geom_point() +
geom_abline(intercept = 0, slope = 1) +
geom_abline(intercept = 1, slope = 1, col = "red")
#> group_by: one grouping variable (ID)
#> mutate (grouped): new variable 'n' (integer) with 6 unique values and 0% NA

# Some data without growth, hence log 0
dfm <- dfm %>% group_by(ID) %>% mutate(n = n()) %>% mutate(test = catch_age - n) %>% ungroup()
#> group_by: one grouping variable (ID)
#> mutate (grouped): new variable 'n' (integer) with 6 unique values and 0% NA
#> mutate (grouped): new variable 'test' (double) with 2 unique values and 0% NA
#> ungroup: no grouping variables
# Remove these
dfm <- dfm %>% filter(test == 1)
#> filter: removed 3 rows (<1%), 12,658 rows remaining
# Create area 2 variable
dfm <- dfm %>%
mutate(area2 = ifelse(area == "BT", "Warm", "Cold"))
#> mutate: new variable 'area2' (character) with 2 unique values and 0% NA
# Plot again
dfm %>%
group_by(catch_age, catch_year, area) %>%
summarise(n = n()) %>%
ungroup() %>%
ggplot(., aes(factor(catch_age), n, fill = area)) +
geom_bar(stat = "identity") +
facet_wrap(~catch_year, scales = "free")
#> group_by: 3 grouping variables (catch_age, catch_year, area)
#> summarise: now 140 rows and 4 columns, 2 group variables remaining (catch_age, catch_year)
#> ungroup: no grouping variables

# And now again but with sample size per age by catch year
# No catch data in FM in 1986
dfm %>%
group_by(back_calc_age, catch_year, area) %>%
summarise(n = n()) %>%
ungroup() %>%
ggplot(., aes(factor(back_calc_age), n, fill = area)) +
geom_bar(stat = "identity") +
facet_wrap(~catch_year)
#> group_by: 3 grouping variables (back_calc_age, catch_year, area)
#> summarise: now 239 rows and 4 columns, 2 group variables remaining (back_calc_age, catch_year)
#> ungroup: no grouping variables

# Same but with birth year
dfm %>%
group_by(back_calc_age, birth_year, area) %>%
summarise(n = n()) %>%
ungroup() %>%
ggplot(., aes(factor(back_calc_age), n, fill = area)) +
geom_bar(stat = "identity") +
facet_wrap(~birth_year)
#> group_by: 3 grouping variables (back_calc_age, birth_year, area)
#> summarise: now 244 rows and 4 columns, 2 group variables remaining (back_calc_age, birth_year)
#> ungroup: no grouping variables

# Calculate samples sizes
dfm %>% group_by(area) %>% summarise(n = n())
#> group_by: one grouping variable (area)
#> summarise: now 2 rows and 2 columns, ungrouped
dfm %>% group_by(area) %>% distinct(ID) %>% summarise(n = n())
#> group_by: one grouping variable (area)
#> distinct (grouped): removed 10,232 rows (81%), 2,426 rows remaining
#> summarise: now 2 rows and 2 columns, ungrouped
# Average number of data points per individual?
dfm %>% group_by(ID) %>% summarise(n = n()) %>% ungroup() %>% summarize(mean_n = mean(n))
#> group_by: one grouping variable (ID)
#> summarise: now 2,426 rows and 2 columns, ungrouped
#> ungroup: no grouping variables
#> summarize: now one row and one column, ungrouped
# Now plot full data.
dfm %>%
ggplot(., aes(x = log_length, y = log_growth, color = ID)) +
facet_wrap(~area) +
geom_point(size = 1, alpha = 0.2) +
geom_line(size = 1, alpha = 0.2) +
scale_color_viridis(discrete = T, direction = -1) +
guides(color = "none") +
NULL

# Check unique years
dfm %>% group_by(area, catch_year) %>% summarise(catch_year = (unique(catch_year))) %>% data.frame()
#> group_by: 2 grouping variables (area, catch_year)
#> summarise: now 33 rows and 2 columns, one group variable remaining (area)
# Check sample size:
nrow(dfm)
#> [1] 12658
length(unique(dfm$ID))
#> [1] 2426
nrow(dfm) / length(unique(dfm$ID))
#> [1] 5.217642
max(dfm$catch_year)
#> [1] 2002
# Add dummy variable}
# Follow the method in VBGE and use dummy coding
bt <- filter(dfm, area == "BT")
#> filter: removed 10,320 rows (82%), 2,338 rows remaining
fm <- filter(dfm, area == "FM")
#> filter: removed 2,338 rows (18%), 10,320 rows remaining
dfm_dummy <- data.frame(rbind(cbind(bt, areaW=1, areaC=0), cbind(fm, areaW=0, areaC=1)))
# Plot non-linear relationship
ggplot(dfm_dummy, aes(length, growth, color = area2)) +
geom_point()

Save data
write.csv(dfm_dummy, "data/for_fitting/growth_scaling_dat.csv")