Read in the catch data, then clean, filter and process it. The FM data, unlike the BT data, comes in several data files which correspond to the tabs in the spreadsheet FM09_alla_grundutrdag_combinedbyAG_190916. They all have different layouts, so I will clean them separately…
Load libraries
library(tidyverse)
#> ── Attaching packages ─────────────────────────────────────── tidyverse 1.3.2 ──
#> ✔ ggplot2 3.3.6 ✔ purrr 0.3.4
#> ✔ tibble 3.1.8 ✔ dplyr 1.0.10
#> ✔ tidyr 1.2.0 ✔ stringr 1.4.1
#> ✔ readr 2.1.1 ✔ forcats 0.5.1
#> Warning: package 'tidyr' was built under R version 4.0.5
#> ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
#> ✖ dplyr::filter() masks stats::filter()
#> ✖ dplyr::lag() masks stats::lag()
library(tidylog)
#>
#> Attaching package: 'tidylog'
#>
#> The following objects are masked from 'package:dplyr':
#>
#> add_count, add_tally, anti_join, count, distinct, distinct_all,
#> distinct_at, distinct_if, filter, filter_all, filter_at, filter_if,
#> full_join, group_by, group_by_all, group_by_at, group_by_if,
#> inner_join, left_join, mutate, mutate_all, mutate_at, mutate_if,
#> relocate, rename, rename_all, rename_at, rename_if, rename_with,
#> right_join, sample_frac, sample_n, select, select_all, select_at,
#> select_if, semi_join, slice, slice_head, slice_max, slice_min,
#> slice_sample, slice_tail, summarise, summarise_all, summarise_at,
#> summarise_if, summarize, summarize_all, summarize_at, summarize_if,
#> tally, top_frac, top_n, transmute, transmute_all, transmute_at,
#> transmute_if, ungroup
#>
#> The following objects are masked from 'package:tidyr':
#>
#> drop_na, fill, gather, pivot_longer, pivot_wider, replace_na,
#> spread, uncount
#>
#> The following object is masked from 'package:stats':
#>
#> filter
library(RColorBrewer)
#> Warning: package 'RColorBrewer' was built under R version 4.0.5
library(patchwork)
# Print the R version, platform and attached/loaded package versions of this
# session so the run can be reproduced.
sessionInfo()
#> R version 4.0.2 (2020-06-22)
#> Platform: x86_64-apple-darwin17.0 (64-bit)
#> Running under: macOS 10.16
#>
#> Matrix products: default
#> BLAS: /Library/Frameworks/R.framework/Versions/4.0/Resources/lib/libRblas.dylib
#> LAPACK: /Library/Frameworks/R.framework/Versions/4.0/Resources/lib/libRlapack.dylib
#>
#> locale:
#> [1] en_US.UTF-8/en_US.UTF-8/en_US.UTF-8/C/en_US.UTF-8/en_US.UTF-8
#>
#> attached base packages:
#> [1] stats graphics grDevices utils datasets methods base
#>
#> other attached packages:
#> [1] patchwork_1.1.1 RColorBrewer_1.1-3 tidylog_1.0.2 forcats_0.5.1
#> [5] stringr_1.4.1 dplyr_1.0.10 purrr_0.3.4 readr_2.1.1
#> [9] tidyr_1.2.0 tibble_3.1.8 ggplot2_3.3.6 tidyverse_1.3.2
#>
#> loaded via a namespace (and not attached):
#> [1] Rcpp_1.0.8 lubridate_1.8.0 clisymbols_1.2.0
#> [4] assertthat_0.2.1 digest_0.6.30 utf8_1.2.2
#> [7] R6_2.5.1 cellranger_1.1.0 backports_1.3.0
#> [10] reprex_2.0.1 evaluate_0.16 httr_1.4.4
#> [13] pillar_1.8.1 rlang_1.0.6 googlesheets4_1.0.0
#> [16] readxl_1.3.1 rstudioapi_0.14 jquerylib_0.1.4
#> [19] rmarkdown_2.16 googledrive_2.0.0 munsell_0.5.0
#> [22] broom_1.0.1 compiler_4.0.2 modelr_0.1.8
#> [25] xfun_0.33 pkgconfig_2.0.3 htmltools_0.5.3
#> [28] tidyselect_1.1.2 fansi_1.0.3 crayon_1.4.2
#> [31] tzdb_0.2.0 dbplyr_2.1.1 withr_2.5.0
#> [34] grid_4.0.2 jsonlite_1.8.0 gtable_0.3.1
#> [37] lifecycle_1.0.3 DBI_1.1.1 magrittr_2.0.3
#> [40] scales_1.2.1 cli_3.4.1 stringi_1.7.8
#> [43] cachem_1.0.6 fs_1.5.2 xml2_1.3.3
#> [46] bslib_0.4.0 ellipsis_0.3.2 generics_0.1.2
#> [49] vctrs_0.5.0 tools_4.0.2 glue_1.6.2
#> [52] hms_1.1.1 fastmap_1.1.0 yaml_2.3.5
#> [55] colorspace_2.0-3 gargle_1.2.0 rvest_1.0.3
#> [58] knitr_1.40 haven_2.5.1 sass_0.4.2
# Read the 1983-86 FM09 catch sheet (semicolon-separated).
# Need to set fileEncoding here, else error: "invalid multibyte string 18"
df83 <- read.csv(
  "data/raw/Catch_data_FM09__1983-86_190916.csv",
  sep = ";",
  fileEncoding = "latin1"
)
# Tidy data: keep rows with species code "ABBO" and year before 2004, give the
# key columns English names and remove unnecessary columns.
# filter() runs before rename(), so it has to use the original column name
# `Årtal` (the column that rename() turns into `year`).
df83 <- df83 %>%
  filter(Art == "ABBO", Årtal < 2004) %>%
  rename(
    year = Årtal, week = Vecka, day = Dag, effort = Ansträngning,
    species = Art, weight = Vikt, n = Antal
  ) %>%
  select(-c(
    Vtn.stånd, VindRiktn.I, VindSt.I, Vind_upp_rikn, VindSt.Upp, Ström_I_rikn,
    Ström_upp_rikn, Salthalt_I_yta, Salthalt_I_botten, Salthalt_upp_yta,
    Salthalt_upp_botten, Drift_i, Drift_u, Drift_dim, Siktdjup, Lufttryck_i,
    Lufttryck_upp, Sjuk_kontroll, X..
  ))
#> filter: removed 987 rows (74%), 345 rows remaining
#> rename: renamed 7 variables (year, week, day, effort, species, …)
#> select: dropped 19 variables (Vtn.stånd, VindRiktn.I, VindSt.I, Vind_upp_rikn, VindSt.Upp, …)
# The length classes are stored as separate columns (X1 ... X14). We would much
# rather have a single `length` column, so that 1 row = 1 observation.
# gather() moves the old column name into `length` and the cell value into
# `n2`; rows whose cell was NA are dropped.
dat83 <- df83 %>%
  gather(
    length, n2,
    c(X1, X2, X3, X4, X5, X6, X7, X8, X9, X10, X11, X12, X13, X14),
    na.rm = TRUE
  )
#> gather: reorganized (X1, X2, X3, X4, X5, …) into (length, n2) [was 345x38, now 952x26]
head(df83)
head(dat83) # Now the data is in a long, tidy format.
# Test it is correct: look at the same fishing occasion before and after
subset(df83, year == 1983 & week == 30 & day == 2 & Stations_namn == "Asphällan")
subset(dat83, year == 1983 & week == 30 & day == 2 & Stations_namn == "Asphällan")
# Create a numeric version of "length" by stripping the leading "X" from the
# old column name
dat83$length_group <- as.numeric(substring(dat83$length, 2))
head(dat83)
str(dat83)
#> 'data.frame': 952 obs. of 27 variables:
#> $ Provfiske : chr "FM09S" "FM09S" "FM09S" "FM09S" ...
#> $ Area : chr "FM" "FM" "FM" "FM" ...
#> $ Areanamn : chr "Forsmark" "Forsmark" "Forsmark" "Forsmark" ...
#> $ Sektion : int 1 1 1 1 1 1 1 1 1 1 ...
#> $ Sektion_namn : chr "Syd Biotestsjön" "Syd Biotestsjön" "Syd Biotestsjön" "Syd Biotestsjön" ...
#> $ Station : int 31 31 31 31 31 31 31 31 31 31 ...
#> $ Stations_namn: chr "Asphällan" "Asphällan" "Asphällan" "Asphällan" ...
#> $ Position_N : chr "60 24,59" "60 24,59" "60 24,59" "60 24,59" ...
#> $ Position_E : chr "18 11,88" "18 11,88" "18 11,88" "18 11,88" ...
#> $ Redskap : int 9 9 9 9 9 9 9 9 9 9 ...
#> $ year : int 1983 1983 1983 1983 1983 1983 1983 1983 1983 1983 ...
#> $ week : int 30 30 30 31 31 31 31 32 32 32 ...
#> $ day : int 2 2 5 3 3 5 5 2 2 4 ...
#> $ YtTmp.I : logi NA NA NA NA NA NA ...
#> $ YtTmp.Upp : logi NA NA NA NA NA NA ...
#> $ effort : int 1 1 1 1 1 1 1 1 1 1 ...
#> $ Nr : int 1 2 2 1 2 1 2 1 2 1 ...
#> $ Störning : int 0 0 0 0 0 0 0 0 0 0 ...
#> $ TmpBtnI : chr "17,50" "17,80" "18,80" "17,60" ...
#> $ TmpBtnU : chr "17,30" "17,30" "18,30" "17,30" ...
#> $ species : chr "ABBO" "ABBO" "ABBO" "ABBO" ...
#> $ weight : logi NA NA NA NA NA NA ...
#> $ Längdgr_std : int 1 1 1 1 1 1 1 1 1 1 ...
#> $ n : int 8 53 19 2 50 1 46 19 84 10 ...
#> $ length : chr "X2" "X2" "X2" "X2" ...
#> $ n2 : int 1 16 1 2 17 1 20 4 41 5 ...
#> $ length_group : num 2 2 2 2 2 2 2 2 2 2 ...
# Our n2 column tells us how many fish are in that size-group. We want one row
# for each observation, so we repeat every row n2 times.
# seq_len() is used instead of seq() so that a data frame with zero rows gives
# an empty index rather than c(1, 0).
dat83 <- dat83[rep(seq_len(nrow(dat83)), dat83$n2), ]
head(dat83, 50)
# Test it is correct: print one fishing occasion from the wide data ...
df83 %>%
  filter(year == 1983, week == 30, day == 2, Stations_namn == "Asphällan") %>%
  arrange(n) %>%
  as.data.frame()
#> filter: removed 343 rows (99%), 2 rows remaining
# ... and the same occasion from the expanded long data. A handful of columns
# are dropped only to make the printout narrower.
dat83 %>%
  filter(year == 1983, week == 30, day == 2, Stations_namn == "Asphällan") %>%
  arrange(n) %>%
  dplyr::select(-c(
    YtTmp.I, YtTmp.Upp, Störning, TmpBtnU, Sektion_namn, Sektion,
    Areanamn, Position_N, Position_E, Provfiske, Redskap
  )) %>%
  as.data.frame()
#> filter: removed 9,413 rows (99%), 61 rows remaining
# Read the 1987-90 FM09 catch sheet (semicolon-separated, latin1-encoded).
df87 <- read.csv(
  "data/raw/Catch_data_FM09__1987-90_190916.csv",
  sep = ";",
  fileEncoding = "latin1"
)
# Tidy data: keep rows with species code "ABBO" and year before 2004, give the
# key columns English names and remove unnecessary columns.
# filter() runs before rename(), so it has to use the original column name
# `Årtal` (the column that rename() turns into `year`).
df87 <- df87 %>%
  filter(Art == "ABBO", Årtal < 2004) %>%
  rename(
    year = Årtal, week = Vecka, day = Dag, effort = Ansträngning,
    species = Art, weight = Vikt, n = Antal
  ) %>%
  select(-c(
    Vtn.stånd, VindRiktn.I, VindSt.I, Vind_upp_rikn, VindSt.Upp, Ström_I_rikn,
    Ström_upp_rikn, Salthalt_I_yta, Salthalt_I_botten, Salthalt_upp_yta,
    Salthalt_upp_botten, Drift_i, Drift_u, Drift_dim, Siktdjup, Lufttryck_i,
    Lufttryck_upp, Sjuk_kontroll, X..
  ))
#> filter: removed 1,114 rows (75%), 366 rows remaining
#> rename: renamed 7 variables (year, week, day, effort, species, …)
#> select: dropped 19 variables (Vtn.stånd, VindRiktn.I, VindSt.I, Vind_upp_rikn, VindSt.Upp, …)
# Convert to long data frame.
# The X-columns become rows: the old column name goes into `length` and the old
# cell value goes into the new column `n2`. Rows whose cell was NA are dropped.
dat87 <- df87 %>%
  gather(
    length, n2,
    c(
      X6, X9, X11, X14, X16, X19, X21, X24, X26, X29, X31, X34, X36, X39,
      X41, X44, X46, X49, X51, X56, X59, X61, X64, X71
    ),
    na.rm = TRUE
  )
#> gather: reorganized (X6, X9, X11, X14, X16, …) into (length, n2) [was 366x48, now 1384x26]
# Test it is correct: look at the same fishing occasion before and after
subset(df87, year == 1987 & week == 32 & day == 2 & Stations_namn == "Asphällan")
subset(dat87, year == 1987 & week == 32 & day == 2 & Stations_namn == "Asphällan")
# Create a numeric version of "length" by stripping the leading "X" from the
# old column name
dat87$length_group <- as.numeric(substring(dat87$length, 2))
# Our n2 column tells us how many fish are in that size-group. We want one row
# for each observation, so we repeat every row n2 times.
# seq_len() is used instead of seq() so that a data frame with zero rows gives
# an empty index rather than c(1, 0).
dat87 <- dat87[rep(seq_len(nrow(dat87)), dat87$n2), ]
head(dat87, 50)
# Test it is correct: print one fishing occasion from the wide data ...
df87 %>%
  filter(year == 1987, week == 32, day == 2, Stations_namn == "Asphällan") %>%
  arrange(n) %>%
  as.data.frame()
#> filter: removed 364 rows (99%), 2 rows remaining
# ... and the same occasion from the expanded long data. A handful of columns
# are dropped only to make the printout narrower.
dat87 %>%
  filter(year == 1987, week == 32, day == 2, Stations_namn == "Asphällan") %>%
  arrange(n) %>%
  dplyr::select(-c(
    YtTmp.I, YtTmp.Upp, Störning, TmpBtnU, Sektion_namn, Sektion,
    Areanamn, Position_N, Position_E, Provfiske, Redskap
  )) %>%
  as.data.frame()
#> filter: removed 8,174 rows (99%), 52 rows remaining
# Read the 1991-00 FM09 catch sheet (semicolon-separated, latin1-encoded).
df91 <- read.csv(
  "data/raw/Catch_data_FM09__1991-00_190916.csv",
  sep = ";",
  fileEncoding = "latin1"
)
# Tidy data: keep rows with species code "ABBO" and year before 2004, give the
# key columns English names and remove unnecessary columns.
# filter() runs before rename(), so it has to use the original column name
# `Årtal` (the column that rename() turns into `year`).
# Unlike the other FM sheets, no `X..` column is dropped here.
df91 <- df91 %>%
  filter(Art == "ABBO", Årtal < 2004) %>%
  rename(
    year = Årtal, week = Vecka, day = Dag, effort = Ansträngning,
    species = Art, weight = Vikt, n = Antal
  ) %>%
  select(-c(
    Vtn.stånd, VindRiktn.I, VindSt.I, Vind_upp_rikn, VindSt.Upp, Ström_I_rikn,
    Ström_upp_rikn, Salthalt_I_yta, Salthalt_I_botten, Salthalt_upp_yta,
    Salthalt_upp_botten, Drift_i, Drift_u, Drift_dim, Siktdjup, Lufttryck_i,
    Lufttryck_upp, Sjuk_kontroll
  ))
#> filter: removed 2,032 rows (81%), 478 rows remaining
#> rename: renamed 7 variables (year, week, day, effort, species, …)
#> select: dropped 18 variables (Vtn.stånd, VindRiktn.I, VindSt.I, Vind_upp_rikn, VindSt.Upp, …)
# Go from wide to long format: the old column name goes into `length`, the old
# cell value into `n2`. Rows whose cell was NA are dropped.
dat91 <- df91 %>%
  gather(
    length, n2,
    c(
      X9, X11, X14, X16, X19, X21, X24, X26, X29, X31, X34, X36, X39, X41, X44,
      X46, X49, X51, X54, X56, X59, X61, X66, X69, X71, X76, X81, X101
    ),
    na.rm = TRUE
  )
#> gather: reorganized (X9, X11, X14, X16, X19, …) into (length, n2) [was 478x52, now 3185x26]
# Test it is correct: look at the same fishing occasion before and after
subset(df91, year == 1992 & week == 32 & day == 2 & Stations_namn == "Asphällan")
subset(dat91, year == 1992 & week == 32 & day == 2 & Stations_namn == "Asphällan")
# Create a numeric version of "length" by stripping the leading "X" from the
# old column name
dat91$length_group <- as.numeric(substring(dat91$length, 2))
# Our n2 column tells us how many fish are in that size-group. We want one row
# for each observation, so we repeat every row n2 times.
# seq_len() is used instead of seq() so that a data frame with zero rows gives
# an empty index rather than c(1, 0).
dat91 <- dat91[rep(seq_len(nrow(dat91)), dat91$n2), ]
head(dat91, 50)
# Test it is correct: print one fishing occasion from the wide data ...
df91 %>%
  filter(year == 1992, week == 32, day == 2, Stations_namn == "Asphällan") %>%
  arrange(n) %>%
  as.data.frame()
#> filter: removed 477 rows (>99%), one row remaining
# ... and the same occasion from the expanded long data. A handful of columns
# are dropped only to make the printout narrower.
dat91 %>%
  filter(year == 1992, week == 32, day == 2, Stations_namn == "Asphällan") %>%
  arrange(n) %>%
  dplyr::select(-c(
    YtTmp.I, YtTmp.Upp, Störning, TmpBtnU, Sektion_namn, Sektion,
    Areanamn, Position_N, Position_E, Provfiske, Redskap
  )) %>%
  as.data.frame()
#> filter: removed 31,725 rows (>99%), 99 rows remaining
# Read the 2001-06 FM09 catch sheet (semicolon-separated, latin1-encoded).
df01 <- read.csv(
  "data/raw/Catch_data_FM09__2001-06_190916.csv",
  sep = ";",
  fileEncoding = "latin1"
)
# Tidy data: keep rows with species code "ABBO" and year before 2004, give the
# key columns English names and remove unnecessary columns.
# filter() runs before rename(), so it has to use the original column name
# `Årtal` (the column that rename() turns into `year`).
df01 <- df01 %>%
  filter(Art == "ABBO", Årtal < 2004) %>%
  rename(
    year = Årtal, week = Vecka, day = Dag, effort = Ansträngning,
    species = Art, weight = Vikt, n = Antal
  ) %>%
  select(-c(
    Vtn.stånd, VindRiktn.I, VindSt.I, Vind_upp_rikn, VindSt.Upp, Ström_I_rikn,
    Ström_upp_rikn, Salthalt_I_yta, Salthalt_I_botten, Salthalt_upp_yta,
    Salthalt_upp_botten, Drift_i, Drift_u, Drift_dim, Siktdjup, Lufttryck_i,
    Lufttryck_upp, Sjuk_kontroll, X..
  ))
#> filter: removed 2,293 rows (79%), 601 rows remaining
#> rename: renamed 7 variables (year, week, day, effort, species, …)
#> select: dropped 19 variables (Vtn.stånd, VindRiktn.I, VindSt.I, Vind_upp_rikn, VindSt.Upp, …)
# Go from wide to long format: the old column name goes into `length`, the old
# cell value into `n2`. Rows whose cell was NA are dropped.
dat01 <- df01 %>%
  gather(
    length, n2,
    c(
      X7, X9, X10, X11, X12, X13, X14, X15, X16, X17, X18, X19, X20, X21, X22,
      X23, X24, X25, X26, X27, X28, X29, X30, X31, X32, X33, X34, X35, X36,
      X37, X38, X39, X40, X41, X42, X43, X44, X45, X46, X47, X48, X49, X50,
      X51, X52, X55, X62, X65, X80, X83, X90
    ),
    na.rm = TRUE
  )
#> gather: reorganized (X7, X9, X10, X11, X12, …) into (length, n2) [was 601x75, now 2189x26]
# Test it is correct: look at the same fishing occasion before and after
subset(df01, year == 2002 & week == 32 & day == 2 & Stations_namn == "Asphällan")
subset(dat01, year == 2002 & week == 32 & day == 2 & Stations_namn == "Asphällan")
# Create a numeric version of "length" by stripping the leading "X" from the
# old column name
dat01$length_group <- as.numeric(substring(dat01$length, 2))
# Our n2 column tells us how many fish are in that size-group. We want one row
# for each observation, so we repeat every row n2 times.
# seq_len() is used instead of seq() so that a data frame with zero rows gives
# an empty index rather than c(1, 0).
dat01 <- dat01[rep(seq_len(nrow(dat01)), dat01$n2), ]
head(dat01, 50)
# Test it is correct: print one fishing occasion from the wide data ...
df01 %>%
  filter(year == 2002, week == 32, day == 2, Stations_namn == "Asphällan") %>%
  arrange(n) %>%
  as.data.frame()
#> filter: removed 596 rows (99%), 5 rows remaining
# ... and the same occasion from the expanded long data. A handful of columns
# are dropped only to make the printout narrower.
dat01 %>%
  filter(year == 2002, week == 32, day == 2, Stations_namn == "Asphällan") %>%
  arrange(n) %>%
  dplyr::select(-c(
    YtTmp.I, YtTmp.Upp, Störning, TmpBtnU, Sektion_namn, Sektion,
    Areanamn, Position_N, Position_E, Provfiske, Redskap
  )) %>%
  as.data.frame()
#> filter: removed 5,732 rows (>99%), 24 rows remaining
Combine all FM (reference) data
# Stack the four expanded FM data sets on top of each other
catch_FM <- rbind(dat83, dat87, dat91, dat01)
# Keep the years 1983-2003 (this removes nothing, see the message below)
catch_FM <- filter(catch_FM, year > 1982, year < 2004)
#> filter: no rows removed
#** Apply further filters ============================================================
# Remove disturbance: keep only rows where the disturbance code is 0
catch_FM <- filter(catch_FM, Störning == 0)
#> filter: removed 31 rows (<1%), 55,249 rows remaining
# Full data: number of rows per fishing week (x-axis), coloured by weekday,
# with one panel per year
ggplot(
  data = catch_FM,
  mapping = aes(x = factor(week), fill = factor(day))
) +
  geom_bar() +
  facet_wrap(~year, scales = "free_y") +
  scale_fill_brewer(name = "Weekday", palette = "Set1") +
  theme_classic(base_size = 12) +
  ggtitle("Reference area")
# Filter autumn fishing? Check which weeks are present first.
sort(unique(catch_FM[["week"]]))
#> [1] 30 31 32 33 34
# This is not necessary because the data only contains autumn fishing!
# Since the BT data is filtered for year > 1986, this has to be done here as well.
catch_FM <- filter(catch_FM, year > 1986)
#> filter: removed 9,455 rows (17%), 45,794 rows remaining
# All nets have the same effort
unique(catch_FM[["effort"]])
#> [1] 1
unique(catch_FM[["week"]])
#> [1] 32 30 31 33 34
# Read the BT09 catch data (semicolon-separated).
# Need to set fileEncoding here, else error: "invalid multibyte string 18"
df <- read.csv(
  "data/raw/Catch_data_BT09_140507.csv",
  sep = ";",
  fileEncoding = "latin1"
)
# Tidy data: keep rows with species code "ABBO" and year before 2004, give the
# key columns English names and remove unnecessary columns.
# filter() runs before rename(), so it has to use the original column name
# `Årtal` (the column that rename() turns into `year`).
df <- df %>%
  filter(Art == "ABBO", Årtal < 2004) %>%
  dplyr::rename(
    year = Årtal, week = Vecka, day = Dag, effort = Ansträngning,
    species = Art, weight = Vikt, n = Antal
  ) %>%
  select(-c(
    Vtn.stånd, VindRiktn.I, VindSt.I, Vind_upp_rikn, VindSt.Upp, Ström_I_rikn,
    Ström_upp_rikn, Salthalt_I_yta, Salthalt_I_botten, Salthalt_upp_yta,
    Salthalt_upp_botten, Drift_i, Drift_u, Drift_dim, Siktdjup, Lufttryck_i,
    Lufttryck_upp, Sjuk_kontroll
  ))
#> filter: removed 6,470 rows (81%), 1,543 rows remaining
#> select: dropped 18 variables (Vtn.stånd, VindRiktn.I, VindSt.I, Vind_upp_rikn, VindSt.Upp, …)
# Slightly cleaner, but we have a problem: the X-columns (X8, X9, ...) are
# lengths. We would much rather have a single `length` column, so that
# 1 row = 1 observation.
# Luckily there is a super neat function in tidyr (part of the tidyverse)
# called "gather". What I want to do is:
# 1. Give the new column names I want to create, e.g. "length"
# 2. Specify which columns are moved and shuffled into these columns.
# These will be the current columns for unique lengths. So check the str() and
# the column numbers using names() of the data again.
# I will move columns 13:121 = the ones starting at X8 and ending at X116.
# Now the X-columns will be rows AND you will get a new column that takes the
# old "column" value and puts that in the new column n2. DOUBLE CHECK!!
# Rows whose cell was NA are dropped.
dat <- df %>%
  gather(
    length, n2,
    c(
      X8, X9, X10, X11, X12, X13, X14, X15, X16, X17, X18, X19, X20,
      X21, X22, X23, X24, X25, X26, X27, X28, X29, X30, X31, X32, X33,
      X34, X35, X36, X37, X38, X39, X40, X41, X42, X43, X44, X45, X46,
      X47, X48, X49, X50, X51, X52, X53, X54, X55, X56, X57, X58, X59,
      X60, X61, X62, X63, X64, X65, X66, X67, X68, X69, X70, X71, X72,
      X73, X74, X75, X76, X77, X78, X79, X80, X81, X82, X84, X85, X86,
      X87, X88, X89, X90, X91, X93, X94, X96, X97, X98, X99, X101, X103,
      X104, X106, X109, X111, X114, X116
    ),
    na.rm = TRUE
  )
#> gather: reorganized (X8, X9, X10, X11, X12, …) into (length, n2) [was 1543x122, now 5427x27]
head(dat) # Now the data is in a long, tidy format.
head(df)
# Test it is correct: look at the same fishing occasion before and after
subset(df, year == 2003 & week == 19 & day == 4 & Stations_namn == "Malören")
subset(dat, year == 2003 & week == 19 & day == 4 & Stations_namn == "Malören")
# But "length" is not numeric but a character. Create a numeric version by
# stripping the leading "X" from the old column name
dat$length_group <- as.numeric(substring(dat$length, 2))
# Our n2 column tells us how many fish are in that size-group.
# We want one row for each observation, so we repeat every row n2 times.
# seq_len() is used instead of seq() so that a data frame with zero rows gives
# an empty index rather than c(1, 0).
dat <- dat[rep(seq_len(nrow(dat)), dat$n2), ]
head(dat, 50)
# Test it is correct: print one fishing occasion from the wide data ...
df %>%
  filter(year == 2003, week == 19, day == 4, Stations_namn == "Malören") %>%
  arrange(n) %>%
  as.data.frame()
#> filter: removed 1,539 rows (>99%), 4 rows remaining
# ... and the same occasion from the expanded long data. A handful of columns
# are dropped only to make the printout narrower.
dat %>%
  filter(year == 2003, week == 19, day == 4, Stations_namn == "Malören") %>%
  arrange(n) %>%
  dplyr::select(-c(
    YtTmp.I, YtTmp.Upp, Störning, TmpBtnU, Sektion_namn, Sektion,
    Areanamn, Position_N, Position_E, Provfiske, Redskap
  )) %>%
  as.data.frame()
#> filter: removed 16,023 rows (>99%), 13 rows remaining
# Remove disturbance
# Disturbance codes:
# 2: Seal damage
# 3: Strong algal growth on the gears
# 4: Clogging by drifting algae
# 9: Other reason (damage by boat traffic, other human interference etc.)
# Plot disturbance: histogram of the codes above 1
dat %>%
  filter(Störning > 1) %>%
  ggplot(mapping = aes(x = Störning)) +
  scale_x_continuous(breaks = c(2, 3, 4, 9)) +
  geom_histogram(bins = 10)
#> filter: removed 14,768 rows (92%), 1,268 rows remaining
# Keep only rows without disturbance (code 0)
dat <- filter(dat, Störning == 0)
#> filter: removed 1,268 rows (8%), 14,768 rows remaining
# Overfishing effect:
# They put nets many days in a row to get an overfishing effect.
# Catches decline after a few days. I don't want this effect!
# Here we plot which days are fished (fill) over the week that has been fished
# (x-axis). An "overfishing" effect could happen if a year is fished a lot on
# many consecutive days. It's not clear here that this is the case.
# Full data
ggplot(
  data = dat,
  mapping = aes(x = factor(week), fill = factor(day))
) +
  geom_bar() +
  facet_wrap(~year, scales = "free")
# Filter autumn (weeks after week 35)
dat %>%
  filter(week > 35) %>%
  ggplot(mapping = aes(x = factor(week), fill = factor(day))) +
  geom_bar() +
  facet_wrap(~year, scales = "free_y")
#> filter: removed 3,696 rows (25%), 11,072 rows remaining
# Ok, 1984 is heavily fished all year actually.
# We'll remove it since it's very different from the rest. When that is removed,
# we can use Malin's for loop to select fishing days after v. 40 (or any other
# week). The other two years with fishing before v. 40 are 1996 & 2003, but
# that's w. 30 so we don't believe it has a big effect.
# NOTE(review): the filter below drops every year up to and including 1986,
# not only 1984 - confirm that this is intended.
dat <- filter(dat, year > 1986)
#> filter: removed 1,208 rows (8%), 13,560 rows remaining
# In some years they have been fishing almost every week. To not have an
# overfishing effect, we use only the first part of the fishing that season
# (first day and first week).
# Filter autumn fishing:
# by keeping week > 40 and week < 49 I know that I include all October
# fishing trips
dat_oct <- filter(dat, week > 40, week < 49) # OCTOBER
#> filter: removed 5,324 rows (39%), 8,236 rows remaining
# Which years and weeks are left?
sort(unique(dat_oct[["year"]]))
#> [1] 1987 1988 1989 1990 1991 1992 1993 1994 1995 1996 1997 1998 2001 2002 2003
sort(unique(dat_oct[["week"]]))
#> [1] 41 42 43 44 45 46 47
# Since this gives me v. 41 42 43 44 45 46 47
# The years -99 and -00 are lost due to disturbance
# How does the filtered data set look? Rows per week, coloured by weekday,
# one panel per year.
ggplot(
  data = dat_oct,
  mapping = aes(x = factor(week), fill = factor(day))
) +
  geom_bar() +
  facet_wrap(~year) +
  scale_fill_brewer(name = "Weekday", palette = "Set1") +
  theme_classic(base_size = 12) +
  ggtitle("Biotest lake")
# Rename data
catch_BT <- dat_oct
# Clean data to make it more readable: keep only the columns used further on.
# Each column is listed exactly once, in the order it should appear.
catch_BT <- catch_BT %>%
  select(
    Area, Sektion, Station, year, week, day, length_group, Längdgr_std
  ) %>%
  as.data.frame()
#> select: dropped 20 variables (Provfiske, Areanamn, Sektion_namn, Stations_namn, Position_N, …)
# Same set of columns for the FM data, so the two can be stacked
catch_FM <- catch_FM %>%
  select(
    Area, Sektion, Station, year, week, day, length_group, Längdgr_std
  ) %>%
  as.data.frame()
#> select: dropped 19 variables (Provfiske, Areanamn, Sektion_namn, Stations_namn, Position_N, …)
# Stack BT on top of FM
catch <- rbind(catch_BT, catch_FM)
# First split up the data in two separate data frames, one with std 2 and one
# with std 3. Call them dat_std2 and dat_std3.
dat_std2 <- filter(catch, Längdgr_std == 2)
#> filter: removed 6,670 rows (12%), 47,360 rows remaining
dat_std3 <- filter(catch, Längdgr_std == 3)
#> filter: removed 47,360 rows (88%), 6,670 rows remaining
# Convert std 3 to std 2: first list the std 3 length groups that occur
sort(unique(dat_std3[["length_group"]]))
#> [1] 7 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32
#> [26] 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47
# 7 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35
# 36 37 38 39 40 41 42 43 44 45 46 47
# Insert new column `length_group2`: the std 2 length group that each std 3
# length group is converted to.
# The conversion is kept in one lookup table (name = std 3 group, value = std 2
# group) so that every pair can be checked at a glance.
std3_to_std2 <- c(
  "7" = 6,
  "9" = 9, "10" = 9,
  "11" = 11, "12" = 11, "13" = 11,
  "14" = 14, "15" = 14,
  "16" = 16, "17" = 16, "18" = 16,
  "19" = 19, "20" = 19,
  "21" = 21, "22" = 21, "23" = 21,
  "24" = 24, "25" = 24,
  "26" = 26, "27" = 26, "28" = 26,
  "29" = 29, "30" = 29,
  "31" = 31, "32" = 31, "33" = 31,
  "34" = 34, "35" = 34,
  "36" = 36, "37" = 36, "38" = 36,
  "39" = 39, "40" = 39,
  "41" = 41, "42" = 41, "43" = 41,
  "44" = 44, "45" = 44,
  "46" = 46, "47" = 46, "48" = 46
)
# Look every row up by the character form of its length group. Groups that are
# not in the table become NA.
dat_std3$length_group2 <- unname(
  std3_to_std2[as.character(dat_std3$length_group)]
)
# Make unmapped groups visible instead of letting the NAs pass silently
if (anyNA(dat_std3$length_group2)) {
  unmapped <- unique(dat_std3$length_group[is.na(dat_std3$length_group2)])
  warning(
    "std 3 length group(s) without a std 2 conversion: ",
    paste(sort(unmapped), collapse = ", "),
    call. = FALSE
  )
}
# Now compare actual values (old group on x, converted group on y) - Looks ok!
ggplot(
  data = dat_std3,
  mapping = aes(x = factor(length_group), y = factor(length_group2))
) +
  geom_point(size = 2)
# Remove length_group so that only std 2 is included
dat_std3 <- select(dat_std3, -length_group)
#> select: dropped one variable (length_group)
# Rename length_group2 to length_group
dat_std3 <- dplyr::rename(dat_std3, length_group = length_group2)
# Merge data frames
catch_full <- rbind(dat_std2, dat_std3)
# In order to get even length classes (with respect to the code) I gave a new
# code to std 2 which represents the starting length in each interval, as
# opposed to the original which was the integer in the middle of 2.5 cm
# classes...
# E.g. if the interval is 37.6 - 40, the new code becomes 37.6.
sort(unique(catch_full$length_group))
#> [1] 6 9 11 14 16 19 21 24 26 29 31 34 36 39 41 44 46
# 6 9 11 14 16 19 21 24 26 29 31 34 36 39 41 44 46
# Insert new column `new_length_group`. The conversion is kept in one lookup
# table (name = std 2 code, value = new code). The values are written out as
# literals rather than computed, so they are exactly the numbers shown here.
std2_to_new_code <- c(
  "6" = 5.1, "9" = 7.6, "11" = 10.1, "14" = 12.6, "16" = 15.1, "19" = 17.6,
  "21" = 20.1, "24" = 22.6, "26" = 25.1, "29" = 27.6, "31" = 30.1,
  "34" = 32.6, "36" = 35.1, "39" = 37.6, "41" = 40.1, "44" = 42.6,
  "46" = 45.1
)
# Look every row up by the character form of its length group. Groups that are
# not in the table become NA.
catch_full$new_length_group <- unname(
  std2_to_new_code[as.character(catch_full$length_group)]
)
# Make unmapped groups visible instead of letting the NAs pass silently
if (anyNA(catch_full$new_length_group)) {
  unmapped <- unique(
    catch_full$length_group[is.na(catch_full$new_length_group)]
  )
  warning(
    "std 2 length group(s) without a new code: ",
    paste(sort(unmapped), collapse = ", "),
    call. = FALSE
  )
}
# Compare values: old code on x, new code on y, with the 1:1 line in red
ggplot(
  data = catch_full,
  mapping = aes(x = length_group, y = new_length_group)
) +
  geom_abline(slope = 1, intercept = 0, color = "red") +
  geom_point(size = 2)
# Remove length_group so that only new_length_group is included
catch_full <- select(catch_full, -length_group)
#> select: dropped one variable (length_group)
# Rename new_length_group to length_group
catch_full <- dplyr::rename(catch_full, length_group = new_length_group)
# Insert netID. First look at the station numbers that occur.
sort(unique(catch_full[["Station"]]))
#> [1] 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 21 22 23 24 25 26
#> [26] 27 28 29 30 31 32 33 34 35 36 37 38 39 40 42 43 44 45 46 47 48 49 50
# Check that none of the columns used to build the IDs contains NA
# (note: the name `t` is also the name of the base function t())
t <- dplyr::select(catch_full, Area, year, week, day, Sektion, Station)
unique(is.na(t))
#> Area year week day Sektion Station
#> 2857 FALSE FALSE FALSE FALSE FALSE FALSE
# netID: area, year and station joined with "."
catch_full$netID <- with(
  catch_full,
  paste(Area, year, Station, sep = ".")
)
# netID2: as netID, but also with week, day and section
catch_full$netID2 <- with(
  catch_full,
  paste(Area, year, week, day, Sektion, Station, sep = ".")
)
# Number of distinct netID values
length(unique(catch_full[["netID"]]))
#> [1] 256
Save data
# Write the cleaned, combined catch data to disk without the row names
write.csv(
  x = catch_full,
  file = "data/cleaned/catch_BT_FM_1987-2003.csv",
  row.names = FALSE
)