Read and add final touches to data
df <- read.csv("data/aged_catch_BT_FM_1987-2003.csv")
head(df)
# 1996 is a special year because stations changed. But since we don't do anything with station
# we simply ignore that
# df %>%
# group_by(year, Area, Station) %>%
# summarise(n = n()) %>%
# ggplot(., aes(year, n, fill = factor(Station))) +
# geom_bar(stat = "identity") +
# facet_wrap(~Area)
# df %>%
# filter(Area == "BT") %>%
# group_by(year, netID) %>%
# summarise(n = n()) %>%
# ggplot(., aes(year, n)) + geom_bar(stat="identity")
# df %>% group_by(Area) %>% summarise(mean_age = mean(age), sd_age = sd(age))
# df %>% filter(age > 2) %>% group_by(Area) %>% summarise(mean_age = mean(age), sd_age = sd(age))
# ggplot(df, aes(Area, age)) + geom_boxplot()
# df %>% filter(age > 2) %>% ggplot(., aes(Area, age)) + geom_boxplot()
# summary(lm(age~Area, data = df))
# summary(lm(age~Area, data = filter(df, age > 2)))
# Currently the data is in format 1 row, 1 ind. We want a column that has the CPUE
# by length class.
# How many nets in total per area and year? (For scaling with effort later)
df <- df %>% group_by(Area, year) %>%
mutate(n_nets_year_test = length(unique(netID))) %>%
mutate(n_nets_year = length(unique(netID2))) %>%
ungroup() %>%
as.data.frame()
#> group_by: 2 grouping variables (Area, year)
#> mutate (grouped): new variable 'n_nets_year_test' (integer) with 4 unique values and 0% NA
#> mutate (grouped): new variable 'n_nets_year' (integer) with 10 unique values and 0% NA
#> ungroup: no grouping variables
length(unique(filter(df, Area == "BT")$netID))
#> filter: removed 45,794 rows (85%), 8,236 rows remaining
#> [1] 118
length(unique(filter(df, Area == "FM")$netID))
#> filter: removed 8,236 rows (15%), 45,794 rows remaining
#> [1] 138
length(unique(filter(df, Area == "BT")$netID2))
#> filter: removed 45,794 rows (85%), 8,236 rows remaining
#> [1] 500
length(unique(filter(df, Area == "FM")$netID2))
#> filter: removed 8,236 rows (15%), 45,794 rows remaining
#> [1] 825
# We want the data as follows:
# Year, LngtClass, Age, Number
# Group by year, Area and age, summarize and get n() per length, year and area...
df2 <- df %>%
group_by(year, Area, age) %>%
summarise(catch_n = n()) %>% # Get catch as a column, this is now the number of rows a given length occurs each year and area
ungroup() %>%
mutate(effort_id = paste(year, Area, sep = ".")) %>% # For calculating the total effort that gave that catch (total for year and area)
as.data.frame()
#> group_by: 3 grouping variables (year, Area, age)
#> summarise: now 278 rows and 4 columns, 2 group variables remaining (year, Area)
#> ungroup: no grouping variables
#> mutate: new variable 'effort_id' (character) with 32 unique values and 0% NA
# Now we need to get the effort back in there
df_effort <- df %>%
mutate(effort_id = paste(year, Area, sep = ".")) %>%
select(effort_id, n_nets_year_test, n_nets_year) %>%
distinct(effort_id, .keep_all = TRUE) %>%
as.data.frame()
#> mutate: new variable 'effort_id' (character) with 32 unique values and 0% NA
#> select: dropped 11 variables (Area, Sektion, Station, year, week, …)
#> distinct: removed 53,998 rows (>99%), 32 rows remaining
# Now do a left_join
df3 <- left_join(df2, df_effort, by = "effort_id") %>% as.data.frame()
#> left_join: added 2 columns (n_nets_year_test, n_nets_year)
#> > rows only in x 0
#> > rows only in y ( 0)
#> > matched rows 278
#> > =====
#> > rows total 278
# Looks Ok!
df %>% filter(year == 1987 & Area == "FM")
#> filter: removed 52,160 rows (97%), 1,870 rows remaining
df %>% filter(year == 1987 & Area == "FM") %>% distinct(n_nets_year, .keep_all = TRUE)
#> filter: removed 52,160 rows (97%), 1,870 rows remaining
#> distinct: removed 1,869 rows (>99%), one row remaining
df2 %>% filter(effort_id == "1987.FM")
#> filter: removed 270 rows (97%), 8 rows remaining
df3 %>% filter(effort_id == "1987.FM")
#> filter: removed 270 rows (97%), 8 rows remaining
# Go from total catch to CPUE
df4 <- df3 %>%
ungroup() %>%
rename("area" = "Area") %>%
mutate(cpue_numbers_test = catch_n/n_nets_year_test,
cpue_numbers = catch_n/n_nets_year) # Get numbers CPUE, divide by the previously create n_nets, which is # of unique net ID's in each area and year
#> ungroup: no grouping variables
#> rename: renamed one variable (area)
#> mutate: new variable 'cpue_numbers_test' (double) with 197 unique values and 0% NA
#> new variable 'cpue_numbers' (double) with 225 unique values and 0% NA
head(df3, 30)
head(df4)
p1 <- df4 %>%
group_by(area, year) %>%
summarise(sum_cpue = sum(cpue_numbers)) %>%
ggplot(aes(year, sum_cpue, color = area)) +
geom_point() +
stat_smooth() +
ggtitle("long ID")
#> group_by: 2 grouping variables (area, year)
#> summarise: now 32 rows and 3 columns, one group variable remaining (area)
p2 <- df4 %>%
group_by(area, year) %>%
summarise(sum_cpue = sum(cpue_numbers_test)) %>%
ggplot(aes(year, sum_cpue, color = area)) +
geom_point() +
stat_smooth() +
ggtitle("short ID")
#> group_by: 2 grouping variables (area, year)
#> summarise: now 32 rows and 3 columns, one group variable remaining (area)
p1/p2
#> `geom_smooth()` using method = 'loess' and formula 'y ~ x'
#> `geom_smooth()` using method = 'loess' and formula 'y ~ x'

# Calculate average cpue by year and area
df4
p1 <- df4 %>%
group_by(year, area, age) %>%
summarise(mean_cpue = mean(cpue_numbers)) %>%
ggplot(aes(year, mean_cpue, fill = factor(age))) +
geom_area() +
facet_wrap(~area) +
scale_fill_brewer(palette = "Set1") +
coord_cartesian(expand = 0) +
ggtitle("New ID")
#> group_by: 3 grouping variables (year, area, age)
#> summarise: now 278 rows and 4 columns, 2 group variables remaining (year, area)
p2 <- df4 %>%
group_by(year, area, age) %>%
summarise(mean_cpue = mean(cpue_numbers_test)) %>%
ggplot(aes(year, mean_cpue, fill = factor(age))) +
geom_area() +
facet_wrap(~area) +
scale_fill_brewer(palette = "Set1") +
coord_cartesian(expand = 0) +
ggtitle("Old ID")
#> group_by: 3 grouping variables (year, area, age)
#> summarise: now 278 rows and 4 columns, 2 group variables remaining (year, area)
p1/p2

# Test I get 1 unique row per age, year and area
df4 %>%
group_by(year, area, age) %>%
summarise(n = n()) %>%
ungroup() %>%
distinct(n)
#> group_by: 3 grouping variables (year, area, age)
#> summarise: now 278 rows and 4 columns, 2 group variables remaining (year, area)
#> ungroup: no grouping variables
#> distinct: removed 277 rows (>99%), one row remaining
# Plot the log cpue as function of age to find the ages that correspond to the descending limb:
ggplot(df4, aes(factor(age), log(cpue_numbers), color = factor(year))) +
geom_point() +
facet_wrap(~ area) +
scale_color_viridis(discrete = TRUE) +
coord_cartesian(expand = 0) +
theme(aspect.ratio = 1)

# Let's use age 3 and older
ggplot(df4, aes(factor(age), log(cpue_numbers), color = factor(year))) +
geom_point() +
facet_wrap(~ area) +
scale_color_viridis(discrete = TRUE) +
coord_cartesian(expand = 0) +
theme(aspect.ratio = 1)

# Plot total catch
df2 %>%
group_by(Area, year) %>%
summarize(tot_catch = sum(catch_n)) %>%
ggplot(., aes(year, tot_catch)) + geom_bar(stat = "identity") +
facet_wrap(~ Area)
#> group_by: 2 grouping variables (Area, year)
#> summarize: now 32 rows and 3 columns, one group variable remaining (Area)

# Plot cpue
df4 %>%
group_by(area, year) %>%
ggplot(., aes(year, cpue_numbers)) + geom_bar(stat = "identity") +
facet_wrap(~ area)
#> group_by: 2 grouping variables (area, year)

# Edit variables
d <- df4 %>%
mutate(log_cpue = log(cpue_numbers),
area2 = ifelse(area == "BT", "Warm", "Cold"),
cohort = year - age)
#> mutate: new variable 'log_cpue' (double) with 225 unique values and 0% NA
#> new variable 'area2' (character) with 2 unique values and 0% NA
#> new variable 'cohort' (integer) with 25 unique values and 0% NA
# Remove cohorts before 1981
d <- d %>% filter(cohort > 1980, age > 2)
#> filter: removed 70 rows (25%), 208 rows remaining
sort(unique(d$cohort))
#> [1] 1981 1982 1983 1984 1985 1986 1987 1988 1989 1990 1991 1992 1993 1994 1995
#> [16] 1996 1997 1998 1999 2000
max(d$year)
#> [1] 2003
Explore data
# Plot data
d %>% filter(area == "BT") %>%
ggplot(aes(age, log_cpue)) +
geom_point() +
stat_smooth(method = "lm", se = FALSE) +
facet_wrap(~ cohort)
#> filter: removed 113 rows (54%), 95 rows remaining
#> `geom_smooth()` using formula 'y ~ x'

d %>% filter(area == "FM") %>%
ggplot(aes(age, log_cpue)) +
geom_point() +
stat_smooth(method = "lm", se = FALSE) +
facet_wrap(~ cohort)
#> filter: removed 95 rows (46%), 113 rows remaining
#> `geom_smooth()` using formula 'y ~ x'

d %>%
ggplot(aes(age, cpue_numbers)) +
geom_point() +
stat_smooth(method = "lm", se = FALSE) +
facet_wrap(~ area)
#> `geom_smooth()` using formula 'y ~ x'

# Make age proportion data frame (not by year)
tot_cpue_area <- d %>%
filter(age > 2) %>%
mutate(Area2 = ifelse(area == "BT", "Warm", "Cold")) %>%
group_by(Area2) %>%
summarise(sum_cpue = sum(cpue_numbers))
#> filter: no rows removed
#> mutate: new variable 'Area2' (character) with 2 unique values and 0% NA
#> group_by: one grouping variable (Area2)
#> summarise: now 2 rows and 2 columns, ungrouped
d_prop_age <- d %>%
filter(age > 2) %>%
mutate(Area2 = ifelse(area == "BT", "Warm", "Cold")) %>%
group_by(Area2) %>%
mutate(sum_cpue = sum(cpue_numbers)) %>%
ungroup() %>%
group_by(Area2, age) %>%
summarise(sum_cpue_age = sum(cpue_numbers)) %>%
ungroup()
#> filter: no rows removed
#> mutate: new variable 'Area2' (character) with 2 unique values and 0% NA
#> group_by: one grouping variable (Area2)
#> mutate (grouped): new variable 'sum_cpue' (double) with 2 unique values and 0% NA
#> ungroup: no grouping variables
#> group_by: 2 grouping variables (Area2, age)
#> summarise: now 14 rows and 3 columns, one group variable remaining (Area2)
#> ungroup: no grouping variables
d_prop_age %>% as.data.frame()
d_prop_age <- left_join(d_prop_age, tot_cpue_area) %>%
mutate(prop = sum_cpue_age / sum_cpue)
#> Joining, by = "Area2"
#> left_join: added one column (sum_cpue)
#> > rows only in x 0
#> > rows only in y ( 0)
#> > matched rows 14
#> > ====
#> > rows total 14
#> mutate: new variable 'prop' (double) with 14 unique values and 0% NA
pal <- rev(brewer.pal(n = 6, name = "Paired")[c(2, 6)])
p <- d_prop_age %>%
ggplot(aes(factor(age), prop, color = Area2)) +
geom_point(stat = "identity") +
scale_color_manual(values = rev(pal)) +
theme_light() +
NULL
# Plot proportion by age
pp <- df %>%
filter(age > 2) %>%
mutate(Area2 = ifelse(Area == "BT", "Warm", "Cold")) %>%
group_by(Area2, age) %>%
summarise(n = n()) %>%
mutate(freq = n / sum(n)) %>%
ggplot(aes(x = factor(age), y = freq, color = Area2, group = interaction(age, Area2))) +
geom_point() +
scale_color_manual(values = rev(pal)) +
theme_light() +
labs(x = "Age", y = "Proportion") +
guides(fill = "none") +
theme(text = element_text(size = 12))
#> filter: removed 15,771 rows (29%), 38,259 rows remaining
#> mutate: new variable 'Area2' (character) with 2 unique values and 0% NA
#> group_by: 2 grouping variables (Area2, age)
#> summarise: now 14 rows and 3 columns, one group variable remaining (Area2)
#> mutate (grouped): new variable 'freq' (double) with 14 unique values and 0% NA
p / pp

df %>%
filter(age > 2) %>%
mutate(Area2 = ifelse(Area == "BT", "Warm", "Cold")) %>%
group_by(Area2, year, age) %>%
summarise(n = n()) %>%
mutate(freq = n / sum(n)) %>%
ggplot(aes(x = factor(age), y = freq, color = Area2, group = interaction(age, Area2))) +
geom_point(position = position_dodge(width = 0.8)) +
geom_boxplot(position = position_dodge(width = 0.8), fill = NA) +
scale_color_manual(values = rev(pal)) +
theme_light() +
labs(x = "Age", y = "Proportion") +
guides(fill = "none") +
theme(text = element_text(size = 12))
#> filter: removed 15,771 rows (29%), 38,259 rows remaining
#> mutate: new variable 'Area2' (character) with 2 unique values and 0% NA
#> group_by: 3 grouping variables (Area2, year, age)
#> summarise: now 219 rows and 4 columns, 2 group variables remaining (Area2, year)
#> mutate (grouped): new variable 'freq' (double) with 214 unique values and 0% NA

ggsave("figures/supp/age_prop_diff.png", width = 3.5, height = 6.5, dpi = 600)