Analysis for Characteristics of macroplastic debris in eight diverse river systems across four continents

Packages

library(tidyverse)
library(dplyr)
library(zoo)
library(devtools)
library(fastR2)

Datasets

final_dataset_raw.csv

final_dataset.csv

final_dataset_rounded.csv

final_dataset_fig2.csv

final_dataset_fig3.csv

final_dataset_fig4.csv

final_dataset_fig5.csv

fig2_data.csv

fig3_data.csv

fig4_data.csv

fig5_data.csv

fig2_glmm.csv

subsample_data.csv

Manage and Wrangle Raw Data (final_dataset_raw.csv) to Analysis-Ready Format (final_dataset.csv)

final_dataset_raw.csv | Dataset as collected and submitted by local community-led efforts at each study site

final_dataset.csv | Analysis-ready dataset, wrangled to account for variations in collection, sorting, labeling, etc. across study sites as well as logical data entry errors, to establish standardization across study sites for orthogonal comparisons

final_dataset_rounded.csv | Same as final_dataset.csv, rounded to 4 decimal places

Standardize total debris and plastic debris

#Load the raw dataset as submitted by the study sites
final_dataset_raw <- read_csv("final_dataset_raw.csv")

#Keep only entries that report a positive total debris weight (rows where all_debris is zero, negative, or missing are dropped)
final_dataset <- filter(final_dataset_raw, all_debris > 0)

#Create a single plastic debris column. Some study sites reported plastic debris as "dried/clean" and others as "wet/dirty"; because procedures and methods differ within and across those categories, the raw weights are kept as reported rather than attempting to adjust one category to the other

#Coerce both reported weight columns to numeric
for (weight_col in c("plastic_debris_dirty", "plastic_debris_clean")) {
  final_dataset[[weight_col]] <- as.numeric(final_dataset[[weight_col]])
}

#Kenya and Thailand take the "dried/clean" weight; every other country takes the "wet/dirty" weight (a missing country yields NA)
final_dataset <- final_dataset %>%
  rowwise() %>%
  mutate(
    plastic_debris = ifelse(
      country == "Kenya" | country == "Thailand",
      plastic_debris_clean,
      plastic_debris_dirty
    )
  )

#Add a river_id column for simpler grouping and identification in later analysis, because some locations have devices in multiple rivers and report them as such. Each listed country maps to one river; any other country is labelled "Kingston", and a missing country stays missing
final_dataset <- final_dataset %>%
  rowwise() %>%
  mutate(
    river_id = case_when(
      is.na(country) ~ NA_character_,
      country == "Kenya" ~ "Athi",
      country == "Thailand" ~ "Lat Phrao",
      country == "Mexico" ~ "Tijuana",
      country == "Ecuador" ~ "Portoviejo",
      country == "Vietnam" ~ "Song Hong",
      country == "Panama" ~ "Juan Díaz",
      country == "Indonesia" ~ "Citarum",
      TRUE ~ "Kingston"
    )
  )

#Add a date column that joins the separately reported month and year with a single space, for simpler grouping in later analysis
final_dataset$date <- paste(final_dataset$month, final_dataset$year)

#Plastic debris reported in Panama is just the recycled PET and HDPE (total debris was known and reported). Additional sub-sampling was performed to estimate the true proportion of plastic in the total debris, and that proportion is applied to the total debris of each month in Panama to estimate this study site's plastic debris
#NOTE(review): this comment previously quoted 50.91%, but the multiplier applied below is 0.5165171 (51.65%), which matches the sum of the Panama per-polymer fractions used in the polymer section of this script - confirm which figure is intended

#Coerce the plastic debris column to numeric
final_dataset$plastic_debris <- as.numeric(final_dataset$plastic_debris)

#Fraction of Panama's total debris that is treated as plastic
panama_plastic_fraction <- .5165171

#For rows on the Juan Díaz river, overwrite plastic_debris with total debris times the plastic fraction; all other rows keep their existing value
final_dataset$plastic_debris <- ifelse(
  final_dataset$river_id == "Juan Díaz",
  final_dataset$all_debris * panama_plastic_fraction,
  final_dataset$plastic_debris
)

Standardize polymer types

#In order from polymer 1 (PET) to 7 (Other), bring the various forms of reporting from different study sites under one standardized column for each polymer

#PET (1)

#Coerce the reported PET columns to numeric and treat missing weights as 0 so NA cannot break the sums below
for (pet_col in c("pet_plastic", "polymeruk_petliq_plastic")) {
  pet_values <- as.numeric(final_dataset[[pet_col]])
  pet_values[is.na(pet_values)] <- 0
  final_dataset[[pet_col]] <- pet_values
}

#Build the standardized PET column. Jamaica reported two PET categories, so they are combined; Panama PET is estimated from sub-sampling of total debris (proportion of all debris in sub-sampling that was PET multiplied by all debris); all others report total PET in the single PET category. A missing country yields NA
final_dataset <- final_dataset %>%
  rowwise() %>%
  mutate(
    plastic_pet = case_when(
      is.na(country) ~ NA_real_,
      country == "Jamaica" ~ sum(pet_plastic, polymeruk_petliq_plastic, na.rm = TRUE),
      country == "Panama" ~ all_debris * 0.307392808,
      TRUE ~ pet_plastic
    )
  )

#HDPE (2)

#Coerce the reported HDPE columns to numeric and treat missing weights as 0 so NA cannot break the sums below
for (hdpe_col in c("hdpe_plastic", "polymeruk_hdpebad_plastic")) {
  hdpe_values <- as.numeric(final_dataset[[hdpe_col]])
  hdpe_values[is.na(hdpe_values)] <- 0
  final_dataset[[hdpe_col]] <- hdpe_values
}

#Build the standardized HDPE column. Jamaica reported two HDPE categories, so they are combined; Panama HDPE is estimated from sub-sampling of total debris (proportion of all debris in sub-sampling that was HDPE multiplied by all debris); all others report total HDPE in the single HDPE category. A missing country yields NA
final_dataset <- final_dataset %>%
  rowwise() %>%
  mutate(
    plastic_hdpe = case_when(
      is.na(country) ~ NA_real_,
      country == "Jamaica" ~ sum(hdpe_plastic, polymeruk_hdpebad_plastic, na.rm = TRUE),
      country == "Panama" ~ all_debris * 0.04897705,
      TRUE ~ hdpe_plastic
    )
  )

#PVC (3)

#Coerce the reported PVC column to numeric and treat missing weights as 0
pvc_values <- as.numeric(final_dataset$pvc_plastic)
pvc_values[is.na(pvc_values)] <- 0
final_dataset$pvc_plastic <- pvc_values

#Build the standardized PVC column. Panama PVC is estimated from sub-sampling of total debris (proportion of all debris in sub-sampling that was PVC multiplied by all debris); all others report total PVC in the single PVC category. A missing country yields NA
final_dataset <- final_dataset %>%
  rowwise() %>%
  mutate(
    plastic_pvc = case_when(
      is.na(country) ~ NA_real_,
      country == "Panama" ~ all_debris * 0.018572707,
      TRUE ~ pvc_plastic
    )
  )

#LDPE (4)

#Coerce the columns used for LDPE to numeric and treat missing weights as 0 so NA cannot break the sums below
for (ldpe_col in c("ldpe_plastic", "polymeruk_wrapper_plastic", "film_plastic", "plastic_wtoe")) {
  ldpe_values <- as.numeric(final_dataset[[ldpe_col]])
  ldpe_values[is.na(ldpe_values)] <- 0
  final_dataset[[ldpe_col]] <- ldpe_values
}

#Build the standardized LDPE column. A missing country yields NA
#Vietnam: some "wrappers" were reported as "unknown polymer"; they are assumed to be LDPE and added to the reported LDPE
#Jamaica: LDPE was not sorted out; the film plastic category is assumed to be primarily LDPE
#Thailand: only sorted recycled plastic was reported (all non-recycled plastic was sent to waste to energy), so the LDPE proportion found by sub-sampling the non-recycled plastic is multiplied by the plastic sent to waste to energy and added to the recycled LDPE total
#Panama: estimated from sub-sampling of total debris (proportion of all debris in sub-sampling that was LDPE multiplied by all debris)
#All others: total LDPE as reported in the LDPE column
final_dataset <- final_dataset %>%
  rowwise() %>%
  mutate(
    plastic_ldpe = case_when(
      is.na(country) ~ NA_real_,
      country == "Vietnam" ~ sum(ldpe_plastic, polymeruk_wrapper_plastic, na.rm = TRUE),
      country == "Jamaica" ~ film_plastic,
      country == "Thailand" ~ plastic_wtoe * .72619513 + ldpe_plastic,
      country == "Panama" ~ all_debris * 0.006281589,
      TRUE ~ ldpe_plastic
    )
  )

#PP (5)

#Coerce the columns used for PP to numeric and treat missing weights as 0 so NA cannot break the sums below
for (pp_col in c("pp_plastic", "polymeruk_ricebag_plastic", "hard_plastic")) {
  pp_values <- as.numeric(final_dataset[[pp_col]])
  pp_values[is.na(pp_values)] <- 0
  final_dataset[[pp_col]] <- pp_values
}

#Build the standardized PP column. A missing country yields NA
#Vietnam: "rice bags" were reported as "unknown polymer"; they are assumed to be PP, as they typically are and are reported so in other locations
#Jamaica: PP was not sorted out; the hard plastic category is assumed to be primarily PP (HDPE and PET were sorted out separately and PVC was very uncommon)
#Panama: estimated from sub-sampling of total debris (proportion of all debris in sub-sampling that was PP multiplied by all debris)
#All others: total PP as reported in the PP column
final_dataset <- final_dataset %>%
  rowwise() %>%
  mutate(
    plastic_pp = case_when(
      is.na(country) ~ NA_real_,
      country == "Vietnam" ~ sum(pp_plastic, polymeruk_ricebag_plastic, na.rm = TRUE),
      country == "Jamaica" ~ hard_plastic,
      country == "Panama" ~ all_debris * 0.005138933,
      TRUE ~ pp_plastic
    )
  )

#PS (6)

#Coerce the columns used for PS to numeric and treat missing weights as 0
for (ps_col in c("ps_plastic", "foam_plastic")) {
  ps_values <- as.numeric(final_dataset[[ps_col]])
  ps_values[is.na(ps_values)] <- 0
  final_dataset[[ps_col]] <- ps_values
}

#Build the standardized PS column. A missing country yields NA
#Jamaica: PS was not sorted out; the foam plastic category is assumed to be primarily PS
#Thailand: only sorted recycled plastic was reported (all non-recycled plastic was sent to waste to energy), so the PS proportion found by sub-sampling the non-recycled plastic is multiplied by the plastic sent to waste to energy and added to the recycled PS total
#Panama: estimated from sub-sampling of total debris (proportion of all debris in sub-sampling that was PS multiplied by all debris)
#All others: total PS as reported in the PS column
final_dataset <- final_dataset %>%
  rowwise() %>%
  mutate(
    plastic_ps = case_when(
      is.na(country) ~ NA_real_,
      country == "Jamaica" ~ foam_plastic,
      country == "Thailand" ~ plastic_wtoe * .03213902 + ps_plastic,
      country == "Panama" ~ all_debris * 0.076376629,
      TRUE ~ ps_plastic
    )
  )

#Other (7)

#Coerce the columns used for the Other class to numeric and treat missing weights as 0 so NA cannot break the sums below
for (other_col in c("pa_plastic", "ptfe_plastic", "polymerother_plastic", "polyisoprene_plastic")) {
  other_values <- as.numeric(final_dataset[[other_col]])
  other_values[is.na(other_values)] <- 0
  final_dataset[[other_col]] <- other_values
}

#Build the standardized Other column. A missing country yields NA
#Thailand: only sorted recycled plastic was reported (all non-recycled plastic was sent to waste to energy), so the proportion of the non-recycled sub-sample that did not fall in the six other polymer classes is multiplied by the plastic sent to waste to energy and added to the PA, PTFE, Other, and polyisoprene recycled totals
#Panama: estimated from sub-sampling of total debris (proportion of all debris in sub-sampling that was plastic but not one of the six other polymer classes, multiplied by all debris)
#All others: PA, PTFE, other, and polyisoprene categories combined
final_dataset <- final_dataset %>%
  rowwise() %>%
  mutate(
    plastic_other = case_when(
      is.na(country) ~ NA_real_,
      country == "Thailand" ~ sum(plastic_wtoe * .24166585, pa_plastic, ptfe_plastic, polymerother_plastic, polyisoprene_plastic, na.rm = TRUE),
      country == "Panama" ~ all_debris * 0.053777418,
      TRUE ~ sum(pa_plastic, ptfe_plastic, polymerother_plastic, polyisoprene_plastic, na.rm = TRUE)
    )
  )

Standardize item categories

#Wrangle single-use item type data for Ecuador, Indonesia, Kenya, Vietnam (data only available for this subset of study sites)

#Coerce every column used for the item type categories to numeric and treat missing weights as 0 to avoid computational errors caused by NA
for (item_source_col in c("plastic_pet", "wrappers_plastic", "bags_plastic", "plastic_ps", "plastic_debris")) {
  item_source_values <- as.numeric(final_dataset[[item_source_col]])
  item_source_values[is.na(item_source_values)] <- 0
  final_dataset[[item_source_col]] <- item_source_values
}

#Mutate item type categories

#Bottles: for the subset countries (Ecuador, Indonesia, Kenya, Vietnam) the reported bottle totals were all equivalent to reported PET, so the PET value is used; every other country gets 0 (a missing country yields NA)
final_dataset <- final_dataset %>%
  rowwise() %>%
  mutate(
    plastic_item_bottles = ifelse(
      country == "Ecuador" | country == "Indonesia" | country == "Kenya" | country == "Vietnam",
      plastic_pet,
      0
    )
  )

#One-off correction for a misreported value
#The bottle amount differs from PET in this one Kenya case; all other numbers check out
#NOTE(review): row 250 / column 51 are positional - presumably column 51 is plastic_item_bottles at this point; confirm against the raw column count
final_dataset[250, 51] <- 440

#Bags: specific bag weights were reported for the subset countries (Ecuador, Indonesia, Kenya, Vietnam), so those are used directly; every other country gets 0 (a missing country yields NA)
final_dataset <- final_dataset %>%
  rowwise() %>%
  mutate(
    plastic_item_bags = ifelse(
      country == "Ecuador" | country == "Indonesia" | country == "Kenya" | country == "Vietnam",
      bags_plastic,
      0
    )
  )

#Wrappers: specific wrapper weights were reported for the subset countries (Ecuador, Indonesia, Kenya, Vietnam), so those are used directly; every other country gets 0 (a missing country yields NA)
final_dataset <- final_dataset %>%
  rowwise() %>%
  mutate(
    plastic_item_wrappers = ifelse(
      country == "Ecuador" | country == "Indonesia" | country == "Kenya" | country == "Vietnam",
      wrappers_plastic,
      0
    )
  )

#Foam: for the subset countries (Ecuador, Indonesia, Kenya, Vietnam) the standardized PS value is used, on the assumption (based on collection methods) that reported PS was largely single-use expanded polystyrene (foam); every other country gets 0 (a missing country yields NA)
final_dataset <- final_dataset %>%
  rowwise() %>%
  mutate(
    plastic_item_foam = ifelse(
      country == "Ecuador" | country == "Indonesia" | country == "Kenya" | country == "Vietnam",
      plastic_ps,
      0
    )
  )

#Other items: for visualization purposes, the subset countries (Ecuador, Indonesia, Kenya, Vietnam) get the plastic debris reported minus the sum of the four item types, so the item proportions add up to 1; every other country gets 0 (a missing country yields NA)
final_dataset <- final_dataset %>%
  rowwise() %>%
  mutate(
    plastic_item_other = ifelse(
      country == "Ecuador" | country == "Indonesia" | country == "Kenya" | country == "Vietnam",
      plastic_debris - sum(plastic_item_bottles, plastic_item_bags, plastic_item_wrappers, plastic_item_foam, na.rm = TRUE),
      0
    )
  )

#Item type categories do not apply to Jamaica, Mexico, Thailand, or Panama in this analysis, so every item type column is set to the text "N/A" for those countries
#Writing the "N/A" text turns these columns into character columns
item_type_cols <- c(
  "plastic_item_bottles",
  "plastic_item_bags",
  "plastic_item_wrappers",
  "plastic_item_foam",
  "plastic_item_other"
)
for (excluded_country in c("Jamaica", "Mexico", "Thailand", "Panama")) {
  for (item_type_col in item_type_cols) {
    final_dataset[[item_type_col]] <- ifelse(
      final_dataset$country == excluded_country,
      "N/A",
      final_dataset[[item_type_col]]
    )
  }
}

Standardize end-of-life fate categories

#Wrangle end-of-life fate data for the consolidated groupings used in the analysis

#Coerce every reported end-of-life fate column to numeric and treat missing weights as 0 to avoid computational errors caused by NA
fate_source_cols <- c(
  "plastic_recycled",
  "plastic_thirdrecycler",
  "plastic_reused",
  "plastic_wtoe",
  "plastic_downcycled",
  "plastic_solddowncycler",
  "plastic_stockfordowncycle",
  "plastic_cement",
  "plastic_incinerated",
  "plastic_landfilled"
)
for (fate_source_col in fate_source_cols) {
  fate_source_values <- as.numeric(final_dataset[[fate_source_col]])
  fate_source_values[is.na(fate_source_values)] <- 0
  final_dataset[[fate_source_col]] <- fate_source_values
}

#Group the granular end-of-life fate categories into the broader categories used for all locations
#Panama adjustment: Panama only reported an official recycled plastics number, not a total plastic number. All of their non-recycled debris was landfilled, so for Panama the landfilled total is the estimated total plastic (from the sub-sampling process) minus the actual recycled plastic
final_dataset <- mutate(
  rowwise(final_dataset),
  plastic_fate_recycled = plastic_recycled + plastic_thirdrecycler,
  plastic_fate_downcycled = plastic_downcycled + plastic_solddowncycler + plastic_stockfordowncycle,
  plastic_fate_reused = plastic_reused,
  plastic_fate_energy = plastic_wtoe,
  plastic_fate_landfill_incineration = ifelse(
    country == "Panama",
    plastic_debris - plastic_fate_recycled,
    plastic_landfilled + plastic_incinerated + plastic_cement
  )
)

Trim, error check, and finalize for analysis and visualization

#Keep only the columns needed for analysis, to eliminate redundancies
#The order below fixes the column positions of the trimmed dataset
analysis_cols <- c(
  #Identification
  "month", "year", "country", "organization", "river", "device", "river_id", "date",
  #Totals
  "all_debris", "plastic_debris",
  #Polymers
  "plastic_pet", "plastic_hdpe", "plastic_pvc", "plastic_ldpe", "plastic_pp", "plastic_ps", "plastic_other",
  #Item types
  "plastic_item_bottles", "plastic_item_bags", "plastic_item_wrappers", "plastic_item_foam", "plastic_item_other",
  #End-of-life fates
  "plastic_fate_recycled", "plastic_fate_downcycled", "plastic_fate_reused", "plastic_fate_energy", "plastic_fate_landfill_incineration"
)
final_dataset <- final_dataset %>%
  select(all_of(analysis_cols))

#Error check: sum the seven polymer columns per row; the sum should not exceed the total plastic reported
#pol_sum is the per-row polymer total and pol_plastic is plastic debris minus that total (negative means the polymers exceed the plastic reported)
data_error_check <- mutate(
  rowwise(final_dataset),
  pol_sum = sum(plastic_pet, plastic_hdpe, plastic_pvc, plastic_ldpe, plastic_pp, plastic_ps, plastic_other),
  pol_plastic = plastic_debris - pol_sum
)

#If polymer sums exceeded the plastic reported, the polymer numbers were adjusted where there were logical entry mistakes/errors and likely over-reported totals, based on the way the dataset was collected and collated

#Coerce the columns to numeric before adjusting them (the item type columns hold "N/A" text at this point, which becomes NA)
for (adjusted_col in c(
  "plastic_hdpe",
  "plastic_pvc",
  "plastic_ldpe",
  "plastic_pp",
  "plastic_other",
  "plastic_item_bottles",
  "plastic_item_other"
)) {
  final_dataset[[adjusted_col]] <- as.numeric(final_dataset[[adjusted_col]])
}

#Specific cell replacements to correct for entry and reporting error
#Each correction names its target column instead of using a column number, so the fixes no longer depend on the column order of the trimmed dataset
#Row numbers are still positional and depend on the row order of final_dataset at this point
#Corrections are applied in the order listed
polymer_corrections <- tribble(
  ~row, ~column, ~value,
  70, "plastic_pvc", 0,
  4, "plastic_other", 565,
  6, "plastic_ldpe", 4515.8331,
  7, "plastic_other", 530,
  26, "plastic_other", 406,
  28, "plastic_hdpe", 900,
  29, "plastic_pet", 1150,
  29, "plastic_item_bottles", 1150,
  29, "plastic_item_other", 1020,
  34, "plastic_other", 885.72,
  86, "plastic_other", 600,
  86, "plastic_pp", 500,
  86, "plastic_ldpe", 300,
  86, "plastic_hdpe", 702,
  90, "plastic_ldpe", 402.1729,
  91, "plastic_ldpe", 414.8921,
  92, "plastic_ldpe", 337.3905,
  93, "plastic_ldpe", 406.3353,
  108, "plastic_ldpe", 3.255,
  110, "plastic_other", 330,
  114, "plastic_hdpe", 1000,
  140, "plastic_hdpe", 1194,
  156, "plastic_other", 574,
  156, "plastic_pp", 705,
  158, "plastic_other", 308,
  170, "plastic_hdpe", 498,
  170, "plastic_other", 447,
  172, "plastic_hdpe", 635,
  172, "plastic_pvc", 105,
  172, "plastic_ldpe", 200,
  172, "plastic_pp", 209,
  172, "plastic_other", 680,
  186, "plastic_hdpe", 600,
  186, "plastic_ldpe", 169,
  186, "plastic_pp", 200,
  186, "plastic_other", 397,
  209, "plastic_hdpe", 695,
  209, "plastic_ldpe", 252,
  209, "plastic_pp", 275,
  209, "plastic_other", 448,
  227, "plastic_pvc", 70,
  238, "plastic_other", 0,
  246, "plastic_hdpe", 600,
  286, "plastic_hdpe", 415,
  293, "plastic_ldpe", 115,
  307, "plastic_ldpe", 450,
  307, "plastic_other", 35,
  308, "plastic_hdpe", 535,
  308, "plastic_ldpe", 590,
  308, "plastic_other", 183,
  311, "plastic_ldpe", 2002,
  312, "plastic_ldpe", 955,
  313, "plastic_ldpe", 568,
  314, "plastic_ldpe", 765,
  316, "plastic_hdpe", 250,
  316, "plastic_pvc", 35,
  316, "plastic_ldpe", 274,
  330, "plastic_ldpe", 806,
  355, "plastic_ldpe", 973.15,
  358, "plastic_ldpe", 590.79,
  360, "plastic_ldpe", 764.1,
  360, "plastic_pp", 445.51,
  361, "plastic_ldpe", 1766.742,
  362, "plastic_ldpe", 602.577,
  362, "plastic_hdpe", 675.677,
  377, "plastic_ldpe", 896.3,
  379, "plastic_ldpe", 1158.9,
  381, "plastic_ldpe", 1976.2,
  382, "plastic_ldpe", 1521.8,
  383, "plastic_ldpe", 2151.5,
  384, "plastic_ldpe", 1181.2,
  385, "plastic_ldpe", 1697.3,
  387, "plastic_ldpe", 830.7537736,
  388, "plastic_ldpe", 1247.3986357,
  389, "plastic_ldpe", 1035.8449069,
  390, "plastic_ldpe", 2232.5075589,
  400, "plastic_ldpe", 1661,
  401, "plastic_ldpe", 1273,
  404, "plastic_ldpe", 1137,
  405, "plastic_ldpe", 1180,
  411, "plastic_ldpe", 761.9338158,
  412, "plastic_ldpe", 1300.4708528,
  413, "plastic_ldpe", 1011.2157386,
  414, "plastic_ldpe", 2383.8196653,
  418, "plastic_other", 0,
  426, "plastic_ldpe", 1009,
  427, "plastic_ldpe", 969,
  431, "plastic_pvc", 40,
  437, "plastic_ldpe", 494.177223,
  438, "plastic_ldpe", 1217.2594176,
  439, "plastic_ldpe", 1063.3859517,
  440, "plastic_ldpe", 2342.5797792,
  463, "plastic_ldpe", 2363,
  563, "plastic_other", 0,
  597, "plastic_other", 0,
  726, "plastic_ldpe", 1664,
  737, "plastic_other", 830.6,
  749, "plastic_ldpe", 7.333
)

#Write each corrected value into its cell
for (correction_index in seq_len(nrow(polymer_corrections))) {
  final_dataset[
    polymer_corrections$row[correction_index],
    polymer_corrections$column[correction_index]
  ] <- polymer_corrections$value[correction_index]
}

#Error check repeated after the adjustments: sum the seven polymer columns per row and keep only the rows where that sum exceeds the total plastic reported (pol_plastic below zero)
data_error_check <- final_dataset %>%
  rowwise() %>%
  mutate(
    pol_sum = sum(plastic_pet, plastic_hdpe, plastic_pvc, plastic_ldpe, plastic_pp, plastic_ps, plastic_other),
    pol_plastic = plastic_debris - pol_sum
  ) %>%
  filter(pol_plastic < 0)

#No errors found

#Error check for item types: the "other" category should always be >= 0, otherwise item totals are overreported. Keep only the rows where it is negative
data_error_check <- filter(rowwise(final_dataset), plastic_item_other < 0)

#No errors found

#Error check for end-of-life fates: sum all fate categories per row; the sum should not exceed the total plastic reported
#wm_sum is the per-row fate total and wm_plastic is plastic debris minus that total (negative means the fates exceed the plastic reported)
data_error_check <- final_dataset %>%
  rowwise() %>%
  mutate(
    wm_sum = sum(plastic_fate_recycled, plastic_fate_downcycled, plastic_fate_reused, plastic_fate_energy, plastic_fate_landfill_incineration),
    wm_plastic = plastic_debris - wm_sum
  )

#Specific cell replacement to correct for entry and reporting error
#The target column is named rather than numbered so the fix does not depend on column order; the row number is still positional
final_dataset[6, "plastic_fate_energy"] <- 6107.72

#Error check repeated after the adjustment: keep only the rows where the fate total still exceeds the total plastic reported
data_error_check <- final_dataset %>%
  rowwise() %>%
  mutate(
    wm_sum = sum(plastic_fate_recycled, plastic_fate_downcycled, plastic_fate_reused, plastic_fate_energy, plastic_fate_landfill_incineration),
    wm_plastic = plastic_debris - wm_sum
  ) %>%
  filter(wm_plastic < 0)

#No errors found

#Create a version of the dataset with rounded decimals

#Coerce every weight column to numeric first, so rounding can be applied to all of them (the "N/A" text in the item type columns becomes NA)
weight_cols <- c(
  "all_debris",
  "plastic_debris",
  "plastic_pet",
  "plastic_hdpe",
  "plastic_pvc",
  "plastic_ldpe",
  "plastic_pp",
  "plastic_ps",
  "plastic_other",
  "plastic_item_bottles",
  "plastic_item_bags",
  "plastic_item_wrappers",
  "plastic_item_foam",
  "plastic_item_other",
  "plastic_fate_recycled",
  "plastic_fate_downcycled",
  "plastic_fate_reused",
  "plastic_fate_energy",
  "plastic_fate_landfill_incineration"
)
for (weight_col in weight_cols) {
  final_dataset[[weight_col]] <- as.numeric(final_dataset[[weight_col]])
}

#Round every numeric column to 4 decimal places
#across(where(...)) replaces the superseded mutate_if() and selects the same numeric columns
final_dataset_rounded <- final_dataset %>%
  mutate(across(where(is.numeric), ~ round(.x, 4)))

#The class change to numeric made for the rounded dataset removed the "N/A" text, so it must now be restored for the countries outside the item type analysis

#Set all Jamaica item type categories to "N/A" in both the rounded and the unrounded dataset, as they do not apply to this analysis
for (jamaica_item_col in c(
  "plastic_item_bottles",
  "plastic_item_bags",
  "plastic_item_wrappers",
  "plastic_item_foam",
  "plastic_item_other"
)) {
  final_dataset_rounded[[jamaica_item_col]] <- ifelse(
    final_dataset_rounded$country == "Jamaica",
    "N/A",
    final_dataset_rounded[[jamaica_item_col]]
  )
  final_dataset[[jamaica_item_col]] <- ifelse(
    final_dataset$country == "Jamaica",
    "N/A",
    final_dataset[[jamaica_item_col]]
  )
}

#Make all Mexico item type categories "N/A" as they do not apply to this analysis
final_dataset_rounded$plastic_item_bottles <- ifelse(final_dataset_rounded$country == "Mexico", "N/A", final_dataset_rounded$plastic_item_bottles)
final_dataset_rounded$plastic_item_bags <- ifelse(final_dataset_rounded$country == "Mexico", "N/A", final_dataset_rounded$plastic_item_bags)
final_dataset_rounded$plastic_item_wrappers <- ifelse(final_dataset_rounded$country == "Mexico", "N/A", final_dataset_rounded$plastic_item_wrappers)
final_dataset_rounded$plastic_item_foam <- ifelse(final_dataset_rounded$country == "Mexico", "N/A", final_dataset_rounded$plastic_item_foam)
final_dataset_rounded$plastic_item_other <- ifelse(final_dataset_rounded$country == "Mexico", "N/A", final_dataset_rounded$plastic_item_other)

final_dataset$plastic_item_bottles <- ifelse(final_dataset$country == "Mexico", "N/A", final_dataset$plastic_item_bottles)
final_dataset$plastic_item_bags <- ifelse(final_dataset$country == "Mexico", "N/A", final_dataset$plastic_item_bags)
final_dataset$plastic_item_wrappers <- ifelse(final_dataset$country == "Mexico", "N/A", final_dataset$plastic_item_wrappers)
final_dataset$plastic_item_foam <- ifelse(final_dataset$country == "Mexico", "N/A", final_dataset$plastic_item_foam)
final_dataset$plastic_item_other <- ifelse(final_dataset$country == "Mexico", "N/A", final_dataset$plastic_item_other)

#Make all Thailand item type categories "N/A" as they do not apply to this analysis
final_dataset_rounded$plastic_item_bottles <- ifelse(final_dataset_rounded$country == "Thailand", "N/A", final_dataset_rounded$plastic_item_bottles)
final_dataset_rounded$plastic_item_bags <- ifelse(final_dataset_rounded$country == "Thailand", "N/A", final_dataset_rounded$plastic_item_bags)
final_dataset_rounded$plastic_item_wrappers <- ifelse(final_dataset_rounded$country == "Thailand", "N/A", final_dataset_rounded$plastic_item_wrappers)
final_dataset_rounded$plastic_item_foam <- ifelse(final_dataset_rounded$country == "Thailand", "N/A", final_dataset_rounded$plastic_item_foam)
final_dataset_rounded$plastic_item_other <- ifelse(final_dataset_rounded$country == "Thailand", "N/A", final_dataset_rounded$plastic_item_other)

final_dataset$plastic_item_bottles <- ifelse(final_dataset$country == "Thailand", "N/A", final_dataset$plastic_item_bottles)
final_dataset$plastic_item_bags <- ifelse(final_dataset$country == "Thailand", "N/A", final_dataset$plastic_item_bags)
final_dataset$plastic_item_wrappers <- ifelse(final_dataset$country == "Thailand", "N/A", final_dataset$plastic_item_wrappers)
final_dataset$plastic_item_foam <- ifelse(final_dataset$country == "Thailand", "N/A", final_dataset$plastic_item_foam)
final_dataset$plastic_item_other <- ifelse(final_dataset$country == "Thailand", "N/A", final_dataset$plastic_item_other)

#Make all Panama item type categories "N/A" as they do not apply to this analysis
final_dataset$plastic_item_bottles <- ifelse(final_dataset$country == "Panama", "N/A", final_dataset$plastic_item_bottles)
final_dataset$plastic_item_bags <- ifelse(final_dataset$country == "Panama", "N/A", final_dataset$plastic_item_bags)
final_dataset$plastic_item_wrappers <- ifelse(final_dataset$country == "Panama", "N/A", final_dataset$plastic_item_wrappers)
final_dataset$plastic_item_foam <- ifelse(final_dataset$country == "Panama", "N/A", final_dataset$plastic_item_foam)
final_dataset$plastic_item_other <- ifelse(final_dataset$country == "Panama", "N/A", final_dataset$plastic_item_other)

final_dataset_rounded$plastic_item_bottles <- ifelse(final_dataset_rounded$country == "Panama", "N/A", final_dataset_rounded$plastic_item_bottles)
final_dataset_rounded$plastic_item_bags <- ifelse(final_dataset_rounded$country == "Panama", "N/A", final_dataset_rounded$plastic_item_bags)
final_dataset_rounded$plastic_item_wrappers <- ifelse(final_dataset_rounded$country == "Panama", "N/A", final_dataset_rounded$plastic_item_wrappers)
final_dataset_rounded$plastic_item_foam <- ifelse(final_dataset_rounded$country == "Panama", "N/A", final_dataset_rounded$plastic_item_foam)
final_dataset_rounded$plastic_item_other <- ifelse(final_dataset_rounded$country == "Panama", "N/A", final_dataset_rounded$plastic_item_other)

#Convert the date column of both datasets to the year-month class "yearmon" (zoo)
final_dataset[["date"]] <- as.yearmon(final_dataset[["date"]])
final_dataset_rounded[["date"]] <- as.yearmon(final_dataset_rounded[["date"]])

Resulting Datasets

final_dataset_raw.csv
final_dataset.csv
final_dataset_rounded.csv

Subset Full Analysis-Ready Dataset (final_dataset.csv) to Figure-Specific Datasets

As a subset of final_dataset.csv, create:

final_dataset_fig2.csv
final_dataset_fig3.csv
final_dataset_fig4.csv
final_dataset_fig5.csv

Subset data for figures 2-5

#Read the analysis-ready dataset
final_dataset <- read_csv("final_dataset.csv")

#Figure 2. Total Debris & Plastic Debris Collected

#1. Keep the identifying columns and the two debris masses
#2. Drop the three Jamaica excavation devices (unsorted debris) and keep only rows that report plastic debris
#3. Set date to class "yearmon" and make sure both mass columns are numeric
#4. Collapse to one row per month and location (instead of one per device); device_months counts the distinct devices behind each row
final_dataset_fig2 <- final_dataset %>%
  select(country, organization, river_id, date, device, all_debris, plastic_debris) %>%
  filter(
    device != "TOC-EXC-011",
    device != "TOC-EXC-008",
    device != "TOC-EXC-010",
    plastic_debris > 0
  ) %>%
  mutate(
    date = as.yearmon(date),
    all_debris = as.numeric(all_debris),
    plastic_debris = as.numeric(plastic_debris)
  ) %>%
  group_by(date, river_id, organization, country) %>%
  summarise(
    all_debris = sum(all_debris, na.rm = TRUE),
    plastic_debris = sum(plastic_debris, na.rm = TRUE),
    device_months = length(unique(device))
  )

#Figure 3. Polymer Composition of Plastic Debris Collected

#Keep the identifying columns and the seven polymer masses, drop the Jamaica excavations (unsorted debris) and the unsorted Indonesia collections, set date to class "yearmon", and make sure the polymer columns are numeric
final_dataset_fig3 <- final_dataset %>%
  select(
    date, river_id, organization, country, device,
    plastic_pet, plastic_hdpe, plastic_pvc, plastic_ldpe, plastic_pp, plastic_ps, plastic_other
  ) %>%
  filter(
    device != "TOC-EXC-011",
    device != "TOC-EXC-008",
    device != "TOC-EXC-010",
    device != "CRGF-HAND-1"
  ) %>%
  mutate(
    date = as.yearmon(date),
    across(plastic_pet:plastic_other, as.numeric)
  )

#Add a polymer total per row so the later proportions add up exactly to 1 (the original plastic_debris column differs very slightly from the sum of the polymers - all on the order of < 1) and drop rows without any polymer data
#Then sum every polymer column and the total so there is one entry per month per location (not multiple for multiple devices)
final_dataset_fig3 <- final_dataset_fig3 %>%
  mutate(
    plastic_debris_polymers_total = plastic_pet + plastic_hdpe + plastic_pvc + plastic_ldpe + plastic_pp + plastic_ps + plastic_other
  ) %>%
  filter(plastic_debris_polymers_total > 0) %>%
  group_by(date, river_id, organization, country) %>%
  summarise(
    across(plastic_pet:plastic_debris_polymers_total, ~ sum(.x, na.rm = TRUE))
  )

#Figure 4. Proportion of Single-Use Plastic Item Categories in Plastic Debris Collected

#Keep the identifying columns and the five item type masses for the 4 locations in this analysis (Ecuador, Indonesia, Kenya, Vietnam), drop the unsorted Indonesia collections, set date to class "yearmon", and make sure the item columns are numeric
final_dataset_fig4 <- final_dataset %>%
  select(
    date, river_id, organization, country, device,
    plastic_item_bottles, plastic_item_bags, plastic_item_wrappers, plastic_item_foam, plastic_item_other
  ) %>%
  filter(
    country %in% c("Vietnam", "Indonesia", "Kenya", "Ecuador"),
    device != "CRGF-HAND-1"
  ) %>%
  mutate(
    date = as.yearmon(date),
    across(plastic_item_bottles:plastic_item_other, as.numeric)
  )

#Add an item total per row so the later proportions add up exactly to 1 (the original plastic_debris column differs very slightly from the sum of the items - all on the order of < 1) and drop rows without any item data
#Then sum every item column and the total so there is one entry per month per location (not multiple for multiple devices)
final_dataset_fig4 <- final_dataset_fig4 %>%
  mutate(
    plastic_debris_items_total = plastic_item_bottles + plastic_item_bags + plastic_item_wrappers + plastic_item_foam + plastic_item_other
  ) %>%
  filter(plastic_debris_items_total > 0) %>%
  group_by(date, river_id, organization, country) %>%
  summarise(
    across(plastic_item_bottles:plastic_debris_items_total, ~ sum(.x, na.rm = TRUE))
  )

#Figure 5. End-of-Life Fate of Plastic Debris Collected

#Keep the identifying columns and the five end-of-life fate masses, set date to class "yearmon", and make sure the fate columns are numeric
final_dataset_fig5 <- final_dataset %>%
  select(
    date, river_id, organization, country, device,
    plastic_fate_recycled, plastic_fate_downcycled, plastic_fate_reused, plastic_fate_energy, plastic_fate_landfill_incineration
  ) %>%
  mutate(
    date = as.yearmon(date),
    across(plastic_fate_recycled:plastic_fate_landfill_incineration, as.numeric)
  )

#Add a fate total per row so the later proportions add up exactly to 1 (the original plastic_debris column differs very slightly from the sum of the end of life fates - all on the order of < 1) and drop rows without any fate data
#Then sum every fate column and the total so there is one entry per month per location (not multiple for multiple devices)
final_dataset_fig5 <- final_dataset_fig5 %>%
  mutate(
    plastic_debris_fate_total = plastic_fate_recycled + plastic_fate_downcycled + plastic_fate_reused + plastic_fate_energy + plastic_fate_landfill_incineration
  ) %>%
  filter(plastic_debris_fate_total > 0) %>%
  group_by(date, river_id, organization, country) %>%
  summarise(
    across(plastic_fate_recycled:plastic_debris_fate_total, ~ sum(.x, na.rm = TRUE))
  )

Resulting Datasets

final_dataset_fig2.csv
final_dataset_fig3.csv
final_dataset_fig4.csv
final_dataset_fig5.csv

Create Additional Figure-Specific Datasets that Include Error Calculations in Correct Format for Summary Analyses, Statistics Testing, and Visualizations

final_dataset_fig2.csv -> fig2_data.csv
final_dataset_fig3.csv -> fig3_data.csv
final_dataset_fig4.csv -> fig4_data.csv
final_dataset_fig5.csv -> fig5_data.csv

Figure 2. Total Debris & Plastic Debris Collected

#Figure 2. Begin with final_dataset_fig2.csv

#Reshape so each location and month has one row for all debris and one for plastic debris, then summarise per river and debris type:
#total mass, mean monthly mass, effort (number of distinct months), summed device counts, variance, standard deviation, and standard error
fig2_data <- final_dataset_fig2 %>%
  pivot_longer(
    cols = c(all_debris, plastic_debris),
    names_to = "debris_type",
    values_to = "mass_kg"
  ) %>%
  group_by(river_id, debris_type, organization, country) %>%
  summarise(
    total_kg = sum(mass_kg, na.rm = TRUE),
    mean_kg = mean(mass_kg, na.rm = TRUE),
    effort_months = length(unique(date)),
    device_months = sum(device_months, na.rm = TRUE),
    var_kg = var(mass_kg, na.rm = TRUE),
    sd_kg = sd(mass_kg, na.rm = TRUE),
    #The standard error uses the summed device_months defined just above as its sample size
    se_kg = sd_kg / sqrt(device_months)
  )

#Critical t value for a two-sided 95% confidence interval, one per row of fig2_data, with device_months - 1 degrees of freedom (despite its name, ciFun is a numeric vector and not a function)
ciFun <- qt(0.975, df = fig2_data$device_months - 1)

#95% confidence interval = standard error multiplied by the critical value
fig2_data <- ungroup(fig2_data)
fig2_data$ci95_kg <- fig2_data$se_kg * ciFun

#Add outside variables to the dataset for each study site:
#  urban_population  | nearest urban population
#  tech_rank         | categorical ranking of technology types
#  river_width_m     | average reported width of river at collection locations
#  river_length      | length of river
#  waste_picker_rank | categorical ranking of waste picker activity at each location
#Seven countries are matched by name; every other country gets the final (TRUE) value, and a missing country stays NA
fig2_data <- fig2_data %>%
  rowwise() %>%
  mutate(
    urban_population = case_when(
      is.na(country) ~ NA_real_,
      country == "Kenya" ~ 4734881,
      country == "Thailand" ~ 10539415,
      country == "Mexico" ~ 2140398,
      country == "Ecuador" ~ 275421,
      country == "Vietnam" ~ 236294,
      country == "Panama" ~ 1860291,
      country == "Indonesia" ~ 2580191,
      TRUE ~ 590940
    ),
    tech_rank = case_when(
      is.na(country) ~ NA_real_,
      country == "Kenya" ~ 1,
      country == "Thailand" ~ 2,
      country == "Mexico" ~ 1,
      country == "Ecuador" ~ 3,
      country == "Vietnam" ~ 2,
      country == "Panama" ~ 3,
      country == "Indonesia" ~ 2,
      TRUE ~ 2
    ),
    river_width_m = case_when(
      is.na(country) ~ NA_real_,
      country == "Kenya" ~ 6,
      country == "Thailand" ~ 20,
      country == "Mexico" ~ 7,
      country == "Ecuador" ~ 19,
      country == "Vietnam" ~ 800,
      country == "Panama" ~ 34,
      country == "Indonesia" ~ 319,
      TRUE ~ 20
    ),
    river_length = case_when(
      is.na(country) ~ NA_real_,
      country == "Kenya" ~ 390000,
      country == "Thailand" ~ 370000,
      country == "Mexico" ~ 190000,
      country == "Ecuador" ~ 100000,
      country == "Vietnam" ~ 1149000,
      country == "Panama" ~ 28000,
      country == "Indonesia" ~ 297000,
      TRUE ~ 9000
    ),
    waste_picker_rank = case_when(
      is.na(country) ~ NA_real_,
      country == "Kenya" ~ 1,
      country == "Thailand" ~ 2,
      country == "Mexico" ~ 2,
      country == "Ecuador" ~ 3,
      country == "Vietnam" ~ 3,
      country == "Panama" ~ 1,
      country == "Indonesia" ~ 3,
      TRUE ~ 1
    )
  )

#Keep only the necessary columns (organization is dropped) in their final order
fig2_data <- fig2_data[, c(
  "river_id", "country", "debris_type",
  "effort_months", "device_months",
  "urban_population", "tech_rank", "waste_picker_rank", "river_width_m", "river_length",
  "total_kg", "mean_kg", "var_kg", "sd_kg", "se_kg", "ci95_kg"
)]

#Panama plastic debris is calculated as a sub-proportion of total debris based on sub-sampling, so its error has to come from the sub-sampling and not from the plastic debris data as it currently stands in fig2_data. The steps below determine the var, sd, se, and ci95 for the sub-sampling efforts, take the plastic:debris ratio of each of these values, and apply that ratio to the error of the total debris actually collected to derive the same error variables for the total plastic debris.

#Read the sub-sample data
sub_sample_data <- read_csv("subsample_data.csv")

#Panama rows only, restricted to the first five columns (the ones needed for total debris and plastic debris)
panama_sub <- sub_sample_data %>%
  filter(country == "Panama") %>%
  select(1:5)

#Pivot to long format (one row per sample and debris type) and calculate effort for the future SE calculation
#effort is each mass divided by the mass recorded for "sample 1"
#NOTE(review): the "sample 1" masses are not grouped by debris_type, so this division relies on R recycling them down the rows in the order pivot_longer() writes them (presumably total_debris, total_plastic, total_debris, ...) - confirm that subsample_data.csv holds exactly one "sample 1" row for Panama
pan_sub <- panama_sub %>%
  pivot_longer(
    cols = total_debris:total_plastic,
    names_to = "debris_type",
    values_to = "mass_kg"
  ) %>%
  mutate(
    effort = mass_kg / mass_kg[sample_id == "sample 1"]
  )

#Summarise per debris type: total mass, mean mass, summed effort, number of distinct sample dates, variance, standard deviation, and standard error
#se_alt (and ci95_alt below) divide by the number of samples instead of the effort; this alternate is included but was not used
pan_sub <- pan_sub %>%
  group_by(debris_type) %>%
  summarise(
    total_kg = sum(mass_kg, na.rm = TRUE),
    mean_kg = mean(mass_kg, na.rm = TRUE),
    effort = sum(effort),
    samples = length(unique(date)),
    var_kg = var(mass_kg, na.rm = TRUE),
    sd_kg = sd(mass_kg, na.rm = TRUE),
    se_kg = sd_kg / sqrt(effort),
    se_alt = sd_kg / sqrt(samples)
  )

#Critical t values for a two-sided 95% confidence interval, one per row of pan_sub (numeric vectors, not functions)
#ciFun uses effort - 1 degrees of freedom; ciFun2 is the unused alternate with samples - 1 degrees of freedom
ciFun <- qt(0.975, df = pan_sub$effort - 1)
ciFun2 <- qt(0.975, df = pan_sub$samples - 1)

#95% confidence intervals = standard error multiplied by the matching critical value
pan_sub <- ungroup(pan_sub)
pan_sub$ci95_kg <- pan_sub$se_kg * ciFun
pan_sub$ci95_alt <- pan_sub$se_alt * ciFun2

#Take the Panama rows from fig2_data and add the columns pan_sub has so the two can be merged:
#the alternate error columns are copies of the regular ones, and effort is looked up from pan_sub for the matching debris type
pan_fig2 <- fig2_data %>%
  filter(country == "Panama") %>%
  mutate(
    ci95_alt = ci95_kg,
    se_alt = se_kg,
    effort = ifelse(
      debris_type == "all_debris",
      pan_sub$effort[pan_sub$debris_type == "total_debris"],
      pan_sub$effort[pan_sub$debris_type == "total_plastic"]
    )
  )

#Keep the same ten columns in both data frames for the merge (rbind matches data frame columns by name)
pan_fig2 <- pan_fig2[, c("debris_type", "total_kg", "mean_kg", "effort", "var_kg", "sd_kg", "se_kg", "se_alt", "ci95_kg", "ci95_alt")]
pan_sub <- pan_sub[, c("debris_type", "total_kg", "mean_kg", "effort", "var_kg", "sd_kg", "se_kg", "ci95_kg", "se_alt", "ci95_alt")]

#Merge: the sub-sample rows come first (rows 1-2), followed by the fig2_data rows (rows 3-4)
pan_err <- rbind(pan_sub, pan_fig2)

#Rename the two sub-sample debris types so they match the labels used in fig2_data
pan_err$debris_type[1] <- "all_debris"
pan_err$debris_type[2] <- "plastic_debris"

#ID column (column 11): "sub" for the sub-sample rows, "all" for the rows from fig2_data
pan_err$id <- NA_character_
pan_err$id[c(1, 2)] <- "sub"
pan_err$id[c(3, 4)] <- "all"

#Within each id ("sub" and "all"), divide every statistic by the value of the all_debris row of that id; for the sub-sample this gives the plastic:debris ratio of each error measure
pan_err <- pan_err %>%
  group_by(id) %>%
  mutate(
    total_ratio = total_kg / total_kg[debris_type == "all_debris"],
    mean_ratio = mean_kg / mean_kg[debris_type == "all_debris"],
    var_ratio = var_kg / var_kg[debris_type == "all_debris"],
    sd_ratio = sd_kg / sd_kg[debris_type == "all_debris"],
    se_ratio = se_kg / se_kg[debris_type == "all_debris"],
    ci95_ratio = ci95_kg / ci95_kg[debris_type == "all_debris"],
    se_alt_ratio = se_alt / se_alt[debris_type == "all_debris"],
    ci95_alt_ratio = ci95_alt / ci95_alt[debris_type == "all_debris"]
  )

#Only row 2 (sub-sample plastic numbers) and row 3 (total debris numbers from fig2_data) are needed
pan_err <- pan_err[c(2, 3), ]

#Print the class of the id column
class(pan_err$id)

#Calculated error for Panama plastic debris: the plastic:debris ratio from the sub-sample row ("sub") multiplied by the matching error of the total debris row ("all")
from_sub <- pan_err$id == "sub"
from_all <- pan_err$id == "all"

panama_error <- data.frame(
  river_id = "Juan Díaz",
  country = "Panama",
  debris_type = "plastic_debris",
  var_kg = pan_err$var_ratio[from_sub] * pan_err$var_kg[from_all],
  sd_kg = pan_err$sd_ratio[from_sub] * pan_err$sd_kg[from_all],
  se_kg = pan_err$se_ratio[from_sub] * pan_err$se_kg[from_all],
  ci95_kg = pan_err$ci95_ratio[from_sub] * pan_err$ci95_kg[from_all]
)

#Overwrite the four error columns of the Panama plastic debris row(s) in fig2_data with the calculated values
panama_plastic_rows <- fig2_data$country == "Panama" & fig2_data$debris_type == "plastic_debris"
for (error_column in c("var_kg", "sd_kg", "se_kg", "ci95_kg")) {
  fig2_data[[error_column]][panama_plastic_rows] <- panama_error[[error_column]]
}

Figure 3. Polymer Composition of Plastic Debris Collected

#Figure 3. Begin with final_dataset_fig3.csv

#Reshape so each location and month has one row per polymer (the summed polymer total is kept as its own "polymer" for the calculations below)
fig3_data <- pivot_longer(
  final_dataset_fig3,
  cols = plastic_pet:plastic_debris_polymers_total,
  names_to = "polymer",
  values_to = "mass_kg"
)

#Set column classes: numeric mass, factor polymer, yearmon date
fig3_data$mass_kg <- as.numeric(fig3_data$mass_kg)
fig3_data$polymer <- as.factor(fig3_data$polymer)
fig3_data$date <- as.yearmon(fig3_data$date)

#Total mass of each polymer at each location, then per river:
#  polymer_proportions | total of that polymer / total of all plastic (total polymer/total plastic and mean monthly polymer/mean total plastic give exactly the same proportions)
#  trials              | total plastic weight for that location
fig3_data <- fig3_data %>%
  group_by(river_id, organization, country, polymer) %>%
  summarise(polymer_total_mass = sum(mass_kg, na.rm = TRUE)) %>%
  group_by(river_id) %>%
  mutate(
    polymer_proportions = polymer_total_mass / polymer_total_mass[polymer == "plastic_debris_polymers_total"],
    trials = polymer_total_mass[polymer == "plastic_debris_polymers_total"]
  )

#95% confidence intervals as Wilson score intervals, calculated row by row: a kg is a "trial", so the total kg of a polymer is the number of successes and the total plastic kg of the location is the number of trials
#Both inputs are made numeric first; element 1 of the wilson.ci() result is stored as the lower bound and element 2 as the upper bound
fig3_data <- fig3_data %>%
  ungroup() %>%
  mutate(across(c(polymer_total_mass, trials), as.numeric)) %>%
  rowwise() %>%
  mutate(
    wilson_lower = wilson.ci(polymer_total_mass, n = trials, conf.level = 0.95)[1],
    wilson_upper = wilson.ci(polymer_total_mass, n = trials, conf.level = 0.95)[2]
  )

#Keep only the columns needed (organization and trials are dropped)
fig3_data <- fig3_data[, c("river_id", "country", "polymer", "polymer_total_mass", "polymer_proportions", "wilson_lower", "wilson_upper")]

#Drop the summed total rows, which were only needed for the calculations above
fig3_data <- filter(fig3_data, polymer != "plastic_debris_polymers_total")

#When some plastic is reported as 0, the wilson.ci formula still returns an interval, which isn't meaningful, so both bounds are set to 0 for those rows
#condition is a logical flag (TRUE where the polymer mass is 0) and is used directly as the row index
#Matching the masses against it with %in% would coerce TRUE/FALSE to 1/0, which also hits masses of exactly 1 and misses the zeros when every mass is 0
condition <- fig3_data$polymer_total_mass == 0

#which() keeps only the TRUE positions, so a missing mass is left untouched
fig3_data$wilson_lower <- replace(fig3_data$wilson_lower, which(condition), 0)
fig3_data$wilson_upper <- replace(fig3_data$wilson_upper, which(condition), 0)

#Add additional variables to dataset
#waste_picker_rank | categorical ranking of waste picker activity at each location
#Seven countries are matched by name; every other country gets the final (TRUE) value, and a missing country stays NA
fig3_data <- fig3_data %>%
  rowwise() %>%
  mutate(
    waste_picker_rank = case_when(
      is.na(country) ~ NA_real_,
      country == "Kenya" ~ 1,
      country == "Thailand" ~ 2,
      country == "Mexico" ~ 2,
      country == "Ecuador" ~ 3,
      country == "Vietnam" ~ 3,
      country == "Panama" ~ 1,
      country == "Indonesia" ~ 3,
      TRUE ~ 1
    )
  )

#Reorder so waste_picker_rank follows the identifying columns
fig3_data <- fig3_data[, c("river_id", "country", "polymer", "waste_picker_rank", "polymer_total_mass", "polymer_proportions", "wilson_lower", "wilson_upper")]

#Calculate Wilson score intervals for Thailand and Panama (sub-sampling only for Panama, sub-sampling + recycled plastic for Thailand)

#Thailand

#Read the raw, sub-sample, and analysis-ready datasets
final_dataset_raw <- read_csv("final_dataset_raw.csv")
sub_sample_data <- read_csv("subsample_data.csv")
final_dataset <- read_csv("final_dataset.csv")

#Raw Thailand rows, with the polymer, fate, and clean plastic debris columns made numeric
thailand_raw_numeric_columns <- c(
  "pvc_plastic", "pet_plastic", "hdpe_plastic", "ldpe_plastic", "pp_plastic", "ps_plastic", "polymerother_plastic",
  "plastic_recycled", "plastic_wtoe", "plastic_debris_clean"
)
thailand_raw <- final_dataset_raw %>%
  filter(country == "Thailand") %>%
  mutate(across(all_of(thailand_raw_numeric_columns), as.numeric))

#One-row summary of the raw Thailand data: sums of the polymers and fates, renamed to the analysis-ready column names; plastic_all is the sum of plastic_debris_clean
thailand_sum <- summarise(
  thailand_raw,
  plastic_pet = sum(pet_plastic, na.rm = TRUE),
  plastic_hdpe = sum(hdpe_plastic, na.rm = TRUE),
  plastic_pvc = sum(pvc_plastic, na.rm = TRUE),
  plastic_ldpe = sum(ldpe_plastic, na.rm = TRUE),
  plastic_pp = sum(pp_plastic, na.rm = TRUE),
  plastic_ps = sum(ps_plastic, na.rm = TRUE),
  plastic_other = sum(polymerother_plastic, na.rm = TRUE),
  plastic_recycled = sum(plastic_recycled, na.rm = TRUE),
  plastic_wtoe = sum(plastic_wtoe, na.rm = TRUE),
  plastic_all = sum(plastic_debris_clean, na.rm = TRUE)
)

#One-row summary of the fixed/adjusted Thailand data (from final_dataset): sums of the polymers and fates, plus the polymer sum column and the id "all"
thailand_sumfix <- final_dataset %>%
  filter(country == "Thailand") %>%
  summarise(
    plastic_pet = sum(plastic_pet, na.rm = TRUE),
    plastic_hdpe = sum(plastic_hdpe, na.rm = TRUE),
    plastic_pvc = sum(plastic_pvc, na.rm = TRUE),
    plastic_ldpe = sum(plastic_ldpe, na.rm = TRUE),
    plastic_pp = sum(plastic_pp, na.rm = TRUE),
    plastic_ps = sum(plastic_ps, na.rm = TRUE),
    plastic_other = sum(plastic_other, na.rm = TRUE),
    plastic_recycled = sum(plastic_fate_recycled, na.rm = TRUE),
    plastic_wtoe = sum(plastic_fate_energy, na.rm = TRUE),
    plastic_all = sum(plastic_debris, na.rm = TRUE)
  ) %>%
  mutate(
    plastic_debris_polymers_total = plastic_pet + plastic_hdpe + plastic_pvc + plastic_ldpe + plastic_pp + plastic_ps + plastic_other,
    id = "all"
  )

#Add the same polymer sum column to the raw Thailand summary and give it the id "recycle"
thailand_sum <- thailand_sum %>%
  mutate(
    plastic_debris_polymers_total = plastic_pet + plastic_hdpe + plastic_pvc + plastic_ldpe + plastic_pp + plastic_ps + plastic_other,
    id = "recycle"
  )

#Thailand rows of the sub-sample data
thai_sample <- filter(sub_sample_data, country == "Thailand")

#One-row summary of the Thailand sub-sample: sums of the seven polymers
#The fate columns are copied from thailand_sum and plastic_all from thailand_sumfix so the columns match for the later merge; the polymer sum column and the id "sample" are added last
thai_sum <- thai_sample %>%
  summarise(
    across(
      c(plastic_pet, plastic_hdpe, plastic_pvc, plastic_ldpe, plastic_pp, plastic_ps, plastic_other),
      ~ sum(.x, na.rm = TRUE)
    )
  ) %>%
  mutate(
    plastic_recycled = thailand_sum$plastic_recycled,
    plastic_wtoe = thailand_sum$plastic_wtoe,
    plastic_all = thailand_sumfix$plastic_all,
    plastic_debris_polymers_total = plastic_pet + plastic_hdpe + plastic_pvc + plastic_ldpe + plastic_pp + plastic_ps + plastic_other,
    id = "sample"
  )

#Bind the three Thailand polymer sum datasets (ids "sample", "recycle", "all"), add the trials column for the Wilson score intervals, and put the id and summary columns in front of the polymer columns
#Trials for the sub-sample is the total sub-sample weight, but trials for the recycled plastic is the total weight of plastic debris (not just the total recycled weight), because the recycled plastic is the successes out of the full plastic weight collected and sampled
#NOTE(review): 349476.21 is hard-coded; presumably it is that total plastic debris weight - confirm against the data
thai_err <- rbind(thai_sum, thailand_sum, thailand_sumfix) %>%
  mutate(
    trials = ifelse(id == "recycle", 349476.21, plastic_debris_polymers_total)
  ) %>%
  select(
    id, plastic_recycled, plastic_wtoe, plastic_all, plastic_debris_polymers_total, trials,
    plastic_pet, plastic_hdpe, plastic_pvc, plastic_ldpe, plastic_pp, plastic_ps, plastic_other
  )

#Reshape to one row per id and polymer for the Wilson score interval calculations, with polymer as a factor
thailand_error <- thai_err %>%
  pivot_longer(
    cols = plastic_pet:plastic_other,
    names_to = "polymer",
    values_to = "mass_kg"
  ) %>%
  mutate(polymer = as.factor(polymer))

#New data frame to work with, built from thailand_error (one row per id and polymer) in these steps:
#1. Per polymer, mutate the fate weights: rec is the polymer total of the "recycle" row, all is the polymer total of the "all" row, and wtoe is all minus rec
#2. Mutate the proportions of plastic recycled (rec_prop) and plastic sent to waste to energy (wtoe_prop) out of all
#3. Mutate trials_new: "sample" rows get their polymer total weighted by wtoe_prop plus rec weighted by rec_prop; every other row takes the trials value of the "all" rows
#4. Filter out non relevant rows: "sample" rows are kept only for LDPE, PS and other polymers; "recycle" and "all" rows are always kept
#5. Per polymer, mutate success: "sample" rows combine their own mass weighted by wtoe_prop with the "recycle" mass weighted by rec_prop (i.e., actual recycled and sub-sampled kgs); other rows keep mass_kg
#6. Mutate prop as success divided by trials_new
#The result is still grouped by polymer when the pipeline ends
thailand_error_new <- thailand_error %>%
  group_by(polymer) %>% 
  mutate(
    rec = plastic_debris_polymers_total[id == "recycle"],
    wtoe = plastic_debris_polymers_total[id == "all"] - plastic_debris_polymers_total[id == "recycle"],
    all = plastic_debris_polymers_total[id == "all"]
  ) %>% 
  #The parenthesised (ungroup) is evaluated to the function and then applied by the pipe, i.e. it behaves like ungroup()
  (ungroup) %>% 
  mutate(
    rec_prop = rec/all,
    wtoe_prop = wtoe/all
  ) %>% 
  #The data are ungrouped here, so trials[id == "all"] returns one value per "all" row (not a single value) and ifelse() recycles it across all rows
  #NOTE(review): this only gives the intended result if trials is identical on every "all" row - confirm
  mutate(
    trials_new = ifelse(id == "sample", ((plastic_debris_polymers_total*wtoe_prop) + (rec*rec_prop)), trials[id == "all"])
  ) %>% 
  filter(
    ifelse(id == "sample", polymer == "plastic_ldpe" | polymer == "plastic_ps" | polymer == "plastic_other", id == "recycle" | id == "all")) %>% 
  #Regroup so that mass_kg[id == "recycle"] picks the recycled mass of the same polymer
  group_by(polymer) %>% 
  mutate(
    success = ifelse(id == "sample", mass_kg*wtoe_prop + mass_kg[id == "recycle"]*rec_prop, mass_kg)) %>% 
  mutate(
    prop = success/trials_new
  )

#Make sure necessary columns are numeric before they are passed to the Wilson score calculation
thailand_error_new$trials_new <- as.numeric(thailand_error_new$trials_new)
thailand_error_new$success <- as.numeric(thailand_error_new$success)

#Wilson 95% confidence intervals for Thailand, in one pipeline:
# - row by row, run the Wilson test with the proportionate successes and trials (wt_low, wt_up),
#   coerce the bounds and prop to plain numerics, and express each bound as a ratio to prop
# - per polymer, take the final proportion from the "all" row (prop_act) and apply the ratios to it
#   to obtain the final interval (wilson_lower, wilson_upper)
#The result is left grouped by polymer
thailand_error_new <- thailand_error_new %>%
  ungroup() %>%
  rowwise() %>%
  mutate(
    wt_low = as.numeric(wilson.ci(success, n = trials_new, conf.level = 0.95)[1]),
    wt_up = as.numeric(wilson.ci(success, n = trials_new, conf.level = 0.95)[2]),
    prop = as.numeric(prop),
    wt_low_rat = wt_low / prop,
    wt_up_rat = wt_up / prop
  ) %>%
  group_by(polymer) %>%
  mutate(
    prop_act = prop[id == "all"],
    wilson_lower = wt_low_rat * prop_act,
    wilson_upper = wt_up_rat * prop_act
  )

#Only the sub-sample rows carry the intervals that are reported
w_thai <- filter(thailand_error_new, id == "sample")

#Send Thailand error calculations to fig3_data, one sub-sampled polymer at a time
for (thai_polymer in c("plastic_ldpe", "plastic_ps", "plastic_other")) {
  #Target rows in fig3_data and source rows in w_thai for this polymer
  thai_rows <- fig3_data$country == "Thailand" & fig3_data$polymer == thai_polymer
  thai_src <- w_thai$polymer == thai_polymer

  fig3_data$wilson_lower[thai_rows] <- w_thai$wilson_lower[thai_src]
  fig3_data$wilson_upper[thai_rows] <- w_thai$wilson_upper[thai_src]
}

#Drop the loop helpers so no extra objects are left in the workspace
rm(thai_polymer, thai_rows, thai_src)

#Panama

#Filter Panama data from sub-sample data frame
pan_sample <- sub_sample_data %>%
  filter(
    country == "Panama"
  )

#Summarize Panama sub-data to one row: summed weight of each polymer, of total debris and of total plastic
#(missing weights are ignored), then add the summed weight of all polymers
#Fix: the plastic sum previously passed `na.rm. = TRUE` (trailing dot). That name does not match sum()'s
#na.rm argument, so TRUE was added to the sum as an extra value (+1) and missing values were not removed
pan_sum <- pan_sample %>%
  summarise(
    plastic_pet = sum(plastic_pet, na.rm = TRUE),
    plastic_hdpe = sum(plastic_hdpe, na.rm = TRUE),
    plastic_pvc = sum(plastic_pvc, na.rm = TRUE),
    plastic_ldpe = sum(plastic_ldpe, na.rm = TRUE),
    plastic_pp = sum(plastic_pp, na.rm = TRUE),
    plastic_ps = sum(plastic_ps, na.rm = TRUE),
    plastic_other = sum(plastic_other, na.rm = TRUE),
    debris = sum(total_debris, na.rm = TRUE),
    plastic = sum(total_plastic, na.rm = TRUE)
  ) %>%
  mutate(
    plastic_debris_polymers_total = plastic_pet + plastic_hdpe + plastic_pvc + plastic_ldpe + plastic_pp + plastic_ps + plastic_other
  )

#Panama error calculations in one pipeline:
# - pivot the summary to long format, one row per polymer
# - row by row, calculate the proportion of each polymer out of the total sub-sampled polymer weight
#   and the Wilson 95% bounds using kg of the sub-sampled polymer (successes) and the total
#   sub-sampled polymer weight (trials)
pan_error <- pan_sum %>%
  pivot_longer(
    cols = plastic_pet:plastic_other,
    names_to = "polymer",
    values_to = "mass_kg"
  ) %>%
  ungroup() %>%
  rowwise() %>%
  mutate(
    prop = mass_kg / plastic_debris_polymers_total,
    wilson_lower = wilson.ci(mass_kg, n = plastic_debris_polymers_total, conf.level = 0.95)[1],
    wilson_upper = wilson.ci(mass_kg, n = plastic_debris_polymers_total, conf.level = 0.95)[2]
  )

#Send Panama error calculations to fig3_data, one polymer at a time
for (pan_polymer in c("plastic_pet", "plastic_hdpe", "plastic_pvc", "plastic_ldpe",
                      "plastic_ps", "plastic_pp", "plastic_other")) {
  #Target rows in fig3_data and source rows in pan_error for this polymer
  pan_rows <- fig3_data$country == "Panama" & fig3_data$polymer == pan_polymer
  pan_src <- pan_error$polymer == pan_polymer

  fig3_data$wilson_lower[pan_rows] <- pan_error$wilson_lower[pan_src]
  fig3_data$wilson_upper[pan_rows] <- pan_error$wilson_upper[pan_src]
}

#Drop the loop helpers so no extra objects are left in the workspace
rm(pan_polymer, pan_rows, pan_src)

Figure 4. Proportion of Single-Use Plastic Item Categories in Plastic Debris Collected

#Figure 4. Begin with final_dataset_fig4.csv

#Long format: every original row gets one row per item column, including the items total
fig4_data <- pivot_longer(
  final_dataset_fig4,
  cols = plastic_item_bottles:plastic_debris_items_total,
  names_to = "item",
  values_to = "mass_kg"
)

#Reclassify columns: mass as numeric, item as factor, date as year-month
fig4_data[["mass_kg"]] <- as.numeric(fig4_data[["mass_kg"]])
fig4_data[["item"]] <- as.factor(fig4_data[["item"]])
fig4_data[["date"]] <- as.yearmon(fig4_data[["date"]])

#Summarize data to the summed weight of each item per river, organization and country, then
#calculate each item's proportion at each location as total of that item / total of all plastic items.
#Calculating this using the monthly mean/monthly total yields exact same results.
fig4_data <- fig4_data %>%
  group_by(river_id, organization, country, item) %>%
  summarise(item_total_mass = sum(mass_kg, na.rm = TRUE)) %>%
  group_by(river_id) %>%
  mutate(
    item_proportions =
      item_total_mass / item_total_mass[item == "plastic_debris_items_total"]
  )

#Wilson score intervals (95% confidence) for Figure 4

#Per river, "trials" is the total plastic item weight for that location; both weights are coerced
#to numeric. Then, row by row, run the Wilson test where the total weight of the item at each
#location is the success out of the trials
fig4_data <- fig4_data %>%
  group_by(river_id) %>%
  mutate(
    trials = item_total_mass[item == "plastic_debris_items_total"],
    item_total_mass = as.numeric(item_total_mass),
    trials = as.numeric(trials)
  ) %>%
  ungroup() %>%
  rowwise() %>%
  mutate(
    wilson_lower = wilson.ci(item_total_mass, n = trials, conf.level = 0.95)[1],
    wilson_upper = wilson.ci(item_total_mass, n = trials, conf.level = 0.95)[2]
  )

#Add the categorical indicator of bag policy ("No" for Vietnam, "Yes" for every other country)
#and remove the summed total rows that were only needed for the calculations
fig4_data <- fig4_data %>%
  rowwise() %>%
  mutate(bag_policy = ifelse(country == "Vietnam", "No", "Yes")) %>%
  filter(item != "plastic_debris_items_total")

#Select and arrange the final columns by name (organization and trials are dropped)
fig4_data <- fig4_data[, c(
  "river_id", "country", "bag_policy", "item",
  "item_total_mass", "item_proportions",
  "wilson_lower", "wilson_upper"
)]

Figure 5. End-of-Life Fate of Plastic Debris Collected

#Figure 5. Begin with final_dataset_fig5.csv

#Long format: every original row gets one row per fate column, including the fate total
fig5_data <- pivot_longer(
  final_dataset_fig5,
  cols = plastic_fate_recycled:plastic_debris_fate_total,
  names_to = "fate",
  values_to = "mass_kg"
)

#Reclassify columns: mass as numeric, fate as factor, date as year-month
fig5_data[["mass_kg"]] <- as.numeric(fig5_data[["mass_kg"]])
fig5_data[["fate"]] <- as.factor(fig5_data[["fate"]])
fig5_data[["date"]] <- as.yearmon(fig5_data[["date"]])

#Summarize data to the summed weight of each fate per river, organization and country, then
#calculate each fate's proportion at each location as total of that fate / total of all plastic.
#Calculating proportions using monthly mean/monthly mean total yields the exact same results.
fig5_data <- fig5_data %>%
  group_by(river_id, organization, country, fate) %>%
  summarise(fate_total_mass = sum(mass_kg, na.rm = TRUE)) %>%
  group_by(river_id) %>%
  mutate(
    fate_proportions =
      fate_total_mass / fate_total_mass[fate == "plastic_debris_fate_total"]
  )

#Wilson score intervals (95% confidence) for Figure 5

#Per river, "trials" is the total plastic fate weight for that location; both weights are coerced
#to numeric. Then, row by row, run the Wilson test where the total weight of the fate at each
#location is the success out of the trials
fig5_data <- fig5_data %>%
  group_by(river_id) %>%
  mutate(
    trials = fate_total_mass[fate == "plastic_debris_fate_total"],
    fate_total_mass = as.numeric(fate_total_mass),
    trials = as.numeric(trials)
  ) %>%
  ungroup() %>%
  rowwise() %>%
  mutate(
    wilson_lower = wilson.ci(fate_total_mass, n = trials, conf.level = 0.95)[1],
    wilson_upper = wilson.ci(fate_total_mass, n = trials, conf.level = 0.95)[2]
  )

#Select and arrange only the columns we need (drops organization and trials)
fig5_data <- fig5_data[,c(1,3,4,5,6,8,9)]

#Remove total summed computational rows
fig5_data <- fig5_data %>% 
  filter(
    fate != "plastic_debris_fate_total"
  )

#When some fate is reported as 0, the wilson.ci formula spits out an interval, which isn't the case (because it is 0), so change these to 0
#condition is a logical mask with one entry per row: TRUE where the fate weight is exactly 0 (missing weights count as FALSE)
condition <- !is.na(fig5_data$fate_total_mass) & fig5_data$fate_total_mass == 0

#Index with the mask itself. The previous `fate_total_mass %in% condition` compared the weights against the
#mask's TRUE/FALSE values (coerced to 1/0), so fates weighing exactly 1 were also set to 0 whenever any zero existed
fig5_data$wilson_lower <- replace(fig5_data$wilson_lower, condition, 0)
fig5_data$wilson_upper <- replace(fig5_data$wilson_upper, condition, 0)

Resulting Datasets

fig2_data.csv
fig3_data.csv
fig4_data.csv
fig5_data.csv

Create Additional Dataset Including Variables for GLMM

final_dataset_fig2.csv -> fig2_glmm.csv

Figure 2 dataset in format required to run GLMM

#Figure 2
#Total Debris and Plastic Debris GLMM
#Begin with final_dataset_fig2.csv

#Load data
final_dataset_fig2 <- read_csv("final_dataset_fig2.csv")

#Build the GLMM data frame by adding five covariates looked up by country:
# urban_population   nearest urban population
# tech_rank          categorical ranking of technology types
# river_width_m      average reported width of river at collection locations
# river_length_km    length of river
# waste_picker_rank  categorical ranking of waste picker activity at each location
#A missing country yields NA; any country not listed takes the final default value of each lookup.
#The result is returned row-wise.
fig2_glmm <- final_dataset_fig2 %>%
  mutate(
    urban_population = case_when(
      is.na(country) ~ NA_real_,
      country == "Kenya" ~ 4734881,
      country == "Thailand" ~ 10539415,
      country == "Mexico" ~ 2140398,
      country == "Ecuador" ~ 275421,
      country == "Vietnam" ~ 236294,
      country == "Panama" ~ 1860291,
      country == "Indonesia" ~ 2580191,
      TRUE ~ 590940
    ),
    tech_rank = case_when(
      is.na(country) ~ NA_real_,
      country == "Kenya" ~ 1,
      country == "Thailand" ~ 2,
      country == "Mexico" ~ 1,
      country == "Ecuador" ~ 3,
      country == "Vietnam" ~ 2,
      country == "Panama" ~ 3,
      country == "Indonesia" ~ 2,
      TRUE ~ 2
    ),
    river_width_m = case_when(
      is.na(country) ~ NA_real_,
      country == "Kenya" ~ 6,
      country == "Thailand" ~ 20,
      country == "Mexico" ~ 7,
      country == "Ecuador" ~ 19,
      country == "Vietnam" ~ 800,
      country == "Panama" ~ 34,
      country == "Indonesia" ~ 319,
      TRUE ~ 20
    ),
    river_length_km = case_when(
      is.na(country) ~ NA_real_,
      country == "Kenya" ~ 390,
      country == "Thailand" ~ 370,
      country == "Mexico" ~ 190,
      country == "Ecuador" ~ 100,
      country == "Vietnam" ~ 1149,
      country == "Panama" ~ 28,
      country == "Indonesia" ~ 297,
      TRUE ~ 9
    ),
    waste_picker_rank = case_when(
      is.na(country) ~ NA_real_,
      country == "Kenya" ~ 1,
      country == "Thailand" ~ 2,
      country == "Mexico" ~ 2,
      country == "Ecuador" ~ 3,
      country == "Vietnam" ~ 3,
      country == "Panama" ~ 1,
      country == "Indonesia" ~ 3,
      TRUE ~ 1
    )
  ) %>%
  rowwise()

#Check class of each category
sapply(fig2_glmm, class)

#The two rankings are categorical, so store them as factors
fig2_glmm[["tech_rank"]] <- as.factor(fig2_glmm[["tech_rank"]])
fig2_glmm[["waste_picker_rank"]] <- as.factor(fig2_glmm[["waste_picker_rank"]])

Resulting Datasets

fig2_glmm.csv