################################################################################
#Script for the paper:
  #"Projected speaker numbers and dormancy risks of Canada’s Indigenous languages"

#THE SCRIPT IS DIVIDED IN FIVE PARTS
  #STEP 1: DATA PREPARATION, COUNTS OF SPEAKERS
  #STEP 2: BASELINE POPULATIONS
  #STEP 3: TRENDS IN MEAN NB OF CHILDREN
  #STEP 4: PROJECTION
  #STEP 5: ANALYSIS

#EACH PART HAS ITS OWN SUBDIVISIONS

#A SHORT SECTION AT THE END IS DEDICATED TO THE IN-TEXT COMMENTS AND APPENDICES

#Clear environment
rm(list=ls())

#Dowload packages
library(tidyverse)
library(openxlsx)
library(rnaturalearth)
library(sf)
library(ggrepel)
library(raster)
library(ggpubr)
library(treemapify)

#set theme
theme_set(theme_bw())

#set language
Sys.setenv(LANG = "en")

#vector of language names (allows to jump directly to any step)
names <- c("Anicinabemowin", "Atikamekw", "Blackfoot", "Cree langs","Dakelh",
           "Dakotan langs", "Dene", "Gitxsan","Gwich'in",  "Haida", "Innu-Naskapi",  
           "Inuktitut", "Ktunaxa","Mi'kmaq", "Mohawk","Nisga'a","Ntlakapamux",
           "Nuu-chah-nulth","Oji-Cree",  "Ojibwa langs",  "Secwepemctsin",
           "Slavey (N & S)","Tlicho","Tlingit","Tsilhqot'in","Tsimshian", "Wolastoqewi")

################################################################################
#STEP 1: DATA PREPARATION, COUNTS OF SPEAKERS
#1. YEAR 2001
#2. YEAR 2006
#3. YEAR 2011
#4. YEAR 2016
#5. YEAR 2021
#6. YEARS 2001-2021
#7. WPP DATA
################################################################################
#1.1 YEAR 2001##################################################################
#load data, dplyr::select appropriate information, add age vector
sc01 <- read.csv("https://zenodo.org/records/12530583/files/indigenousmothertongue2001.csv?download=1")[5:39, -2] %>% 
  pivot_longer(X0.4:X100., names_to="age", values_to="pop") %>%
  mutate(age = str_extract(age, "[0-9]+")) %>%
  rename(name = 1) %>%
  filter(!str_detect(name, "languages") | (str_detect(name, "n.i.e.") | str_detect(name, "n.o.s."))) %>%
  mutate(year = 2001, name = trimws(str_remove(name, " languages")))

#assign ISO codes
sc01 <- sc01 %>% 
  mutate(iso = case_when(
    name == "Algonquin" ~ "alq", 
    name == "Attikamekw" ~ "atj",
    name == "Blackfoot" ~ "bla",
    name == "Cree" ~ "Cree",
    name == "Malecite" ~ "pqm",
    name == "Micmac" ~ "mic",
    name == "Montagnais-Naskapi" ~ "Montagnais-Naskapi",
    name == "Ojibway" ~ "Ojibway",
    name == "Oji-Cree" ~ "ojs",
    name == "Algonquian, n.i.e." ~ "Algonquian languages",
    
    name == "Carrier" ~ "crx",
    name == "Chilcotin" ~ "clc",
    name == "Chipewyan" ~ "Chipewyan",
    name == "Dene" ~ "chp",
    name == "Dogrib" ~ "dgr",
    name == "Kutchin-Gwich'in (Loucheux)" ~ "gwi",
    name == "North Slave (Hare)" ~ "scs",
    name == "South Slave" ~ "xsl",
    name == "Athapaskan, n.i.e." ~ "Athabaskan languages",
    
    name == "Tlingit" ~ "tli",
    name == "Haida" ~ "hax-hdn",
    
    name == "Mohawk" ~ "moh",
    name == "Iroquoian, n.i.e." ~ "Iroquoian languages",
    
    name == "Kutenai" ~ "kut",
    
    name == "Shuswap" ~ "shs",
    name == "Thompson (Ntlakapamux)" ~ "thp",
    name == "Salish, n.i.e." ~ "Salish languages",
    
    name == "Dakota/Sioux" ~ "Siouan",
    
    name == "Gitksan" ~ "git",
    name == "Nishga" ~ "ncg",
    name == "Tsimshian" ~ "tsi",
    
    name == "Nootka" ~ "nuk",
    name == "Wakashan, n.i.e." ~ "Wakashan languages",
    
    name == "Inuktitut (Eskimo)" ~ "ike",
    
    name == "Aboriginal, n.i.e." ~ "Indigenous, n.i.e."))

#languages to which we assign NA speakers because they are not in this census round
sc01 <- bind_rows(sc01, 
                  data.frame(
                    iso = c("crj", "crk", "crl", "crm", "csw", "cwd", #Cree languages
                            "moe","nsk",                              #Innu-Naskapi
                            "ciw", "ojw", "otw",                      #Ojibway
                            "bcr", "bea","sek", "kkz","srs", "ttm",
                            "tce","tht", "Slavey", "Tutchone",        #Athabaskan
                            "cay", "one",                             #Iroquoian
                            "hur", "lil", "squ","str", "oka", "coo",  #Salish
                            "dak", "sto", "asb", "Siouan languages",  #Siouan
                            "has", "hei", "kwk",                      #Wakashan
                            "ikt", "Inuvialuktun", "Inuit languages", #Inuit languages
                            "crg", "Indigenous, n.o.s."),             #Other
                    pop = NA,
                    year = 2001))

#1.2 YEAR 2006###################################################################
#load data, dplyr::select appropriate information, add age vector
sc06 <- read.csv("https://zenodo.org/records/12530583/files/indigenousmothertongue2006.csv?download=1")[5:40, -2] %>% 
  pivot_longer(X0.to.4.years:X100.years.and.over, names_to="age", values_to="pop") %>%
  mutate(age = str_extract(age, "[0-9]+")) %>%
  rename(name = 1) %>%
  filter(!str_detect(name, "languages") | (str_detect(name, "n.i.e.") | str_detect(name, "n.o.s."))) %>%
  mutate(year = 2006, name = trimws(str_remove(name, " languages")))

#add iso codes
sc06 <- left_join(sc06, sc01 %>% dplyr::select(name, iso) %>% unique()) %>%
  bind_rows(data.frame(name = NA, pop = NA, age = NA, year = 2006, iso = sc01 %>% filter(is.na(name)) %>% pull(iso)))

#check missing iso's
sc06 %>% filter(is.na(iso)) %>% pull(name) %>% unique() %>% sort()

#add iso's manually
sc06 <- sc06 %>% 
  mutate(iso = case_when(
    name == "Inuinnaqtun" ~ "ikt",
    name == "Inuktitut, n.i.e." ~ "ike",
    name == "Atikamekw" ~ "atj",
    name == "Mi'kmaq" ~ "mic",
    name == "Nisga'a" ~ "ncg",
    TRUE ~ as.character(iso))) 

#1.3 YEAR 2011###################################################################
sc11 <- read.csv("https://zenodo.org/records/12530583/files/indigenousmothertongue2011.csv?download=1")[7:80, -2] %>% 
  pivot_longer(X0.to.4.years:X100.years.and.over, names_to="age", values_to="pop") %>%
  mutate(age = str_extract(age, "[0-9]+")) %>%
  rename(name = 1) %>%
  filter(!str_detect(name, "languages") | (str_detect(name, "n.i.e.") | str_detect(name, "n.o.s."))) %>%
  mutate(year = 2011, name = trimws(str_remove(name, " languages")))

#extract names between parentheses
sc11 <- sc11 %>% 
  mutate(name1 = str_remove(name," \\s*\\([^\\)]+\\)"),
         name2 = str_extract(name,"(?<=\\()([^()]*?)(?=\\)[^()]*$)"))

#add iso codes: by full name
sc11 <- sc11 %>% 
  left_join(sc06 %>% filter(!is.na(name)) %>% dplyr::select(name, iso) %>% unique())

#add iso codes: by name before parenthesis
sc11 <- sc11 %>% 
  filter(is.na(iso)) %>% 
  dplyr::select(name, age, pop, year, name1, name2) %>% 
  left_join(sc06 %>% filter(!is.na(name)) %>% dplyr::select(name, iso) %>% unique(), by=c("name1" = "name")) %>%
  bind_rows(sc11 %>% filter(!is.na(iso)))

#add iso codes: by name between parenthesis
sc11 <- sc11 %>% 
  filter(is.na(iso)) %>% 
  dplyr::select(name, age, pop, year, name1, name2) %>% 
  left_join(sc06 %>% filter(!is.na(name)) %>% dplyr::select(name, iso) %>% unique(), by=c("name2" = "name")) %>%
  bind_rows(sc11 %>% filter(!is.na(iso))) %>%
  filter(name!="Cree, n.i.e.") 

#check missing iso's
sc11 %>% filter(is.na(iso)) %>% pull(name) %>% unique() %>% sort()

#add iso's manually
sc11 <- sc11 %>% 
  mutate(iso = case_when(
    name == "Aboriginal, n.i.e." ~ "Indigenous, n.i.e.",
    name == "Beaver" ~ "bea", 
    name == "Cayuga" ~ "cay",
    name == "Cree, n.o.s." ~ "Cree", 
    name == "Cree, n.i.e." ~ "Cree",
    name == "Dakota" ~ "dak",
    name == "Gwich'in" ~ "gwi",
    name == "Haisla" ~ "has",
    name == "Halkomelem" ~ "hur",
    name == "Heiltsuk" ~ "hei",
    name == "Innu/Montagnais" ~ "moe",
    name == "Inuvialuktun" ~ "Inuvialuktun",
    name == "Inuit, n.i.e." ~ "Inuit languages",
    name == "Inuktitut" ~ "ike",
    name == "Kaska (Nahani)" ~ "kkz",
    name == "Kwakiutl (Kwak'wala)" ~ "kwk",
    name == "Lillooet" ~ "lil",
    name == "Michif" ~ "crg",
    name == "Naskapi" ~ "nsk",
    name == "North Slavey (Hare)" ~ "scs",  
    name == "Northern Tutchone" ~ "ttm",
    name == "Okanagan" ~ "oka",
    name == "Oneida" ~ "one",
    name == "Plains Cree" ~ "crk",
    name == "Sarcee" ~ "srs",
    name == "Sekani" ~ "sek",
    name == "Siouan, n.i.e." ~ "Siouan languages",
    name == "Slavey, n.o.s." ~ "Slavey",
    name == "South Slavey" ~ "xsl",
    name == "Southern Tutchone" ~ "tce",
    name == "Squamish" ~ "squ",
    name == "Stoney" ~ "sto",
    name == "Straits" ~ "str",
    name == "Swampy Cree" ~ "csw",
    name == "Tahltan" ~ "tht",
    name == "Tutchone, n.o.s." ~ "Tutchone",
    name == "Wetsuweten" ~ "bcr",
    name == "Woods Cree" ~ "cwd",
    TRUE ~ as.character(iso))) 

#reorganize data because Cree appears twice
sc11 <- sc11 %>% group_by(name, age, year, name1, name2, iso) %>% summarise(pop = sum(pop))

#add iso for languages that did not appear in this census data release
sc11 <- sc11 %>% 
  dplyr::select(name, pop, age, year, iso, name1, name2) %>%
  bind_rows(data.frame(name = NA, 
                       pop = NA, 
                       age = NA, 
                       year = 2011,
                       iso = sc06 %>% pull(iso) %>% unique() %>%
                         setdiff(sc11 %>% pull(iso) %>% unique()))) %>% ungroup()

#1.4 YEAR 2016##################################################################
#Load data
sc16 <- read.csv("https://zenodo.org/records/12530583/files/indigenousmothertongue2016.csv?download=1")[8:89, -2] %>% 
  pivot_longer(X0.to.4.years:X100.years.and.over, names_to="age", values_to="pop") %>%
  mutate(age = str_extract(age, "[0-9]+")) %>%
  rename(name = 1) %>%
  filter(!str_detect(name, "languages") | (str_detect(name, "n.i.e.") | str_detect(name, "n.o.s."))) %>%
  mutate(year = 2016, name = trimws(str_remove(name, " languages")))

#extract names between parentheses
sc16 <- sc16 %>% 
  mutate(name1 = str_remove(name," \\s*\\([^\\)]+\\)"),
         name2 = str_extract(name,"(?<=\\()([^()]*?)(?=\\)[^()]*$)"))

#add iso codes: by full name
sc16 <- sc16 %>% left_join(sc11 %>% dplyr::select(name, iso) %>% unique())

#add iso codes: by name before parenthesis
sc16 <- sc16 %>% 
  filter(is.na(iso)) %>% 
  dplyr::select(name, age, pop, year, name1, name2) %>% 
  left_join(sc11 %>% dplyr::select(name1, iso) %>% unique()) %>%
  bind_rows(sc16 %>% filter(!is.na(iso)))

sc16 <- sc16 %>% 
  filter(is.na(iso)) %>% 
  dplyr::select(name, age, pop, year, name1, name2) %>% 
  left_join(sc11 %>% dplyr::select(name2, iso) %>% filter(!is.na(name2)) %>% unique(), by=c("name1" = "name2")) %>%
  bind_rows(sc16 %>% filter(!is.na(iso)))

#add iso codes: by name between parenthesis
sc16 <- sc16 %>% 
  filter(is.na(iso)) %>% 
  dplyr::select(name, age, pop, year, name1, name2) %>% 
  left_join(sc11 %>% filter(!is.na(name)) %>% dplyr::select(name1, iso) %>% unique(), by=c("name2" = "name1")) %>%
  bind_rows(sc16 %>% filter(!is.na(iso)))

sc16 <- sc16 %>% 
  filter(is.na(iso)) %>% 
  dplyr::select(name, age, pop, year, name1, name2) %>% 
  left_join(sc11 %>% filter(!is.na(name)) %>% dplyr::select(name2, iso) %>% filter(!is.na(name2)) %>% unique()) %>%
  bind_rows(sc16 %>% filter(!is.na(iso)))

#check name changes, missings
sc16 %>% filter(is.na(iso)) %>% pull(name) %>% unique() %>% sort()

#change iso's manually
sc16 <- sc16 %>% 
  mutate(iso = case_when(
    name == "Aboriginal, n.o.s." ~ "Indigenous, n.o.s.",
    name == "Athabaskan, n.i.e." ~ "Athabaskan languages",
    name == "Babine (Wetsuwet'en)" ~ "bcr", 
    name == "Comox" ~ "coo",
    name == "Montagnais (Innu)" ~ "moe",
    name == "Moose Cree" ~ "crm",
    name == "Northern East Cree" ~ "crl",
    name == "Southern East Cree" ~ "crj",
    name == "Ottawa (Odawa)" ~ "otw",
    TRUE ~ as.character(iso)))

#add iso for languages that did not appear in this census data release
sc16 <- sc16 %>% 
  dplyr::select(name, pop, age, year, iso, name1, name2) %>%
  bind_rows(data.frame(name = NA, 
                       pop = NA, 
                       age = NA, 
                       year = 2016,
                       iso = sc11 %>% pull(iso) %>% unique() %>%
                         setdiff(sc16 %>% pull(iso) %>% unique())))

#1.5 YEAR 2021##################################################################
#load data
sc21 <- read.csv("https://zenodo.org/records/12530583/files/indigenousmothertongue2021.csv?download=1")[5:76, -2] %>% 
  pivot_longer(X0.to.4.years:X100.years.and.over, names_to="age", values_to="pop") %>%
  mutate(age = str_extract(age, "[0-9]+")) %>%
  rename(name = 1) %>%
  filter(!str_detect(name, "languages") | (str_detect(name, "n.i.e.") | str_detect(name, "n.o.s."))) %>%
  mutate(year = 2021, name = trimws(str_remove(name, " languages")))

#extract names between parentheses
sc21 <- sc21 %>% 
  mutate(name1 = str_remove(name," \\s*\\([^\\)]+\\)"),
         name2 = str_extract(name,"(?<=\\()([^()]*?)(?=\\)[^()]*$)"))

#add iso codes: by full name
sc21 <- sc21 %>% 
  left_join(sc16 %>% filter(!is.na(name)) %>% dplyr::select(name, iso) %>% unique())

#add iso codes: by name before parenthesis
sc21 <- sc21 %>% 
  filter(is.na(iso)) %>% 
  dplyr::select(name, age, pop, year, name1, name2) %>% 
  left_join(sc16 %>% filter(!is.na(name)) %>% dplyr::select(name1, iso) %>% unique()) %>%
  bind_rows(sc21 %>% filter(!is.na(iso)))

sc21 <- sc21 %>% 
  filter(is.na(iso)) %>% 
  dplyr::select(name, age, pop, year, name1, name2) %>% 
  left_join(sc16 %>% filter(!is.na(name), !is.na(name2)) %>% dplyr::select(name2, iso) %>% unique(), by=c("name1" = "name2")) %>%
  bind_rows(sc21 %>% filter(!is.na(iso)))

#add iso codes: by name between parenthesis
sc21 <- sc21 %>% 
  filter(is.na(iso)) %>% 
  dplyr::select(name, age, pop, year, name1, name2) %>% 
  left_join(sc16 %>% filter(!is.na(name)) %>%dplyr::select(name1, iso) %>% unique(), by=c("name2" = "name1")) %>%
  bind_rows(sc21 %>% filter(!is.na(iso)))

sc21 <- sc21 %>% 
  filter(is.na(iso)) %>% 
  dplyr::select(name, age, pop, year, name1, name2) %>% 
  left_join(sc16 %>% filter(!is.na(name), !is.na(name2)) %>% dplyr::select(name2, iso) %>% unique()) %>%
  bind_rows(sc21 %>% filter(!is.na(iso)))

#check name changes, missings
sc21 %>% filter(is.na(iso)) %>% pull(name) %>% unique() %>% sort()

#change iso's manually
sc21 <- sc21 %>% 
  mutate(iso = case_when(
    name == "Anishinaabemowin (Chippewa)" ~ "ciw", 
    name == "Assiniboine" ~ "asb",
    name == "Dene, n.o.s." ~ "chp",
    name == "Inuktut (Inuit), n.i.e." ~ "Inuit languages",
    name == "Ojibway, n.o.s." ~ "Ojibway",
    name == "Saulteau (Western Ojibway)" ~ "ojw",
    name == "Tutchone, n.o.s." ~ "Tutchone",
    name == "Wetsuwet'en-Babine" ~ "bcr",
    name == "Inuvialuktun" ~ "Inuvialuktun",
    name == "Indigenous, n.i.e." ~ "Indigenous, n.i.e.",
    name == "Indigenous, n.o.s." ~ "Indigenous, n.o.s.",
    TRUE ~ as.character(iso)))

#add iso for languages that did not appear in this census data release
sc21 <- sc21 %>% 
  dplyr::select(name, pop, age, year, iso, name1, name2) %>%
  bind_rows(data.frame(name = NA, 
                       pop = NA, 
                       age = NA, 
                       year = 2021,
                       iso = sc16 %>% pull(iso) %>% unique() %>%
                         setdiff(sc21 %>% pull(iso) %>% unique())))

#1.6 YEARS 2001-2021############################################################
#combine in single data frame
sc <- bind_rows(sc01, sc06, sc11, sc16, sc21) %>%
  dplyr::select(-name, -name1, -name2) %>%
  rename(name = iso) %>%
  mutate(age = as.numeric(age))

#calculate number of speakers in continua, append to original data 
sc <- bind_rows(
  sc %>% 
    filter(!name %in% c("Cree", "csw", "crk", "cwd", "crl", "crj", "crm", 
                        "Montagnais-Naskapi", "moe", "nsk", "Ojibway", "otw", 
                        "ciw", "ojw", 'Chipewyan', "chp", "scs", "xsl", "Slavey", 
                        "ttm", "tce", "Tutchone", "dak", "sto", "asb", "Siouan", 
                        "Siouan languages", "ikt", "Inuvialuktun")),
  sc %>% 
  mutate(name = case_when(
    name %in% c("Cree", "csw", "crk", "cwd", "crl", "crj", "crm") ~ "Cree langs",
    name %in% c("Montagnais-Naskapi", "moe", "nsk") ~ "Innu-Naskapi",
    name %in% c("Ojibway", "otw", "ciw", "ojw") ~ "Ojibwa langs",
    name %in% c('Chipewyan', "chp") ~ "Dene",
    name %in% c("scs", "xsl", "Slavey") ~ "Slavey (N & S)",
    name %in% c("ttm", "tce", "Tutchone") ~ "Tutchone (N & S)",
    name %in% c("dak", "sto", "asb", "Siouan", "Siouan languages") ~ "Dakotan langs",
    name %in% c("ikt", "Inuvialuktun") ~ "Inuinnaqtun")) %>% 
  group_by(name, age, year) %>%
  summarise(pop = sum(pop, na.rm = T)) %>%
    filter(!is.na(name)))

#add the sc names in 2021 and use them where possible
sc <- left_join(sc, sc21 %>% distinct(name, iso) %>% rename(sc_name = 1, name = 2)) %>%
  mutate(name = ifelse(is.na(sc_name), name, str_remove(sc_name, " \\s*\\([^\\)]+\\)"))) %>%
  dplyr::select(-sc_name) -> SC

#identify languages or continua that appear 5 times or more
names <- SC %>% 
  filter(!is.na(name), !is.na(pop), !is.na(year), !is.na(age), !grepl('languages', name), !grepl("n.i.e", name)) %>% 
  distinct(name, year) %>% 
  group_by(name) %>%
  summarise(n = n()) %>%
  filter(n ==5) %>% 
  pull(name)

#keep observations that appear 5 times
sc <- sc %>% 
  filter(name %in% names, !is.na(age)) %>%
  arrange(year, name, age) 

saveRDS(sc, "Data/sc")

##1.7 WPP DATA##################################################################
#Set timeout 
options(timeout=1000)

#specify vector with file names
filenames <- sapply(1:13, function(x) 
  paste('https://zenodo.org/records/14221520/files/wpp%202024%20single%20age%20life%20tables%20boths%20sexes%20', 
        c(seq(1976, 2016, 10), seq(2024, 2094, 10))[x], '-', c(seq(1985, 2015, 10), seq(2023, 2093, 10), 2100)[x], '.xlsx?download=1', sep = ''))

#Load data
surv_prob_umic <- bind_rows(lapply(filenames, function(x) read.xlsx(x) %>% filter(X3=="Upper-middle-income countries", X11 %in% seq(1984, 2099, 5))))
surv_prob_esea <- bind_rows(lapply(filenames, function(x) read.xlsx(x) %>% filter(X3=="Eastern and South-Eastern Asia", X11 %in% seq(1984, 2099, 5))))

#function to arrange data in probabilities of surviving 0-2.5, 2.5-7.5, 7.5-12.5, ...
fct <- function(df, yr){
  
  df1 <- data.frame(age = seq(2.5, 97.5, 5), 
                    prob1 = (as.numeric(df %>% filter(X11 == yr, X12 %in% seq(2, 97, 5)) %>% pull(X16)) + 
                               as.numeric(df %>% filter(X11 == yr, X12 %in% seq(3, 98, 5)) %>% pull(X16)))/2)
  
  df2 <- data.frame(age = seq(2.5, 102.5, 5), 
                    prob2 = as.numeric(c(df %>% filter(X11 == yr, X12 %in% c(seq(0, 100, 5))) %>% pull(X16)))*
                      as.numeric(c(df %>% filter(X11 == yr, X12 %in% c(seq(1, 96, 5))) %>% pull(X16), 0))*
                      as.numeric(c(1, df %>% filter(X11 == yr, X12 %in% c(seq(4, 99, 5))) %>% pull(X16))))
  
  c(df1 %>% 
      left_join(df2) %>%
      left_join(df1 %>% mutate(age = age+5) %>% rename(prob3 = prob1)) %>%
      mutate(prob3 = ifelse(is.na(prob3), 1, prob3), prob = prob1 * prob2 * prob3) %>%
      pull(prob), 0)
  
  }

#run function 
surv_prob_inuit <- sapply(seq(1984, 2099, 5), function(x) fct(surv_prob_umic, x))
surv_prob_fn <- sapply(seq(1984, 2099, 5), function(x) fct(surv_prob_esea, x))

#inuit survival probabilities 2001-2006, 2001-2011...2001-2021, aligned by cohorts (1916-1920 until 1996-2000)
#skip first row because it is about 0-2.5 years old only
inuit_prob <- matrix(c(surv_prob_inuit[-c(1, 19:21), 5], 
                       surv_prob_inuit[-c(1:2, 20:21), 6],
                       surv_prob_inuit[-c(1:3, 21), 7],
                       surv_prob_inuit[-c(1:4), 8]), ncol = 4)

#combine probabilities over the periods 2001-2006, 2001-2011, 2001-2016, 2001-2021
for (i in 1:3){
  
  inuit_prob[ , i+1] <- inuit_prob[ , i]*inuit_prob[ , i+1]
  
}

#repeat with the first nations
fn_prob <- matrix(c(surv_prob_fn[-c(1, 19:21), 5], 
                    surv_prob_fn[-c(1:2, 20:21), 6],
                    surv_prob_fn[-c(1:3, 21), 7],
                    surv_prob_fn[-c(1:4), 8]), ncol = 4)

for (i in 1:3){
  
  fn_prob[ , i+1] <- fn_prob[ , i]*fn_prob[ , i+1]
  
}

################################################################################
#STEP 2: BASELINE POPULATIONS
################################################################################
#matrix raw data by age
counts_raw <- lapply(names, function(x) 
  sapply(seq(2001, 2021, 5), function(y) sc %>% filter(name==x, year==y) %>% pull(pop)))

#rescale raw pop counts to have one cohort for each row
counts_res <- lapply(1:27, function(x) 
  matrix(c(counts_raw[[x]][-c(1, 19:21), 2], counts_raw[[x]][-c(1:2, 20:21), 3], counts_raw[[x]][-c(1:3, 21), 4], counts_raw[[x]][-c(1:4), 5]), ncol =4))

#calculate counts in 2001 adjusted for mortality
counts_adjust <- lapply(1:27, function(x) counts_res[[x]] + counts_res[[x]]*(1-fn_prob))

#inuit counts using the different mortality information
counts_adjust[[12]] <- counts_res[[12]] + counts_res[[12]]*(1-inuit_prob)

#add the counts in the year 2001
counts_adjust <- lapply(1:27, function(x) cbind(counts_raw[[x]][1:17, 1], counts_adjust[[x]]))

#matrix of counts by age, standardized for their size across years, within each language
counts_stzd <- lapply(1:27, function(x) 
  sapply(1:5, function(y) 
    counts_adjust[[x]][ , y] / sum(counts_adjust[[x]][ , y]) * sum(counts_adjust[[x]]) / 5
  ))

#chi square test to compare the standardized counts by age between years, for each language
#extract pearson's residual, check those that deviate the most given mean and sd specific to each language
#the combinations of language, age, and year for which the p value is below 0.01 may be excluded
#(calculations exclude ages 50 and above as they have little impact on the projected numbers)
chisq_resid <- lapply(1:27, function(x) 
    matrix(
      c(chisq.test(counts_stzd[[x]][-c(11:17), ] + 10)$residuals),
      ncol = 5 ) 
    )

#check which combinations of age, language and year have high residual values
#make matrices indicating those that are highly unlikely
exclude_age <- lapply(1:27, function(x) 
  ifelse(abs(chisq_resid[[x]]) %>%
           pnorm(., mean(.), sd(.), lower.tail = F) < 0.01, 1, 0) %>%
    rbind(
      matrix(rep(0, 7*5), ncol = 5)
    ))
  
#matrices with NAs instead of the adjusted counts where p values are below 0.01
#whether the unlikely age-specific counts are included in the computation of the census totals
#does not make a difference as to which years are included or not
counts_stzd_excl <- lapply(1:27, function(x) ifelse(exclude_age[[x]]==1, NA, counts_stzd[[x]]))

#language and census specific sums (with and without the unlikely counts by age)
sum_lc <- sapply(1:27, function(x) apply(counts_adjust[[x]], 2, mean, na.rm =T)*nrow(counts_adjust[[x]]))

#absolute relative differences of the language and census specific sums to their mean
abs_rel_dif <- c(sapply(1:27, function(l) abs(sum_lc[ , l] - apply(sum_lc, 2, mean)[l]) / apply(sum_lc, 2, mean)[l]))

#identify the most extreme values assuming normal distribution
exclude_total <- matrix(ifelse(pnorm(abs_rel_dif, 0, sd(abs_rel_dif), lower.tail = F)<0.001, 1, 0), ncol = length(names))

#replace the extreme values in the totals with NA
sum_lc_excl <- ifelse(exclude_total==1, NA, sum_lc)

#vector of number of observations for each language 
n <- sapply(1:27, function(x) length(sum_lc[ , x][!is.na(sum_lc[ , x])]))
n_excl <- sapply(1:27, function(x) length(sum_lc_excl[ , x][!is.na(sum_lc_excl[ , x])]))

#mean of the log of sum_lc
mean_sum_lc <- apply(log(sum_lc), 2, mean, na.rm = T)
mean_sum_lc_excl <- apply(log(sum_lc_excl), 2, mean, na.rm = T)

#standard error of the log of sum_lc
se_sum_lc <- apply(log(sum_lc), 2, sd, na.rm = T) / sqrt(n)
se_sum_lc_excl <- apply(log(sum_lc_excl), 2, sd, na.rm = T) / sqrt(n_excl)

#language and age specific means
mean_la <- sapply(1:27, function(l) apply(counts_stzd[[l]], 1, mean, na.rm = T))
mean_la_excl <- sapply(1:27, function(l) apply(counts_stzd_excl[[l]], 1, mean, na.rm = T))

#language and age specific variances
var_la <- sapply(1:27, function(l) apply(counts_stzd[[l]], 1, var, na.rm = T))
var_la_excl <- sapply(1:27, function(l) apply(counts_stzd_excl[[l]], 1, var, na.rm = T))

#phi values for the negative binomial distribution
phi <- sapply(1:27, function(l) sapply(1:17, function(a) var_la[a, l] / mean_la[a, l]^2))
phi_excl <- sapply(1:27, function(l) sapply(1:17, function(a) var_la_excl[a, l] / mean_la_excl[a, l]^2))

#replace nas with 0
phi[is.na(phi)] <- 0
phi_excl[is.na(phi_excl)] <- 0

#population sizes in 2001 (3,000 samples) (t-distribution)
popsize <- sapply(1:27, function(x) exp(rt(3000, n[x]-1)*se_sum_lc[x] + mean_sum_lc[x]))
popsize_excl <- sapply(1:27, function(x) exp(rt(3000, n_excl[x]-1)*se_sum_lc_excl[x] + mean_sum_lc_excl[x]))

#age distributions in 2001 (3,000 samples)
agedis <- lapply(1:27, function(l) sapply(1:17, function(a) rnbinom(3000, mu = mean_la[a, l], size = 1 / phi[a, l])))
agedis_excl <- lapply(1:27, function(l) sapply(1:17, function(a) rnbinom(3000, mu = mean_la_excl[a, l], size = 1 / phi_excl[a, l])))

#standardize values to cancel out the variation in total sizes
agedis_stzd <- lapply(1:27, function(l) 
  sapply(1:3000, function(r) 
    round(popsize[r, l] / agedis[[l]][r, ] %>% sum()*agedis[[l]][r, ])))

agedis_excl_size <- lapply(1:27, function(l) 
  sapply(1:3000, function(r) 
    round(popsize_excl[r, l] / agedis[[l]][r, ] %>% sum()*agedis[[l]][r, ])))

agedis_excl_age <- lapply(1:27, function(l) 
  sapply(1:3000, function(r) 
    round(popsize_excl[r, l] / agedis_excl[[l]][r, ] %>% sum()*agedis_excl[[l]][r, ])))

################################################################################
#STEP 3: TRENDS IN MEAN NB OF CHILDREN
  #3.1 ESTIMATION
  #3.2 PROJECTION
  #3.3 COMBINATION WITH FERTILITY SCHEDULES
################################################################################
#3.1 ESTIMATION#################################################################
#populations in 2001, 1996, 1991, ..., 1981
#define function
fct <- function(l, r, agedis){
  
  #establish initial population
  backcast_pop <- matrix(c(rep(0, 17*4), agedis[[l]][ , r]), ncol = 5)
  
  #mortality regime
  prob <- if(l == 12){surv_prob_inuit}else{surv_prob_fn}
  
  #loop to find pop in each year
  for (i in 5:2){
    
    backcast_pop[ , i-1] <- c(backcast_pop[ , i][-1] + backcast_pop[ , i][-1] * (1-prob[2:17, i-1]), 0)
    
  }
  
  return(backcast_pop)
  
}

#run function with the standard age distribution,
#with the distribution excluding unlikely population sizes,
#and with the distribution excluding inlikely population sizes and age-specific counts
back_pops <- lapply(1:27, function(l)
  lapply(1:3000, function(r) 
    fct(l, r, agedis_stzd)))

back_pops_excl_size <- lapply(1:27, function(l)
  lapply(1:3000, function(r) 
    fct(l, r, agedis_excl_size)))

back_pops_excl_age <- lapply(1:27, function(l)
  lapply(1:3000, function(r) 
    fct(l, r, agedis_excl_age)))

#specify the xTFR C parameter
C <- lapply(1:27, function(l) 
  sapply(1:5, function(y) 
    sapply(1:3000, function(r) back_pops[[l]][[r]][1, y])))

C_excl_size <- lapply(1:27, function(l) 
  sapply(1:5, function(y) 
    sapply(1:3000, function(r) back_pops_excl_size[[l]][[r]][1, y])))

C_excl_age <- lapply(1:27, function(l) 
  sapply(1:5, function(y) 
    sapply(1:3000, function(r) back_pops_excl_age[[l]][[r]][1, y])))

#W parameter
W <- lapply(1:27, function(l) 
  sapply(1:5, function(y) 
    sapply(1:3000, function(r) sum(back_pops[[l]][[r]][4:10, y]))))

W_excl_size <- lapply(1:27, function(l) 
  sapply(1:5, function(y) 
    sapply(1:3000, function(r) sum(back_pops_excl_size[[l]][[r]][4:10, y]))))

W_excl_age <- lapply(1:27, function(l) 
  sapply(1:5, function(y) 
    sapply(1:3000, function(r) sum(back_pops_excl_age[[l]][[r]][4:10, y]))))

#number of women ages 25-34 (for pi parameter)
pi <- lapply(1:27, function(l) 
  sapply(1:5, function(y) 
    sapply(1:3000, function(r) sum(back_pops[[l]][[r]][7:8, y]))))

pi_excl_size <- lapply(1:27, function(l) 
  sapply(1:5, function(y) 
    sapply(1:3000, function(r) sum(back_pops_excl_size[[l]][[r]][7:8, y]))))

pi_excl_age <- lapply(1:27, function(l) 
  sapply(1:5, function(y) 
    sapply(1:3000, function(r) sum(back_pops_excl_age[[l]][[r]][7:8, y]))))

#xTFR values
xTFR <- lapply(1:27, function(l) 
  sapply(1:5, function(y) 
    sapply(1:3000, function(r) (10.65-12.55*(pi[[l]][r, y]/W[[l]][r, y]))*(C[[l]][r, y] / W[[l]][r, y]))))

xTFR_excl_size <- lapply(1:27, function(l) 
  sapply(1:5, function(y) 
    sapply(1:3000, function(r) (10.65-12.55*(pi_excl_size[[l]][r, y]/W_excl_size[[l]][r, y]))*(C_excl_size[[l]][r, y] / W_excl_size[[l]][r, y]))))

xTFR_excl_age <- lapply(1:27, function(l) 
  sapply(1:5, function(y) 
    sapply(1:3000, function(r) (10.65-12.55*(pi_excl_age[[l]][r, y]/W_excl_age[[l]][r, y]))*(C_excl_age[[l]][r, y] / W_excl_age[[l]][r, y]))))

#convert zero values and NAs to 0.001
for (i in 1:27){
  
  xTFR[[i]][xTFR[[i]]<=0] <- 0.001
  xTFR[[i]][is.na(xTFR[[i]])] <- 0.001
  
  xTFR_excl_size[[i]][xTFR_excl_size[[i]]<=0] <- 0.001
  xTFR_excl_size[[i]][is.na(xTFR_excl_size[[i]])] <- 0.001
  
  xTFR_excl_age[[i]][xTFR_excl_age[[i]]<=0] <- 0.001
  xTFR_excl_age[[i]][is.na(xTFR_excl_age[[i]])] <- 0.001
  
}

#3.2 PROJECTION#################################################################
#exponential model
exp_model <- lapply(1:27, function(x) 
  lapply(1:3000, function(r) 
    lm(log(xTFR[[x]][r, ]) ~ c(1:5))))

exp_model_excl_size <- lapply(1:27, function(x) 
  lapply(1:3000, function(r) 
    lm(log(xTFR_excl_size[[x]][r, ]) ~ c(1:5))))

exp_model_excl_age <- lapply(1:27, function(x) 
  lapply(1:3000, function(r) 
    lm(log(xTFR_excl_age[[x]][r, ]) ~ c(1:5))))

#log model
log_model <- lapply(1:27, function(x) 
  lapply(1:3000, function(r) 
    lm(c(xTFR[[x]][r, ], mean(xTFR[[x]][r, ])) ~ log(c(1:5, 25)))))

log_model_excl_size <- lapply(1:27, function(x) 
  lapply(1:3000, function(r) 
    lm(c(xTFR_excl_size[[x]][r, ], mean(xTFR_excl_size[[x]][r, ])) ~ log(c(1:5, 25)))))

log_model_excl_age <- lapply(1:27, function(x) 
  lapply(1:3000, function(r) 
    lm(c(xTFR_excl_age[[x]][r, ], mean(xTFR_excl_age[[x]][r, ])) ~ log(c(1:5, 25)))))

#extract predicted values from exponential model if not increasing, log model otherwise
pred_val <- lapply(1:27, function(x)
  sapply(1:3000, function(r) 
    if(exp_model[[x]][[r]][1]$coefficients[2]<c(-0.001)){
      exp(exp_model[[x]][[r]][1]$coefficients[1] + exp_model[[x]][[r]][1]$coefficients[2]*1:25)}
    else{log_model[[x]][[r]][1]$coefficients[1] + log_model[[x]][[r]][1]$coefficients[2]*log(1:25)}))

pred_val_excl_size <- lapply(1:27, function(x)
  sapply(1:3000, function(r) 
    if(exp_model_excl_size[[x]][[r]][1]$coefficients[2]<c(-0.001)){
      exp(exp_model_excl_size[[x]][[r]][1]$coefficients[1] + exp_model_excl_size[[x]][[r]][1]$coefficients[2]*1:25)}
    else{log_model_excl_size[[x]][[r]][1]$coefficients[1] + log_model_excl_size[[x]][[r]][1]$coefficients[2]*log(1:25)}))

pred_val_excl_age <- lapply(1:27, function(x)
  sapply(1:3000, function(r) 
    if(exp_model_excl_age[[x]][[r]][1]$coefficients[2]<c(-0.001)){
      exp(exp_model_excl_age[[x]][[r]][1]$coefficients[1] + exp_model_excl_age[[x]][[r]][1]$coefficients[2]*1:25)}
    else{log_model_excl_age[[x]][[r]][1]$coefficients[1] + log_model_excl_age[[x]][[r]][1]$coefficients[2]*log(1:25)}))

#3.3 COMBINATION WITH FERTILITY SCHEDULES#######################################
#Load WPP fertility information
fertility1 <- read.xlsx("https://population.un.org/wpp/Download/Files/1_Indicator%20(Standard)/EXCEL_FILES/3_Fertility/WPP2024_FERT_F02_FERTILITY_RATES_BY_5-YEAR_AGE_GROUPS_OF_MOTHER.xlsx")
fertility2 <- read.xlsx("https://population.un.org/wpp/Download/Files/1_Indicator%20(Standard)/EXCEL_FILES/3_Fertility/WPP2024_FERT_F02_FERTILITY_RATES_BY_5-YEAR_AGE_GROUPS_OF_MOTHER.xlsx", sheet= "Medium variant") 

#arrange information into df, estimates
fert_df1 <- fertility1 %>% 
  slice(-c(1:10)) %>% 
  dplyr::select(X3, X11, X12:X20) %>%
  filter(X12 != '...') %>%
  rename(location = X3, year = X11) %>%
  pivot_longer(X12:X20, values_to = 'rate', names_to = 'age') %>%
  mutate(age = rep(seq(10, 50, 5), 197802/9), rate = as.numeric(rate)/200)

#arrange information into df, projection
fert_df2 <- fertility2 %>% 
  slice(-c(1:10)) %>% 
  dplyr::select(X3, X11, X12:X20) %>%
  filter(X12 != '...') %>%
  rename(location = X3, year = X11) %>%
  pivot_longer(X12:X20, values_to = 'rate', names_to = 'age') %>%
  mutate(age = rep(seq(10, 50, 5), 205821/9), rate = as.numeric(rate)/200)

#put estimates and projection together
fert_df <- bind_rows(fert_df1, fert_df2)

#find locations with tfr between 3 and 4 in 2001
locations <- fert_df %>% 
  filter(year==2004, location!="Australia/New Zealand") %>%
  group_by(location) %>%
  summarise(TFR = sum(rate)) %>%
  filter(TFR>=3, TFR<4) %>%
  pull(location)

#data frame with fertility information 
#from selected locations tfr within 95% CI of Atikamekw
#year 2001
fert_df_red <- fert_df %>% filter(location %in% locations, year %in% c(2004, 2099)) 

#transform data into age distributions (years 2004, i.e. 2001, and 2099, i.e. 2096)
fert_age_dist_01 <- lapply(locations, function(x)
  rep(seq(12.5, 52.5, 5), round(c(fert_df_red %>% filter(location==x, year==2004) %>% pull(rate))*1000)))

fert_age_dist_96 <- lapply(locations, function(x)
  rep(seq(12.5, 52.5, 5), round(c(fert_df_red %>% filter(location==x, year==2099) %>% pull(rate))*1000)))

#find the mean of the logged values specific to each year and location
fert_mean_01 <- sapply(1:length(locations), function(x) fert_age_dist_01[[x]] %>% log() %>% mean())
fert_mean_96 <- sapply(1:length(locations), function(x) fert_age_dist_96[[x]] %>% log() %>% mean())

#find the sd of the logged values specific to each year and location
fert_sd_01 <- sapply(1:length(locations), function(x) fert_age_dist_01[[x]] %>% log() %>% sd())
fert_sd_96 <- sapply(1:length(locations), function(x) fert_age_dist_96[[x]] %>% log() %>% sd())

#print the statistics in each year (for the method section of the paper)
data.frame(year = c(2001, 2096), 
           mean = round(c(mean(exp(fert_mean_01)), mean(exp(fert_mean_96))), 1),
           sd = round(c(mean(exp(fert_sd_01)), mean(exp(fert_sd_96))), 2))

#find the slopes of the means between 2001 and 2096
fert_slopes_means <- (fert_mean_96 - fert_mean_01) / 20

#find the slopes of the sds between 2001 and 2096
fert_slopes_sds <- (fert_sd_96 - fert_sd_01) / 20

#find the mean and the sd of the means
mean_means <- mean(fert_mean_01)
sd_means <- sd(fert_mean_01)

#find the mean and the sd of the sds
mean_sds <- mean(fert_sd_01)
sd_sds <- sd(fert_sd_01)

#find the mean and sd of the slope means
slopes_means_mean <- mean(fert_slopes_means)
slopes_means_sd <- sd(fert_slopes_means)

#find the mean and sd of the slope sds
slopes_sds_mean <- mean(fert_slopes_sds)
slopes_sds_sd <- sd(fert_slopes_sds)

#generate random schedules
#slope
slope_mean <- rnorm(3000, slopes_means_mean, slopes_means_sd) 
slope_sd <- rnorm(3000, slopes_sds_mean, slopes_sds_sd) 

#mean and sd in 1996
mean_01 <- rnorm(3000, mean_means, sd_means)
sd_01 <- rnorm(3000, mean_sds, sd_sds)

#projection of the mean and sd
proj_means <- sapply(1:3000, function(x) mean_01[x] + slope_mean[x]*1:20)
proj_sds <- sapply(1:3000, function(x) sd_01[x] + slope_sd[x]*1:20)

#estimation of the schedule in each year
fert_sched <- lapply(1:3000, function(x) 
  sapply(1:20, function(y) dlnorm(seq(10, 50, 5), proj_means[y,x], proj_sds[y,x])))

#standardize schedules so that the sum of each = 1
fert_sched_sums <- sapply(1:3000, function(x) apply(fert_sched[[x]], 2, sum))
fert_sched <- lapply(1:3000, function(x) 
  sapply(1:20, function(y) fert_sched[[x]][ , y] / fert_sched_sums[y, x]))

#intergenerational transmission = xTFR X age schedules (years 1996-2101)
#freeze model
it_freeze <- lapply(1:27, function(x) 
  lapply(1:3000, function(r) 
    sapply(1:10, function(y) pred_val[[x]][y+4, r]*fert_sched[[r]][ , y])))

#add the years 2051-2096 (constant)
it_freeze <- lapply(1:27, function(x) 
  lapply(1:3000, function(r) 
    cbind(it_freeze[[x]][[r]], pred_val[[x]][14, r]*fert_sched[[r]][ , 11:20])))

#unlimited model
it_unlimit <- lapply(1:27, function(x) 
  lapply(1:3000, function(r) 
    sapply(1:20, function(y) pred_val[[x]][y+4, r]*fert_sched[[r]][ , y])))

#Excluding the extreme total population values
#freeze model
it_freeze_excl_size <- lapply(1:27, function(x) 
  lapply(1:3000, function(r) 
    sapply(1:10, function(y) pred_val_excl_size[[x]][y+4, r]*fert_sched[[r]][ , y])))

#add the years 2051-2096 (constant)
it_freeze_excl_size <- lapply(1:27, function(x) 
  lapply(1:3000, function(r) 
    cbind(it_freeze_excl_size[[x]][[r]], pred_val_excl_size[[x]][14, r]*fert_sched[[r]][ , 11:20])))

#unlimited model
it_unlimit_excl_size <- lapply(1:27, function(x) 
  lapply(1:3000, function(r) 
    sapply(1:20, function(y) pred_val_excl_size[[x]][y+4, r]*fert_sched[[r]][ , y])))

#Now excluding the extreme age values
#freeze model
it_freeze_excl_age <- lapply(1:27, function(x) 
  lapply(1:3000, function(r) 
    sapply(1:10, function(y) pred_val_excl_age[[x]][y+4, r]*fert_sched[[r]][ , y])))

#add the years 2051-2096 (constant)
it_freeze_excl_age <- lapply(1:27, function(x) 
  lapply(1:3000, function(r) 
    cbind(it_freeze_excl_age[[x]][[r]], pred_val_excl_age[[x]][14, r]*fert_sched[[r]][ , 11:20])))

#unlimited model
it_unlimit_excl_age <- lapply(1:27, function(x) 
  lapply(1:3000, function(r) 
    sapply(1:20, function(y) pred_val_excl_age[[x]][y+4, r]*fert_sched[[r]][ , y])))

################################################################################
#STEP 4: PROJECTION
################################################################################
#create folder to store results
dir.create("Results")

#define function
forecast_fct <- function(l, runs, agedis_list, it_list, label){
  
  #create list for results
  results_list <- list()
  
  for(r in 1:runs){
    
    #establish population in 2001
    pop <- list(c(agedis_list[[l]][ , r], rep(0, 3)))
    
    #specify mortality
    qx <- if(l == 12){1-surv_prob_inuit[ , -c(1:4)]}else{1-surv_prob_fn[ , -c(1:4)]}
    
    #intergenerational transmission
    it <- it_list[[l]][[r]]
    
    #Loop through 2101
    for (i in 1:20){
      
      #remove the dead
      pop[[i+1]] <- pop[[i]] - rbinom(20, pop[[i]], qx[-1, i])
      
      #find the total number of newborns who acquire the language
      newborns <- sum(rbinom(20, pop[[i+1]], c(0, 0, it[-9, i], rep(0, 10))))
      
      #remove the newborns that die between 0 and 2.5
      newborns <- newborns - rbinom(1, newborns, qx[1, i])
      
      #add the newborns to the population next year
      pop[[i+1]] <- c(newborns, pop[[i+1]])[-21]
      
    }
    
    results_list[[r]] <- bind_rows(lapply(1:21, function(i) 
      data.frame(name = names[l],
                 run = r, 
                 age = seq(0, 95, 5),
                 year = seq(2001, 2101, 5)[i],
                 pop = pop[[i]])))
    
  }
  
  results_list %>% bind_rows() %>% saveRDS(paste("Results/", names[l], label, sep = ''))
  
  }

#vector of specification names
specifications <- c("_excl_none_freeze", "_excl_size_freeze", "_excl_age_freeze",
                    "_excl_none_unlimit", "_excl_size_unlimit", "_excl_age_unlimit")

#Run function
lapply(1:length(names), function(x) forecast_fct(x, 3000, agedis_stzd,      it_freeze,           "_excl_none_freeze"))
lapply(1:length(names), function(x) forecast_fct(x, 3000, agedis_excl_size, it_freeze_excl_size, "_excl_size_freeze"))
lapply(1:length(names), function(x) forecast_fct(x, 3000, agedis_excl_age,  it_freeze_excl_age,  "_excl_age_freeze"))
lapply(1:length(names), function(x) forecast_fct(x, 3000, agedis_stzd,      it_unlimit,          "_excl_none_unlimit"))
lapply(1:length(names), function(x) forecast_fct(x, 3000, agedis_excl_size, it_unlimit_excl_size,"_excl_size_unlimit"))
lapply(1:length(names), function(x) forecast_fct(x, 3000, agedis_excl_age,  it_unlimit_excl_age, "_excl_age_unlimit"))

################################################################################
#STEP 5: ANALYSIS
#5.0 COMPARISON OF RESULTS BETWEEN SPECIFICATIONS
#5.1 FIG 1: MAP
#5.2 FIG 2: ILLUSTRATION OF METHODS
#5.3 FIG 3: TOTAL NUMBER OF SPEAKERS 2001-2101
#5.4 TABLE 1: POPULATION SIZE & AGE OF YOUNGEST SPEAKER, 2001 & 2101
#5.5 TABLE 2: EXTINCTION RISKS
#5.6 FIG 4: TREEMAPS
#5.7 VALIDITY CHECK, PERIOD 2001-2021
################################################################################
#5.0 COMPARISON OF RESULTS BETWEEN SPECIFICATIONS###############################
#Number of speakers in 2101, by language and specification
spk_nb <- bind_rows(
  lapply(specifications, function(s)
    bind_rows(
      lapply(names, function(x) readRDS(paste("Results/", x, s, sep = '')))) %>%
      filter(year==2101) %>%
      group_by(name, run) %>%
      summarise(sum_pop = sum(pop, na.rm = T)) %>%
      group_by(name) %>%
      summarise(q10 = quantile(sum_pop, .10),
                q50 = quantile(sum_pop, .50),
                q90 = quantile(sum_pop, .90)) %>%
      mutate(specification = s)
  ))

#Extinction risk in 2101, by language and specification
ext <- bind_rows(
  lapply(specifications, function(s)
    bind_rows(
      lapply(names, function(x) readRDS(paste("Results/", x, s, sep = ''))
      )
    ) %>%
      filter(year==2101) %>%
      group_by(name, run) %>%
      summarise(sum_pop = sum(pop, na.rm = T)) %>%
      filter(sum_pop==0) %>%
      group_by(name) %>%
      summarise(ext_risk = n()/3000) %>%
      mutate(specification = s) 
  ))

#table with speaker population sizes in 2101 across specifications
spk_nb %>%
  arrange(q50) %>%
  mutate(value = paste(round(q50), " (", round(q10), "-", round(q90), ")", sep = "")) %>%
  dplyr::select(name, value, specification) %>%
  pivot_wider(names_from = specification, values_from = value) %>%
  print(n = 27) %>%
  write.xlsx("Figures/comparison_speakers_acrossspecifications.xlsx")

#table with extinction risks in 2101 across specifications
ext %>%
  arrange(-ext_risk) %>%
  pivot_wider(names_from = specification, values_from = ext_risk) %>%
  print(n = 27) %>%
  write.xlsx("Figures/comparison_extinction_acrossspecifications.xlsx")

#5.1 FIG 1: MAP ################################################################
#decide on which specification to use for the main results
specification <- "_excl_age_freeze"

#create folder to store figures
dir.create("Figures")

#Population in 2001 and coordinates
fig1_df <- bind_rows(lapply(names, function(x)
  readRDS(paste("Results/", x, specification, sep = '')) %>% 
    filter(year==2001) %>%
    group_by(name, year, run) %>%
    summarise(pop = sum(pop)) %>% 
    group_by(name, year) %>% 
    summarise(q2 = quantile(pop, .5, na.rm = T)))) %>%
  mutate(year = factor(year)) %>%
  left_join(read.xlsx("https://zenodo.org/records/12530583/files/coordinates.xlsx?download=1")) %>%
  mutate(latitude = ifelse(name == "Cree langs", 54, latitude),
         longitude = ifelse(name == "Cree langs", -77, longitude), 
         latitude = ifelse(name == "Gwich'in", 67, latitude),
         longitude = ifelse(name == "Gwich'in", -139, longitude),
         latitude = ifelse(name == "Mohawk", 45, latitude),
         longitude = ifelse(name == "Mohawk", -74, longitude),
         latitude = ifelse(name == "Ktunaxa", 49, latitude),
         longitude = ifelse(name == "Ktunaxa", -115, longitude))

#map data from natural earth
world <- ne_countries(scale = "small", country=c("Canada", "United States"), returnclass = "sf")

# filter to bbox
bbox <- st_bbox(world)

#set limits
ylimits <- c(min(fig1_df$latitude)-1, max(fig1_df$latitude)+8)
xlimits <- c(min(fig1_df$longitude)-2, max(fig1_df$longitude)+8)

#Not sure what this does cause I copied it from Simon's
world.rst <- ne_load(type="MSR_50M", category='raster', destdir=".", returnclass="sf")
world.rst.df <- raster::as.data.frame(world.rst, xy=TRUE)
world.rst.df <- world.rst.df[dplyr::between(world.rst.df$x, bbox[['xmin']], bbox[['xmax']]), ]
world.rst.df <- world.rst.df[dplyr::between(world.rst.df$y, xlimits[[1]], xlimits[[2]]), ]

#Produce map
ggplot(world)+
  geom_sf(fill = "mintcream")+
  geom_point(data=fig1_df,
             aes(x=longitude, y=latitude, size=q2, color=family, group = family),
             alpha=0.5)+
  theme(
    panel.background = element_rect(fill = "slategray1"),
    axis.ticks = element_blank(),
    axis.title = element_blank(),
    axis.text = element_blank(),
    axis.line = element_blank(),
    legend.key.size = unit(1, "cm"),
    legend.key.height = unit(1, 'cm'),
    legend.title = element_text(size=12), 
    legend.text = element_text(size=12)
  )+
  guides(colour = guide_legend(override.aes = list(size=10)))+
  scale_color_brewer(palette = "Set1")+
  geom_text_repel(data=fig1_df,
                  aes(x=longitude, y=latitude, label=name), max.overlaps = 27)+
  scale_y_continuous(breaks = seq(45, 75, 10), limits = ylimits)+
  xlim(xlimits)+
  scale_size_continuous(breaks=c(100, 1000, 10000),
                        range=c(2, 20),
                        name="Population size",
                        limit=c(0, 100000))+
  theme_bw(base_size=14)

ggsave('Figures/Fig1.tiff', width=9, height=6.5, dpi=1200)

#5.2 FIG. 2: ILLUSTRATION OF METHODS############################################
#EXAMPLE WITH OJI-CREE
#specify run
r <- 1

#pyramids w raw data
A <- ggplot(sc %>% filter(name=="Oji-Cree", age<100) %>% mutate(year = paste("Census of", year)))+
  geom_col(aes(age, pop, fill = year, group = year))+
  facet_wrap(~year, ncol = 1)+
  coord_flip()+
  ylab("Number of speakers")+
  xlab("Age")+
  scale_y_continuous(breaks = c(0, 400, 800, 1200))+
  theme(legend.position = 'none')+
  scale_fill_brewer(palette = 'Set1')

#Standard error and mean specific to Oji-Cree
se <- sd(sum_lc[, 19])/sqrt(5)
m <- mean(sum_lc[,19])

#Data frame with total population sizes and density of T-distribution
df <- data.frame(values = round(seq(-36*64, 36*64, round(se/10))+m), 
                 density = c(dt(seq(-36*64, 36*64, round(se/10))/se, 4))) %>%
  mutate(points = c(rep(NA, 6), 0, rep(NA, 12), 0, rep(NA, 25), 0, rep(NA, 5), 0, rep(NA, 6), 0, rep(NA, 14)),
         Census = factor(c(rep(NA, 6), 2021, rep(NA, 12), 2011, rep(NA, 25), 2001, rep(NA, 5), 2016, rep(NA, 6), 2006, rep(NA, 14))))

#Figure
B1 <- ggplot(df)+
  geom_area(aes(x = values, y = density), alpha = .3)+
  geom_point(aes( x = values, y = points, color = Census, group = Census), size = 4)+
  xlab("Population size")+
  ylab("")+
  scale_color_brewer(palette = 'Set1', na.translate = F)+
  theme(axis.text.y = element_blank(),
        axis.ticks.y = element_blank(),
        legend.position = c(.85, .65),
        legend.text = element_text(size = 7), 
        legend.title = element_text(size = 8),
        legend.key.size = unit(.4, "cm"))+
  scale_x_continuous(breaks = c(7000, 9000, 11000))

#density in given run
df <- bind_rows(
  lapply(1:5, function(c) 
    data.frame(age = seq(0, 80, 5),
               census = factor(seq(2001, 2021, 5)[c]),
               lower = agedis_excl_age[[19]] %>% apply(., 1, quantile, .1),
               upper = agedis_excl_age[[19]] %>% apply(., 1, quantile, .9),
               observed = c(counts_stzd_excl[[19]][ , c]))))

#figure
B2 <- ggplot(df)+
  geom_point(aes(age, observed, group = census, color = census), size = 1.5)+
  geom_segment(aes(x = age, xend = age, y = lower, yend = upper), linewidth = 3, alpha = .03)+
  coord_flip()+
  ylab("Number of speakers")+
  xlab("Age")+
  theme(legend.position = 'none')+
  scale_color_brewer(palette = "Set1")

#Population in 2001, run r
df <- data.frame(age = seq(0, 80, 5), pop = agedis_excl_age[[19]][ , r], year = "Year 2001")

#figure
C1 <- ggplot(df)+
  geom_col(aes(age, pop))+
  coord_flip()+
  ylab("Number of speakers")+
  xlab("Age")+
  scale_y_continuous(breaks = c(0, 400, 800))+
  scale_x_continuous(breaks = c(0, 40, 80))+
  facet_wrap(~year)

#xTFR
df <- data.frame(year = seq(1981, 2101, 5), 
                 estimated = c(xTFR[[19]][r, ], rep(NA, 20)), 
                 modelled = c(pred_val_excl_age[[19]][1:14, r], rep(pred_val_excl_age[[19]][14, r], 11)))

#figure
C2 <- ggplot(df)+
  geom_point(aes(year, estimated), shape = 1, size = 2)+
  geom_line(aes(year, modelled), linewidth = 1, linetype = 'dotted')+
  expand_limits(y=0)+
  xlab("Year")+
  ylab("Avg.no.of children")+
  scale_x_continuous(breaks = c(2001, 2051, 2101))+
  scale_y_continuous(breaks = c(0, 1, 2))

#results: number of speakers by age in selected years
df <- readRDS(paste("Results/Oji-Cree", specification, sep = '')) %>% 
  filter(run==r, year %in% c(2051, 2101)) %>%
  mutate(year = paste("Year", year))

#figure
D1 <- ggplot(df)+
  geom_col(aes(age, pop))+
  coord_flip()+
  ylab("Number of speakers")+
  xlab("Age")+
  facet_wrap(~year)+
  theme(panel.spacing = unit(.5, "lines"))+
  scale_x_continuous(breaks = c(0, 50, 100))+
  scale_y_continuous(breaks = c(0, 400, 800))

#results: total number of speakers by year
df <- readRDS(paste("Results/Oji-Cree", specification, sep = '')) %>% 
  filter(run==r) %>%
  group_by(year) %>%
  summarise(popsize = sum(pop))

#figure
D2 <- ggplot(df)+
  geom_line(aes(year, popsize))+
  expand_limits(y = 0)+
  ylab("Total size")+
  xlab("Year")+
  scale_x_continuous(breaks = seq(2001, 2101, 50))+
  scale_y_continuous(breaks = c(0, 5000, 10000))

#empty figure
empty <- ggplot()+
  theme_minimal()

#arrange together
ggarrange(A,
          ggarrange(ggarrange(B1, B2),
                    ggarrange(empty, C1, empty, C2, 
                              widths = c(.75, 2, .75, 2.5), nrow = 1), 
                    ggarrange(D1, D2, widths = c(3, 2)), 
                    ncol = 1, 
                    labels = c("B", "C", "D"), 
                    heights = c(3,2,2)),
          labels = c("A", ""),
          widths = c(2, 6))

#save
ggsave("Figures/NewFig2.tiff", dpi = 1200, width = 8, height = 6)

#5.3 FIG 3: TOTAL NUMBER OF SPEAKERS 2001-2101##################################
#df with population sizes whole period
fig3_df <- bind_rows(lapply(names, function(x)
  readRDS(paste("Results/", x, specification, sep = '')) %>% 
    filter(year>=2001) %>%
    group_by(run, name, year) %>%
    summarise(sum_pop = sum(pop, na.rm = T)) %>%
    group_by(name, year) %>%
    summarise(q2 = quantile(sum_pop, .5, na.rm = T), 
              lower = quantile(sum_pop, .1, na.rm = T), 
              upper = quantile(sum_pop, .9, na.rm = T)))) 

#languages by initial and final size
groups <- fig3_df %>% 
  filter(year==2001 | year==2101) %>% 
  dplyr::select(name, year, q2) %>%
  pivot_wider(names_from = year, values_from = q2) %>%
  arrange(`2001`) %>%
  ungroup() %>%
  mutate(initial = factor(rep(1:9, each = 3))) %>%
  group_by(initial) %>%
  arrange(`2101`, .by_group = T) %>%
  ungroup() %>%
  mutate(final = factor(rep(1:3, 9))) %>%
  dplyr::select(name, initial, final)

#add groups to the main df
fig3_df <- fig3_df %>% left_join(groups) %>% ungroup()

#function for the breaks on the plot's y axis
breaks_fct <- function(q2) {
  
  x <- floor(log10(max(q2)))-1
  
  c <- floor(max(q2) / 10^x)*10^x
  
  c(0, c/2, c)
  
}

#name labels: change to na except year where should appear
fig3_df <- fig3_df %>% 
  mutate(coord_year = case_when(
    name == "Ktunaxa" ~ 2031, 
    name == "Haida" ~ 2021,
    name == "Tlingit" ~ 2081,
    
    name == "Gwich'in" ~ 2026, 
    name == "Tsimshian" ~ 2026,
    name == "Nuu-chah-nulth" ~ 2076,
    
    name == "Mohawk" ~ 2081,  
    name == "Ntlakapamux" ~ 2036,
    name == "Wolastoqewi" ~ 2026,
    
    name == "Tsilhqot'in" ~ 2021,  
    name == "Secwepemctsin" ~ 2071,
    name == "Nisga'a" ~ 2036,
    
    name == "Anicinabemowin" ~ 2051,  
    name == "Dakelh" ~ 2046,
    name == "Gitxsan" ~ 2021,
    
    name == "Blackfoot" ~ 2026,  
    name == "Tlicho" ~ 2066,
    name == "Slavey (N & S)" ~ 2031,
    
    name == "Atikamekw" ~ 2081,  
    name == "Mi'kmaq" ~ 2016,
    name == "Dakotan langs" ~ 2046,
    
    name == "Innu-Naskapi" ~ 2056,  
    name == "Dene" ~ 2093,
    name == "Oji-Cree" ~ 2071,
    
    name == "Cree langs" ~ 2021,  
    name == "Inuktitut" ~ 2086,
    name == "Ojibwa langs" ~ 2051
    
))
  
fig3_df <- fig3_df %>% 
  mutate(coord_size = case_when(
    name == "Ktunaxa" ~ 95, 
    name == "Haida" ~ 20,
    name == "Tlingit" ~ 20,
    
    name == "Gwich'in" ~ 360, 
    name == "Tsimshian" ~ 20,
    name == "Nuu-chah-nulth" ~ 140,
    
    name == "Mohawk" ~ 180,  
    name == "Ntlakapamux" ~ 480,
    name == "Wolastoqewi" ~ 45,
    
    name == "Tsilhqot'in" ~ 860,  
    name == "Secwepemctsin" ~ 300,
    name == "Nisga'a" ~ 50,
    
    name == "Anicinabemowin" ~ 1500,  
    name == "Dakelh" ~ 770,
    name == "Gitxsan" ~ 300,
    
    name == "Blackfoot" ~ 3000,  
    name == "Slavey (N & S)" ~ 2400,
    name == "Tlicho" ~ 1300,

    name == "Atikamekw" ~ 11000,  
    name == "Mi'kmaq" ~ 8500,
    name == "Dakotan langs" ~ 1000,
    
    name == "Innu-Naskapi" ~ 16000,  
    name == "Dene" ~ 9900,
    name == "Oji-Cree" ~ 4000,
    
    name == "Cree langs" ~ 80000,  
    name == "Inuktitut" ~ 65000,
    name == "Ojibwa langs" ~ 20500
    
  ))

#plot
ggplot(fig3_df, aes(year, q2))+
  geom_line(aes(group = final, color = final), alpha = .8, linetype = 'dashed')+
  geom_ribbon(aes(x = year, ymin = lower, ymax = upper, group = final, fill = final), alpha = .1)+
  geom_text(aes(label = name, x = coord_year, y = coord_size))+
  facet_wrap(~ initial, scale = 'free_y')+
  scale_y_continuous(breaks = breaks_fct, limits = c(0, NA))+
  scale_x_continuous(breaks = c(2001, 2051, 2101))+
  scale_color_brewer(palette = "Set1")+
  scale_fill_brewer(palette = "Set1")+
  theme(strip.text.x = element_blank())+
  ylab("Number of speakers")+
  xlab("Year")+
  theme(legend.position = "none")

ggsave("Figures/Fig3.tiff", width = 7.5, height = 5, dpi = 1200)

#all languages
all <- bind_rows(lapply(names, function(x)
  readRDS(paste("Results/", x, specification, sep = '')))) %>% 
    group_by(run, year) %>% 
    summarise(sum_pop = sum(pop)) %>%
    group_by(year) %>%
    summarise(q2 = quantile(sum_pop, .5, na.rm = T), 
              lower = quantile(sum_pop, .1, na.rm = T), 
              upper = quantile(sum_pop, .9, na.rm = T))

#5.4 TABLE 1: POP. SIZE & AGE OF YOUNGEST SPEAKER, 2001-2101####################
#df with populatoin sizes in 2001 and 2101, for each language
tbl1_df_sum <- bind_rows(lapply(names, function(x)
  readRDS(paste("Results/", x, specification, sep = '')) %>% 
    filter(year==2001 | year==2101))) %>%
  group_by(run, name, year) %>%
  summarise(sum_pop = sum(pop, na.rm = T)) %>%
  group_by(name, year) %>%
  summarise(q2 = round(quantile(sum_pop, .5, na.rm = T)), 
            lower = round(quantile(sum_pop, .1, na.rm = T)), 
            upper = round(quantile(sum_pop, .9, na.rm = T))) %>%
  mutate(metric = "speakers")

#df with it values in 2001 and 2101, for each language
tbl1_df_it <- bind_rows(lapply(names, function(x) 
  readRDS(paste('IntergenerationalTransmission/', x, sep = '')) %>% 
    filter(year==2001 | year==2101))) %>%
  group_by(year, name) %>%
  summarise(q2 = round(quantile(modelled, .5, na.rm = T), 2),
            lower = round(quantile(modelled, .1, na.rm = T), 2),
            upper = round(quantile(modelled, .9, na.rm = T), 2)) %>%
  mutate(metric = "it")

#vector of names ordered by population size in 2001
names_ordered <- tbl1_df_sum %>% filter(year==2001) %>% arrange(q2) %>% pull(name)

#clean and save table
tbl1_df <- tbl1_df_sum %>% 
  mutate(value = paste(trimws(format(q2, big.mark = ',')), " (", trimws(format(lower, big.mark = ',')),"-", trimws(format(upper, big.mark = ',')), ")", sep = ""),
         name = factor(name, levels=names_ordered)) %>%
  dplyr::select(name, year, value, metric) %>% 
  pivot_wider(names_from = metric, values_from = value) %>%
  left_join(tbl1_df_sum %>% filter(year==2001) %>% dplyr::select(name, q2) %>% rename(initial=q2)) %>%
  arrange(initial) %>%
  dplyr::select(-initial) %>%
  pivot_wider(names_from = year, values_from = speakers) %>%
  dplyr::select(name, `2001`, `2101`) %>% 
  rename(X2001 = `2001`, X2101 = `2101`) %>%
  bind_rows(data.frame(name = "Total", 
                      `2001` = paste(all %>% filter(year==2001) %>% pull(q2) %>% round(),
                                     " (",
                                     all %>% filter(year==2001) %>% pull(lower) %>% round(),
                                     "-",
                                     all %>% filter(year==2001) %>% pull(upper) %>% round(),
                                     ")",
                                     sep = ""),
                       `2101` = paste(all %>% filter(year==2101) %>% pull(q2) %>% round(),
                                      " (",
                                      all %>% filter(year==2101) %>% pull(lower) %>% round(),
                                      "-",
                                      all %>% filter(year==2101) %>% pull(upper) %>% round(),
                                      ")",
                                      sep = "")))

tbl1_df %>% print(n = 28)

write.xlsx(tbl1_df, "Figures/table1.xlsx")

#5.5 TABLE 2: EXTINCTION RISKS##################################################
#table 2: extinction risks, all ages, all years
tbl2_allages <- bind_rows(lapply(names, function(x)
  readRDS(paste("Results/", x, specification, sep = '')))) %>%
  group_by(run, name, year) %>%
  summarise(sum_pop = sum(pop)) %>%
  filter(sum_pop==0) %>%
  group_by(name, year) %>%
  summarise(extinct = n()/30) %>% 
  mutate(group = 'all')

#extinction risks: no speakers below age 50
tbl2_below50 <- bind_rows(lapply(names, function(x)
  readRDS(paste("Results/", x,specification, sep = '')))) %>%
  filter(age<50) %>%
  group_by(run, name, year) %>%
  summarise(sum_pop = sum(pop)) %>%
  filter(sum_pop==0) %>%
  group_by(name, year) %>%
  summarise(extinct = n()/30) %>% 
  mutate(group = 'below50')

#extinction risks: no speakers below age 15
tbl2_below15 <- bind_rows(lapply(names, function(x)
  readRDS(paste("Results/", x, specification, sep = '')))) %>%
  filter(age<15) %>%
  group_by(run, name, year) %>%
  summarise(sum_pop = sum(pop)) %>%
  filter(sum_pop==0) %>%
  group_by(name, year) %>%
  summarise(extinct = n()/30) %>% 
  mutate(group = 'below15')

#merge everything
tbl2 <- bind_rows(tbl2_allages, tbl2_below50, tbl2_below15) 

#add year when risk reaches specific threshold
tbl2 <- tbl2 %>% 
  left_join(tbl2 %>%  
              filter(extinct>=10) %>%
              group_by(name, group) %>%  
              summarise(risk10 = min(year))) %>%
  left_join(tbl2 %>%  
              filter(extinct>=50) %>%
              group_by(name, group) %>%  
              summarise(risk50 = min(year))) %>%
  filter(year==2101) %>%
  arrange(group, -extinct) %>%
  mutate(value = paste(round(extinct, 1), " (", ifelse(is.na(risk10), " ", risk10), "-", ifelse(is.na(risk50), " ", risk50), ")", sep="")) %>%
  dplyr::select(name, group, value) %>%
  pivot_wider(values_from = value, names_from = group) %>%
  dplyr::select(name, all, below50, below15) 

tbl2  %>% print(n = 27)

write.xlsx(tbl2, "Figures/table2.xlsx")

#5.6 FIG. 4: TREEMAPS###########################################################
#2001 data
df01 <- bind_rows(lapply(names, function(x) 
  readRDS(paste("Results/", x, specification, sep = '')))) %>%
  filter(year==2001) %>%
  group_by(run, name) %>%
  summarise(pop = sum(pop)) %>%
  group_by(name) %>%
  summarise(med = quantile(pop, .5)) %>%
  mutate(year = 2001) 

#2101 data
df2101 <- bind_rows(lapply(names, function(x) 
  readRDS(paste("Results/", x, specification, sep = '')))) %>%
  filter(year==2101) %>%
  group_by(run, name) %>%
  summarise(pop = sum(pop)) %>%
  group_by(name) %>%
  summarise(med = quantile(pop, .5, na.rm = T)) %>%
  mutate(year = 2101) 

#put together, add Family
df_treemap <- bind_rows(df01, df2101) %>%
  left_join(read.xlsx('https://zenodo.org/records/12530583/files/coordinates.xlsx?download=1') %>% 
              rename(Family = family) %>%
              dplyr::select(name, Family))

#size of maps
sizes <- df_treemap %>% group_by(year) %>% summarise(total = sum(med)) %>% pull(total) 
sizes[2] / sizes[1] 

#plot 1: 2001
ggplot(df_treemap %>% filter(year==2001), aes(area = med, fill = Family, label = name, subgroup = Family))+
  geom_treemap(colour = "black", size = 2)+
  geom_treemap_text()+
  scale_fill_brewer(palette = "Set1")+
  theme(legend.position = 'none')

ggsave("Figures/treemap1.tiff", height = 3.3, width=5, dpi=1200)

#plot 2: 2101
ggplot(df_treemap %>% filter(year==2101), aes(area = med, fill = Family, label = name, subgroup = Family))+
  geom_treemap(colour = "black", size = 2)+
  geom_treemap_text()+
  scale_fill_brewer(palette = "Set1")

ggsave("Figures/treemap2.tiff", height = 3.3, width=5, dpi=1200)

##5.7 VALIDITY CHECK, PERIOD 2001-2021##########################################
#table counts by age years 2001 through 2021
st3 <- bind_rows(lapply(names, function(x)
  readRDS(paste("Results/", x, specification, sep = '')) %>% 
    filter(year>=2001 & year<=2021))) %>%
  group_by(name, year, age) %>%
  mutate(pop = ifelse(is.na(pop), 0, pop),
         pop2 = ifelse(sample(c(0,1), 1)==0, floor(pop/5)*5, ceiling(pop/5)*5)) %>%
  summarise(median = quantile(pop, .5),
            tenperc = quantile(pop, .1),
            ninetyperc = quantile(pop, .9))

#add the real data
st3_sc <- left_join(st3, sc) %>% 
  left_join(
    data.frame(exclude1 = c(exclude_total),
               name = rep(names, each =5),
               year = seq(2001, 2021, 5))
    ) %>%
  left_join(
    bind_rows(
      lapply(1:27, function(x) 
        data.frame(exclude2 = c(exclude_age[[x]]),
                   name = rep(names[x], each =5*17),
                   age = rep(seq(0, 80, 5), 5),
                   year = rep(seq(2001, 2021, 5), each = 17))
        )
      ) %>%
      mutate(age = age + (year - 2001))
    ) %>%
  mutate(within = ifelse(pop>=tenperc & pop<=ninetyperc, 1, 0),
         exclude2 = ifelse(is.na(exclude2), 0, exclude2))  
  
#proportion correct by year
correct_year <- st3_sc %>%
  filter(exclude1!=1, exclude2!=1) %>%
  group_by(year) %>%
  summarise(total = mean(within)) %>%
  pull(total) 

#proportion correct total
correct_total <- st3_sc %>%
  filter(exclude1!=1, exclude2!=1) %>%
  pull(within) %>%
  mean() %>%
  print()

#proportion correct by language and year
st3_sc %>%
  filter(exclude1!=1, exclude2!=1) %>%
  group_by(name, year) %>%
  summarise(perc_correct = mean(within)) %>%
  pivot_wider(names_from = year, values_from = perc_correct) %>%
  left_join(
    st3_sc %>%
      filter(exclude1!=1, exclude2!=1) %>%
      group_by(name) %>%
      summarise(total = mean(within))
    ) %>%
  bind_rows(data.frame(name = "Total", 
                       year = seq(2001, 2021, 5),
                       within = correct_year) %>%
              pivot_wider(names_from = year, values_from = within) %>%
              mutate(total = correct_total)) %>%
  left_join(data.frame(name = names, n = apply(sum_lc, 2, mean, na.rm = T))) %>%
  print(n = 28) -> a

write.xlsx(a[,-8], "Figures/validation_byyear.xlsx")

#proportion correct by age and year
st3_sc %>%
  filter(exclude1!=1, exclude2!=1) %>%
  group_by(age, year) %>%
  summarise(correct = mean(within)) %>%
  pivot_wider(names_from = year, values_from = correct) %>%
  left_join(st3_sc %>%
              filter(exclude1!=1, exclude2!=1) %>%
              group_by(age) %>%
              summarise(total = mean(within))) %>%
  print(n = 21) -> a
  
write.xlsx(a %>% mutate(age = paste(age, '-', age+4, sep = '')), "Figures/validation_byage.xlsx")

################################################################################
#IN TEXT COMMENTS, APPENDICES
################################################################################
#exclusion of years and age in computation of baseline populations###############
bind_rows(
  lapply(1:27, function(x)
         data.frame(name = names[x],
                    age = rep(seq(0, 80, 5), 5),
                    year = rep(seq(2001, 2021, 5), each = 17),
                    excluded = c(exclude_age[[x]])
         )
  )
) %>%
  filter(excluded==1) 

data.frame(name = rep(names, each = 5), 
           year = seq(2001, 2021, 5), 
           excluded = c(exclude_total)) %>%
  filter(excluded==1)


#Baseline populations###########################################################
#load data
df1 <- bind_rows(lapply(names, function(x)
  readRDS(paste("Results/", x, specification, sep = '')) %>% 
    filter(year==2001) %>%
    group_by(name, year, run) %>%
    summarise(pop = sum(pop)) %>% 
    group_by(name, year) %>% 
    summarise(q2 = quantile(pop, .5, na.rm = T)))) %>%
  mutate(year = factor(year))

#Median and mean number of speakers across all languages
df1 %>% 
  ungroup %>%
  summarise(mean=mean(q2),
            median=quantile(q2, .5))

#Minimum and maximum
bind_rows(df1 %>% ungroup() %>% filter(q2==min(q2)),
          df1 %>% ungroup() %>% filter(q2==max(q2))) %>% 
  distinct(name, q2)

#distribution of total speaker numbers by language##############################
#Standard error and mean 
se <- apply(sum_lc, 2, sd, na.rm = T) / sqrt(n)
m <- apply(sum_lc, 2, mean, na.rm = T) 

#Data frame with total population sizes and density of T-distribution
df <- data.frame(name = names,
                 c01 = round(sum_lc[1, ]),
                 c06 = round(sum_lc[2, ]),
                 c11 = round(sum_lc[3, ]),
                 c16 = round(sum_lc[4, ]),
                 c21 = round(sum_lc[5, ]),
                 mean = round(m, 1),
                 se = round(se, 1)) %>%
  arrange(mean)

write.xlsx(df, "Figures/TableS2.xlsx")

#distributions by age###########################################################
agedis_df <- bind_rows(lapply(1:27, function(l)
  data.frame(name = names[l],
             census = factor(rep(seq(2001, 2021, 5), each = 17)),
             age = rep(seq(0, 80, 5), 5),
             pop = c(counts_stzd_excl[[l]])))) %>%
  left_join(bind_rows(lapply(1:27, function(l) 
    data.frame(name = names[l],
               tenperc = apply(agedis_excl_age[[l]], 1, quantile, .1),
               ninetyperc = apply(agedis_excl_age[[l]], 1, quantile, .9),
               age = seq(0, 80, 5)))))  %>%
  filter(name!="Wolastoqewi" | census!=2001,
         name!="Haida" | census!=2001,
         name!="Ktunaxa" | census!=2006,
         name!="Mohawk" | census!=2016,
         name!="Ktunaxa" | census!=2021,
         name!="Dakotan langs" | census!=2021)

#viz
ggplot(agedis_df %>% 
         filter(name %in% names[1:15])
       )+
  geom_point(aes(age, pop, group = census, color = census), size = .8)+
  geom_segment(aes(x = age, xend = age, y = tenperc, yend = ninetyperc), linewidth = 2, alpha = .05)+
  facet_wrap(~name, ncol = 3, scales = 'free_x')+
  coord_flip()+
  xlab("Age in 2001")+
  ylab("Number of speakers")+
  theme(legend.position = 'none')+
  scale_color_brewer(palette = 'Set1')

ggsave("Figures/Appendix_pyramids1.tiff", width = 6.5, height = 9)

ggplot(agedis_df %>% filter(name %in% names[16:27]))+
  geom_point(aes(age, pop, group = census, color = census), size = .8)+
  geom_segment(aes(x = age, xend = age, y = tenperc, yend = ninetyperc), linewidth = 2, alpha = .05)+
  facet_wrap(~name, ncol = 3, scales = 'free_x')+
  coord_flip()+
  xlab("Age in 2001")+
  ylab("Number of speakers")+
  theme(legend.position = 'bottom')+
  scale_color_brewer(palette = 'Set1')

ggsave("Figures/Appendix_pyramids2.tiff", width = 6.5, height = 8)

#Intergenerational transmission#################################################
#find median and 80% CI, put in df
xTFR_df <- bind_rows(lapply(1:27, function(x) 
  data.frame(name = names[x],
             year = seq(1981, 2101, 5),
             pred_10 = sapply(1:27, function(x)
               apply(pred_val_excl_age[[x]], 1, quantile, .1))[ , x],
             pred_50 = sapply(1:27, function(x)
               apply(pred_val_excl_age[[x]], 1, quantile, .5))[ , x],
             pred_9 = sapply(1:27, function(x)
               apply(pred_val_excl_age[[x]], 1, quantile, .9))[ , x])))

xTFR_df %>% filter(year==2001) %>% arrange(pred_50)
xTFR_df %>% filter(year==2001 | year==2096) %>% distinct(name, year, pred_50) %>%
  pivot_wider(names_from = year, values_from = pred_50) %>% arrange(`2001`/`2096`) %>%
  print(n = 27)

#visualise result
ggplot(xTFR_df %>% filter(year>=2001, year<=2046, name %in% names[1:15]))+
  geom_line(aes(year, pred_50), linewidth=1, linetype = 'dashed')+
  geom_ribbon(aes(year, ymin = pred_10, ymax = pred_9), alpha = .3)+
  facet_wrap(~name, ncol = 3)+
  expand_limits(y = 0)+
  scale_x_continuous(breaks = c(2001, 2021, 2041))+
  scale_y_continuous(breaks = c(0, 1, 2, 3), limits = c(0, 2.1))+
  ylab("Mean number of children")+
  xlab("Year")

ggsave("Figures/Appendix_xTFR1.tiff", height = 7, width = 5)

ggplot(xTFR_df %>% filter(year>=2001, year<=2046, name %in% names[16:27]))+
  geom_line(aes(year, pred_50), linewidth=1, linetype = 'dashed')+
  geom_ribbon(aes(year, ymin = pred_10, ymax = pred_9), alpha = .3)+
  facet_wrap(~name, ncol = 3)+
  expand_limits(y = 0)+
  scale_x_continuous(breaks = c(2001, 2021, 2041))+
  scale_y_continuous(breaks = c(0, 1, 2, 3), limits = c(0, 2.1))+
  ylab("Mean number of children")+
  xlab("Year")

ggsave("Figures/Appendix_xTFR2.tiff", height = 5, width = 5)

#Trajectories###################################################################
#Descending
fig3_df %>% 
  filter(year<=2006, lead(q2)<q2) %>%
  pull(name) %>% 
  unique() -> descending

#Ascending
fig3_df %>% 
  filter(year==2001 | year==2101) %>% 
  distinct(name, year, q2) %>%
  pivot_wider(names_from = year, values_from = q2) %>%
  filter(`2101` > `2001`) %>%
  pull(name) %>%
  print() -> ascending

#Ascending then descending
names[!names %in% c(ascending, descending)]

#Population sizes 2101 vs. 2001#################################################
#proportions
tbl1_df_sum %>%
  pivot_wider(names_from = year, values_from = c(q2, lower, upper)) %>%
  mutate(prop = q2_2101 / q2_2001) %>% 
  dplyr::select(name, q2_2001, q2_2101, prop) %>%
  arrange(prop) %>%
  print(n = 27)

#cumlated sum in 2001
tbl1_df_sum %>%
  ungroup() %>%
  filter(year==2001) %>%
  arrange(q2) %>%
  mutate(cs = cumsum(q2), prop = round(cs / 167536 * 100,2), n = row_number()) %>%
  distinct(name, q2, cs, prop, n) %>%
  print(n = 27)

#cumulated sum in 2101
tbl1_df_sum %>%
  ungroup() %>%
  filter(year==2101) %>%
  arrange(q2) %>%
  mutate(cs = cumsum(q2), prop = round(cs / 167536 * 100,2), n = row_number()) %>%
  distinct(name, q2, cs, prop, n) %>%
  print(n = 27)

tbl1_df_sum %>% 
  left_join(read.xlsx("https://zenodo.org/records/12530583/files/coordinates.xlsx?download=1'") %>% distinct(name, family)) %>%
  group_by(year, family) %>%
  summarise(sum_fam = sum(q2)) %>%
  group_by(year) %>%
  mutate(total = sum(sum_fam),
         prop = sum_fam / total) %>% 
  arrange(prop)

#leaders
a1 <- tbl1_df_sum %>% 
  filter(name %in% c("Atikamekw", "Innu-Naskapi"), year==2001) 

a2 <- tbl1_df_sum %>% 
  filter(name %in% c( "Oji-Cree", "Ojibwa langs"), year==2001) 

b1 <- tbl1_df_sum %>% 
  filter(name %in% c("Atikamekw", "Innu-Naskapi"), year==2101) 

b2 <- tbl1_df_sum %>% 
  filter(name %in% c( "Oji-Cree", "Ojibwa langs"), year==2101) 

sum(a2$q2) / sum(a1$q2)
sum(b2$q2) / sum(b1$q2)

#END############################################################################