This document is the accompanying technical appendix to the article of the same name to be published in the Cambridge Journal of Economics. The R code used for our analysis is embedded in this file.

1 Preparatory work

We load relevant packages:

require(data.table)
require(ggplot2)
require(tidyr)
# require(plyr)
require(knitr)
require(RMySQL)
# require(ineq)
# require(grid)
require(dplyr)
# require(data.tree) 
require(ggrepel)
require(xtable)

Unfortunately, we cannot share our data because they are proprietary (from Clarivate’s Web of Science). Most of our analysis uses a data table with 79 million rows, where each row corresponds to one reference having an SSH article as either the citing or the cited document. Furthermore, we have a MySQL database with more data. We define our connection to this MySQL database here:

# The credentials (`usr` and `pswd`) live in a separate file that is not shared:
source("connection_info.R") # file with the connection info (usr and pswd)

# Connection handle to the local MySQL database:
ESH <- dbConnect(
  MySQL(),
  user = usr,
  password = pswd,
  dbname = "OST_Expanded_SciHum",
  host = "127.0.0.1"
)

We create our disciplinary categories based on the NSF classification while aggregating some fields:

# Discipline names: read the classification file ...
discipline_info <- fread(paste0(path_to_discipline_info, "Liste_Discipline.txt"))

# ... and keep only the four columns used below.
discipline_info <- discipline_info[
  , .(Code_Discipline, EGrande_Discipline, EDiscipline, ESpecialite)
]

# We create a new column, `discipline`, aggregating some specialties into
# disciplines. For each discipline listed below (processed in this order), we
# look up its highest code `i`; every row with a code at or below `i` then
# receives its own EDiscipline as `discipline`.
for (aggregated_disc in c("Psychology", "Humanities", "Arts", "Earth and Space",
                          "Biology", "Mathematics", "Chemistry", "Physics")) {
  i <- discipline_info[EDiscipline == aggregated_disc, max(Code_Discipline)]
  discipline_info[Code_Discipline <= i, discipline := EDiscipline]
}


# Specialties that keep their own label inside each broad category:
core_socsci <- c(
  "Economics", "Geography", "Political Science and Public Administration",
  "Sociology", "Demography", "International Relations",
  "Anthropology and Archaeology"
)
core_pro_fie <- c("Management", "Education", "Law")
core_med <- c(
  "General & Internal Medicine", "Psychiatry",
  "Environmental & Occupational Health", "Neurology & Neurosurgery"
)
cor_eng_tec <- c("Computers", "Operations Research")

# For each broad category we first give every row the residual "Other ..."
# label, then overwrite the rows belonging to a core specialty.

# Aggregating social sciences
discipline_info[EDiscipline == "Social Sciences",
                discipline := "Other Social Sciences"]
discipline_info[EDiscipline == "Social Sciences" & ESpecialite %in% core_socsci,
                discipline := ESpecialite]

# Aggregating professional fields
discipline_info[EDiscipline == "Professional Fields",
                discipline := "Other Professional Fields"]
discipline_info[EDiscipline == "Professional Fields" & ESpecialite %in% core_pro_fie,
                discipline := ESpecialite]

# Aggregating clinical medicine
discipline_info[EDiscipline == "Clinical Medicine",
                discipline := "Other Clinical Medicine"]
discipline_info[EDiscipline == "Clinical Medicine" & ESpecialite %in% core_med,
                discipline := ESpecialite]

# Aggregating Engineering and Technology: the core specialties share one label
discipline_info[EDiscipline == "Engineering and Technology",
                discipline := "Other Engineering \n and Technology"]
discipline_info[EDiscipline == "Engineering and Technology" &
                  ESpecialite %in% cor_eng_tec,
                discipline := "Computers & \n Operations Research"]

# Rows that still have no label fall back to their EDiscipline:
discipline_info[is.na(discipline), discipline := EDiscipline]

# Some long names are broken over two lines for display.

# Political Science
discipline_info[
  discipline == "Political Science and Public Administration",
  discipline := "Political Science &\nPublic Administration"
]
# Anthropology
discipline_info[
  discipline == "Anthropology and Archaeology",
  discipline := "Anthropology &\n Archaeology"
]

# Core social science and humanities:
core_SSH <- c(
  "Psychology",
  "Economics",
  "Political Science &\nPublic Administration",
  "Sociology",
  "Management",
  "Anthropology &\n Archaeology"
)

# One colour per discipline.
# We used Angrist's colors and added purple for Management
colors <- c(
  "Psychology" = "#0000FF",
  "Economics" = "black",
  "Sociology" = "#FF0000",
  "Anthropology &\n Archaeology" = "#CCCC00",
  "Management" = "purple",
  "Political Science &\nPublic Administration" = "#228B22",
  "Others" = alpha("gray", 0.3)
)

# One linetype per discipline.
# We used Angrist's linetype and added dotdash for Management
linetypes <- c(
  "Psychology" = "longdash",
  "Economics" = "solid",
  "Sociology" = "dotted",
  "Anthropology &\n Archaeology" = "solid",
  "Political Science &\nPublic Administration" = "dashed",
  "Management" = "dotdash",
  "Others" = "solid"
)

We group most of the functions used later in the following code block:

# Renaming Anthropology and Political Science to match Angrist et al's
# discipline names.
# The two-line labels are replaced by the short ones in both the
# `Discipline_citing` and `Discipline_cited` columns. The table is updated by
# reference and also returned.
renaming_ssh_angrist <- function(dt_citing_cited_journals) {
  short_labels_ <- c(
    "Anthropology &\n Archaeology" = "Anthropology",
    "Political Science &\nPublic Administration" = "Political Science"
  )
  for (long_label_ in names(short_labels_)) {
    short_label_ <- short_labels_[[long_label_]]
    for (disc_col_ in c("Discipline_citing", "Discipline_cited")) {
      dt_citing_cited_journals[get(disc_col_) == long_label_,
                               (disc_col_) := short_label_]
    }
  }
  dt_citing_cited_journals
}

# Recategorizing journals to match Angrist et al's categories.
# Journals that `dt_angrist_journals` lists (column `Code_Revue`) under
# Accounting, Finance or Marketing get the discipline code 201, 202 or 203 and
# the matching label, on the citing side and then on the cited side. The table
# is updated by reference and also returned.
renaming_business_angrist <- function(dt_citing_cited_journals, dt_angrist_journals) {
  business_codes_ <- c("Accounting" = 201, "Finance" = 202, "Marketing" = 203)

  for (field_label_ in names(business_codes_)) {
    field_code_ <- business_codes_[[field_label_]]
    field_journals_ <- dt_angrist_journals[angrist_disc == field_label_]$Code_Revue

    for (side_ in c("citing", "cited")) {
      journal_col_ <- paste0(side_, "_code_revue")
      target_cols_ <- c(paste0(side_, "_discipline"), paste0("Discipline_", side_))
      dt_citing_cited_journals[get(journal_col_) %in% field_journals_,
                               (target_cols_) := list(field_code_, field_label_)]
    }
  }
  dt_citing_cited_journals
}

# Adds the discipline labels of the citing and of the cited document
# (`Discipline_citing`, `Discipline_cited`) by merging the codes
# `citing_discipline` and `cited_discipline` with the global `discipline_info`.
# merge() is used with its defaults, so rows whose code is absent from
# `discipline_info` are not kept.
adding_disc_name_to_dt_citing_cited <- function(dt_citing_cited_all) {
  code_to_label <- discipline_info[, list(Code_Discipline, discipline)]
  for (side in c("citing", "cited")) {
    dt_citing_cited_all <- merge(dt_citing_cited_all, code_to_label,
                                 by.x = paste0(side, "_discipline"),
                                 by.y = "Code_Discipline")
    setnames(dt_citing_cited_all, "discipline", paste0("Discipline_", side))
  }
  dt_citing_cited_all
}
  

# Set of functions adapted from
# https://gist.github.com/eliocamp/eabafab2825779b88905954d84c82b32
# on 2019-11-14
# They are used to generate two linetype aesthetics
# in the same graph with separate legends:
#' Allows to add another scale
#'
#' @param new_aes character with the aesthetic for which new scales will be
#' created
#' @return the standardised aesthetic name, carrying the class "new_aes"
#'
new_scale <- function(new_aes) {
  aes_name <- ggplot2::standardise_aes_names(new_aes)
  class(aes_name) <- "new_aes"
  aes_name
}

#' Convenient functions: shortcuts to new_scale() for a given aesthetic
new_scale_fill <- function() new_scale("fill")

#' American spelling; same result as new_scale_colour()
new_scale_color <- function() new_scale("colour")

new_scale_colour <- function() new_scale("colour")

new_scale_linetype <- function() new_scale("linetype")

#' Special behaviour of the "+" for adding a `new_aes` object
#' It changes the name of the aesthetic for the previous layers, appending
#' "_new" to them. The same renaming is applied to the scales and the labels
#' already present in the plot.
#'
#' @param object the `new_aes` object being added
#' @param plot the plot built so far
#' @param object_name unused here
#' @return the modified plot
ggplot_add.new_aes <- function(object, plot, object_name) {
  rename_component <- function(component) bump_aes(component, new_aes = object)

  plot$layers <- lapply(plot$layers, rename_component)
  plot$scales$scales <- lapply(plot$scales$scales, rename_component)
  plot$labels <- rename_component(plot$labels)
  plot
}


#' Generic: rename the aesthetic `new_aes` to "<aes>_new" in `layer`.
#' Dispatches on the class of `layer`.
bump_aes <- function(layer, new_aes) UseMethod("bump_aes")

#' Scale method: rename the scale's aesthetics that match `new_aes`
#'
#' A guide given by name (character) is first replaced by the object returned
#' by the corresponding `guide_*()` function, so that its `available_aes` can
#' be renamed too.
bump_aes.Scale <- function(layer, new_aes) {
  matches_target <- remove_new(layer$aesthetics) %in% new_aes
  old_aes <- layer$aesthetics[matches_target]
  bumped_aes <- paste0(old_aes, "_new")

  layer$aesthetics[layer$aesthetics %in% old_aes] <- bumped_aes

  if (is.character(layer$guide)) {
    layer$guide <- match.fun(paste0("guide_", layer$guide))()
  }
  guide_aes <- layer$guide$available_aes
  layer$guide$available_aes[guide_aes %in% old_aes] <- bumped_aes
  layer
}

#' Layer method: rename the layer's mapping for `new_aes` to "<aes>_new"
#'
#' The layer's geom and stat are replaced by ggproto children whose aesthetic
#' lists use the renamed aesthetic and whose `handle_na` renames the data
#' columns back before calling the original `handle_na`.
#'
#' @param layer a layer; its `mapping`, `geom` and `stat` are read and replaced
#' @param new_aes the aesthetic for which a new scale is being added
#' @return the modified layer
bump_aes.Layer <- function(layer, new_aes) {
  # Keep the un-suffixed name: `new_aes` is overwritten two lines below
  original_aes <- new_aes
  
  # Mapped aesthetics that equal `new_aes` once any "_new" suffix is stripped
  old_aes <- names(layer$mapping)[remove_new(names(layer$mapping)) %in% new_aes]
  new_aes <- paste0(old_aes, "_new")
  
  old_geom <- layer$geom
  
  old_setup <- old_geom$handle_na
  # Replacement for handle_na: rename the suffixed data columns back to the
  # original aesthetic name, then defer to the geom's own handle_na
  new_setup <- function(self, data, params) {
    colnames(data)[colnames(data) %in% new_aes] <- original_aes
    old_setup(data, params)
  }
  
  # Child of the original geom that only overrides handle_na
  new_geom <- ggplot2::ggproto(paste0("New", class(old_geom)[1]), old_geom,
                               handle_na = new_setup)
  
  # The geom must declare the suffixed name wherever it declared the old one
  new_geom$default_aes <- change_name(new_geom$default_aes, old_aes, new_aes)
  new_geom$non_missing_aes <- change_name(new_geom$non_missing_aes, old_aes, new_aes)
  new_geom$required_aes <- change_name(new_geom$required_aes, old_aes, new_aes)
  new_geom$optional_aes <- change_name(new_geom$optional_aes, old_aes, new_aes)
  
  layer$geom <- new_geom
  
  # Same treatment for the stat
  old_stat <- layer$stat
  
  old_setup2 <- old_stat$handle_na
  new_setup <- function(self, data, params) {
    colnames(data)[colnames(data) %in% new_aes] <- original_aes
    old_setup2(data, params)
  }
  
  new_stat <- ggplot2::ggproto(paste0("New", class(old_stat)[1]), old_stat,
                               handle_na = new_setup)
  
  new_stat$default_aes <- change_name(new_stat$default_aes, old_aes, new_aes)
  new_stat$non_missing_aes <- change_name(new_stat$non_missing_aes, old_aes, new_aes)
  new_stat$required_aes <- change_name(new_stat$required_aes, old_aes, new_aes)
  new_stat$optional_aes <- change_name(new_stat$optional_aes, old_aes, new_aes)
  
  layer$stat <- new_stat
  
  # Finally rename the aesthetic in the layer's own mapping
  layer$mapping <- change_name(layer$mapping, old_aes, new_aes)
  layer
}

#' List method (used on the plot labels): rename the elements whose name
#' matches `new_aes` once any "_new" suffix is stripped.
bump_aes.list <- function(layer, new_aes) {
  element_names <- names(layer)
  old_aes <- element_names[remove_new(element_names) %in% new_aes]

  names(layer)[element_names %in% old_aes] <- paste0(old_aes, "_new")
  layer
}

#' Generic: replace the aesthetic name `old` by `new` in `list`.
#' Dispatches on the class of `list`.
change_name <- function(list, old, new) UseMethod("change_name")

#' Character method: the aesthetic names are the values of the vector,
#' so the matching values themselves are replaced.
change_name.character <- function(list, old, new) {
  replace(list, list %in% old, new)
}

#' Default method: the aesthetic names are the names of the object,
#' so the matching names are replaced and the values are left untouched.
change_name.default <- function(list, old, new) {
  names(list)[names(list) %in% old] <- new
  list
}

#' NULL method: nothing to rename, NULL is returned.
change_name.NULL <- function(list, old, new) NULL

#' Strip every "_new" suffix from aesthetic names
#' (the pattern "(_new)*" is replaced by the empty string).
remove_new <- function(aes) {
  stringi::stri_replace_all_regex(aes, pattern = "(_new)*", replacement = "")
}

Finally, a few parameters to be reused later:

# First and last year of the period under study
min_period <- 1950
max_period <- 2018

# Number disciplines in graph:
n_disc <- 7

# To change the period on which we count the most cited disciplines
firstYearImpDisc <- 2011

2 Information on the corpus

We first want general information on the journals, starting with the number of journals per discipline in the SSH:

# loading data
dt_citing_cited <- readRDS(paste0(path_to_project_data,
                                  "dt_citing_cited_all.rds"))

# Number of distinct citing journals per discipline code, for the codes above
# 100 and the years of the study period. The rows are filtered first and the
# journals counted directly, so unique() never runs on the full reference table
# and the article identifier (irrelevant for a journal count) is not carried.
dt_journals_disc <- dt_citing_cited[
  Year %in% min_period:max_period & citing_discipline > 100,
  list(tot_j = uniqueN(citing_code_revue)),
  by = citing_discipline
]

# Attach the aggregated discipline label to each code
dt_journals_disc <- merge(dt_journals_disc,
                          discipline_info[, list(Code_Discipline, discipline)],
                          by.x = "citing_discipline", by.y = "Code_Discipline")

# Sum the journal counts of all the codes sharing a label.
# Note that "< 200" is there to remove "Unknown"
dt_journals_disc <- dt_journals_disc[citing_discipline < 200,
                                     list(Number_of_journals = sum(tot_j)),
                                     by = discipline]

dt_journals_disc <- dt_journals_disc[order(-Number_of_journals)]

# Display the core SSH disciplines first, then the others; each group keeps the
# decreasing order set above. seq_len() keeps this correct for an empty table,
# where 1:nrow() would yield c(1, 0).
pos_core_ssh <- which(dt_journals_disc$discipline %in% core_SSH)
pos_other_ssh <- setdiff(seq_len(nrow(dt_journals_disc)), pos_core_ssh)

dt_journals_disc[c(pos_core_ssh, pos_other_ssh)]

# Saving journal numbers
tot_j_ssh <- dt_journals_disc[discipline != "Unknown", sum(Number_of_journals)]

# also for NSE journals (codes up to 100), whether they appear on the citing
# or on the cited side:
n_journals_NSE <- length(union(
  dt_citing_cited[Year %in% min_period:max_period & citing_discipline <= 100,
                  unique(citing_code_revue)],
  dt_citing_cited[Year %in% min_period:max_period & cited_discipline <= 100,
                  unique(cited_code_revue)]
))

rm(dt_citing_cited, dt_journals_disc)

The total number of journals in the SSH is 6699. Their number increases through time as the following figure shows:

dt_citing_cited <- readRDS(paste0(path_to_project_data, "dt_citing_cited_all.rds"))

# Plotting the number of journals per year in the corpus

# Keep the citing side of the references of the study period whose citing
# discipline code lies between 101 and 200.
# NOTE(review): the upper bound 200 is inclusive here, whereas the journal
# count per discipline uses `citing_discipline < 200` to remove "Unknown";
# confirm that code 200 is meant to be kept.
dt_citing_cited <- dt_citing_cited[
  between(Year, min_period, max_period) & between(citing_discipline, 101, 200),
  list(ID_Art, Year, citing_discipline, citing_code_revue)
]

# One row per (year, journal); `rev_total` is the number of journals that year
nb_journals <- unique(
  dt_citing_cited[, list(Year, citing_code_revue)]
)[, rev_total := .N, by = Year]

ggplot(nb_journals, aes(x = Year, y = rev_total)) +
  geom_line(lwd = 1.25) +
  theme_minimal() +
  labs(x = "", y = "Number",
       title = "Number of SSH journals per year in the corpus")

rm(dt_citing_cited)

There are also 13517 journals from the NSE that cite the SSH at least once or are cited at least once by the SSH.

Next, we look at the distribution of articles in the SSH:

dt_citing_cited <- readRDS(paste0(path_to_project_data,
                                  "dt_citing_cited_all.rds"))
dt_citing_cited <- dt_citing_cited[between(Year, min_period, max_period), ]

# Two main data tables: one with a row per reference of every article
# (dt_citing_cited) and one with a row per article only (dt_all_art).
# NOTE(review): the upper bound 200 is inclusive here, whereas the journal
# count per discipline uses `citing_discipline < 200` to remove "Unknown";
# confirm that code 200 is meant to be kept.
dt_citing_cited <- dt_citing_cited[between(citing_discipline, 101, 200),
                                   list(ID_Art, Year,
                                        citing_discipline, citing_code_revue)]
dt_citing_cited <- merge(dt_citing_cited,
                         discipline_info[, list(Code_Discipline, discipline)],
                         by.x = "citing_discipline", by.y = "Code_Discipline")
dt_all_art <- unique(dt_citing_cited)

# Counting number of articles per discipline per year
dt_art_by_j_y <- dt_all_art[, .(nb_art = .N), by = .(Year, discipline)]
setkey(dt_art_by_j_y, discipline, Year)

# Legend order: disciplines sorted by their number of articles in the last
# year. Disciplines without any article in that year are appended after them;
# otherwise they would be missing from the factor levels and turn into NA.
order_disc <- dt_art_by_j_y[Year == max_period][order(-nb_art), discipline]
disc_levels <- union(order_disc, unique(dt_art_by_j_y$discipline))
dt_art_by_j_y[, discipline := factor(discipline, levels = disc_levels)]

# Plotting number of articles per discipline and per year in corpus:
# the other SSH categories are drawn in gray and told apart by linetype,
# the main social sciences are drawn on top and told apart by colour.
ggplot() +
  geom_line(data = dt_art_by_j_y[!discipline %in% core_SSH],
            mapping = aes(x = Year, y = nb_art, linetype = discipline),
            colour = alpha("gray", .5), lwd = 1.25) +
  geom_line(data = dt_art_by_j_y[discipline %in% core_SSH],
            mapping = aes(x = Year, y = nb_art, col = discipline),
            lwd = 1.25) +
  scale_color_manual(name = "Main social sciences", breaks = core_SSH,
                     values = colors) +
  scale_linetype_discrete(name = "Other SSH categories") +
  scale_y_continuous(
    labels = function(x) format(x, big.mark = ",", scientific = FALSE)
  ) +
  guides(color = guide_legend(order = 1),
         linetype = guide_legend(order = 2)) +
  ylab("Number of articles") +
  theme_minimal() +
  theme(axis.text = element_text(size = 12),
        axis.title = element_text(size = 12))