/* 
Data preparation for the paper:
 >> Trapped in Transformative Agreements? A Multifaceted Analysis of >1,000 Contracts <<

 Authors: Laura Rothfritz, Ulrich Herb, W. Benedikt Schmal

 Contact: wolfgang-benedikt.schmal@tu-ilmenau.de
 
 Date: 29 August 2024
*/

* Session setup: start from an empty session, switch off output paging, and
* set the random-number seed.
clear all
set more off
set seed 1896

* Project folders. Forward slashes are used as separators because Stata accepts
* them on Windows, macOS and Linux alike, whereas a backslash only works on
* Windows and acts as an escape character when it directly precedes a macro.
* The plots folder is not referenced in the visible part of this script.
global main "YOURPATH/code" // Adjust according to your folder structure
global plots "YOURPATH/plots" // Adjust according to your folder structure

* Switches: prep = 1 runs the data preparation block below;
* excel_export = 1 additionally writes the merged agreement list to Excel.
local prep = 1
local excel_export = 0

*****
*** DATA PREPARATION
if `prep' == 1 {
	* Load the registry spreadsheet from the main folder; the first row of the
	* sheet supplies the variable names.
	cd "$main"
	import excel using "Esac_Initiative_Aug24.xlsx", firstrow clear

	* Make Size numeric; with force, entries that are not numbers become missing.
	destring Size, force replace

	* Frequency table of the publisher spellings as imported
	tabulate Publisher, sort

	* Harmonise publisher names. Each rule overwrites Publisher with a canonical
	* label when the current value contains the given substring. Rules run from
	* top to bottom, so later rules see the output of earlier ones.
	replace Publisher  = "Walter de Gruyter" if strpos(lower(Publisher), "gruyter") > 0
	* Catches the misspelling without the r
	replace Publisher  = "Walter de Gruyter" if strpos(lower(Publisher), "guyter") > 0
	* Oxford is first collapsed to OUP and receives its final label further down
	replace Publisher  = "OUP" if strpos(lower(Publisher), "oxford") > 0
	replace Publisher  = "SAGE" if strpos(Publisher, "SAGE") > 0
	replace Publisher  = "SAGE" if strpos(Publisher, "Sage") > 0
	replace Publisher  = "Royal Soc Chem" if strpos(Publisher, "RSC") > 0
	replace Publisher  = "Royal Soc Chem" if strpos(Publisher, "Royal Society of Chemistry") > 0
	replace Publisher  = "BMJ" if strpos(Publisher, "BMJ") > 0
	* Repairs the misspelling without the d; shortened to Cambridge UP further down
	replace Publisher  = "Cambridge University Press" if strpos(lower(Publisher), "cambrige") > 0
	replace Publisher  = "IOP Publishing" if strpos(Publisher, "IOP") > 0
	replace Publisher  = "ACS" if strpos(Publisher, "ACS") > 0
	replace Publisher  = "ACS" if strpos(lower(Publisher), "american chemical") > 0
	replace Publisher  = "Springer Nature" if strpos(lower(Publisher), "springer") > 0
	* replace Publisher  = "Springer Nature" if strpos(Publisher, "Akadémiai Kiadó") > 0
	replace Publisher  = "SPIE" if strpos(Publisher, "SPIE") > 0
	replace Publisher  = "Emerald" if strpos(lower(Publisher), "emerald") > 0
	replace Publisher  = "Brill" if strpos(lower(Publisher), "brill") > 0
	replace Publisher  = "IWA Publishing" if strpos(Publisher, "IWA") > 0
	replace Publisher  = "Trans Tech" if strpos(Publisher, "Trans Tech") > 0
	replace Publisher  = "IEEE" if strpos(Publisher, "IEEE") > 0
	replace Publisher  = "ACM" if strpos(lower(Publisher), "computing machinery") > 0
	replace Publisher  = "Bioscientifica" if strpos(lower(Publisher), "bioscientifica") > 0
	replace Publisher  = "Wolters Kluwer Health" if strpos(lower(Publisher), "wolters") > 0
	replace Publisher  = "World Scientific" if strpos(lower(Publisher), "world scientific") > 0
	replace Publisher  = "John Benjamins Publishing" if strpos(Publisher, "John Benjamins") > 0
	replace Publisher  = "Geological Society London" if strpos(Publisher, "Geological") > 0

	* Final labels for the university presses. OUP is matched as a whole word,
	* ignoring case: a plain substring test on oup would also hit every name
	* that contains the word Group and relabel it as Oxford UP.
	replace Publisher  = "Oxford UP" if ustrregexm(Publisher, "\boup\b", 1)
	replace Publisher  = "Cambridge UP" if strpos(lower(Publisher), "cambridge") > 0
	replace Publisher  = "Rockefeller UP" if strpos(lower(Publisher), "rockefeller") > 0

	* Frequency table after harmonisation
	tab Publisher, sort

	compress

	* Repair the one end date whose year has only three digits before parsing.
	replace EndDate = "12/31/2021" if EndDate == "12/31/202"

	* Turn the text dates into Stata daily dates, reading them as month/day/year.
	generate startdate = date(StartDate, "MDY")
	generate enddate = date(EndDate, "MDY")
	format startdate enddate %td
	* End dates that could not be read as month/day/year are retried as
	* day/month/year. Start dates get no such second attempt.
	replace enddate = date(EndDate, "DMY") if enddate == .

	* Duration in days. The summary is printed for the plain difference; one day
	* is added afterwards.
	generate length = enddate - startdate
	summarize length, detail
	replace length = length + 1
	generate length_year = length / 365

	* Size multiplied by the duration in years.
	generate totalsize = length_year * Size
	* NOTE(review): missing totalsize is set to the constant 12; the reason for
	* this value is not visible in this script - confirm.
	replace totalsize = 12 if totalsize == .

	* publ2 copies Publisher but pools every publisher with 39 or fewer rows
	* under the label Other. hv1 holds the number of rows per publisher.
	generate publ2 = Publisher
	bysort Publisher: generate hv1 = _N
	replace publ2 = "Other" if hv1 <= 39
		
	* Recode the free-text OALicense field into a small set of categories.
	* The rules are order-dependent: several of them match labels that were
	* written by an earlier rule in this list.
	* Long publisher statement that allows more restrictive CC licences
	replace OALicense = "CC-BY variations allowed" if OALicense == "Both our fully Gold OA and our hybrid journals allow authors to publish articles under Creative Commons (CC) licences, thereby enabling readers to freely access and re-distribute their articles. We encourage the use of CC-BY licences for Gold OA journal articles, but authors are able to choose more restrictive CC licences if they wish to prevent commercial use or adaptation in new works."
	* Spelling variants of CC-BY-NC
	replace OALicense = "CC-BY-NC" if inlist(OALicense, "CC-BY NC 4.0", "CC BY-NC 4.0 license default", "CC-BY NC")
	replace OALicense = "CC-BY-NC" if OALicense == "CC-BY-NC but “when there is an obligation of the Funder to publish under CC-BY, the author can mention this at the time of submission and the Karger team will work with them to make this possible.”"
	* Empty and not-specified entries become Undefined; a later rule folds
	* Undefined into Publisher-specific/Undefined
	replace OALicense = "Undefined" if inlist(OALicense, "Not defined", "Not specified in the agreement", "")
	replace OALicense = "CC-BY variations allowed" if inlist(OALicense, "CC-BY, CC-BY-NC", "CC-BY, CC-BY-NC-ND")
	* Only the exact value CC-BY counts as mandatory
	replace OALicense = "CC-BY mandatory" if inlist(OALicense, "CC-BY")
	* The search text ends with a blank, so the phrase must be followed by one
	replace OALicense = "Undefined" if strpos(OALicense, "License not specified ") > 0
	replace OALicense = "CC-BY-NC-ND" if inlist(OALicense, "CC BY-NC-ND 4.0", "CC-BY-NC-ND, unless the funder mandates otherwise")
	replace OALicense = "CC-BY preference, exceptions allowed" if OALicense == "CC-BY preferred, exceptions allowed"
	replace OALicense = "CC-BY preference, exceptions allowed" if strpos(OALicense, "CC-BY mandatory, but exceptions") > 0
	* Pools publisher-specific licences, the Undefined label set above, and
	* CC-BY on request
	replace OALicense = "Publisher-specific/Undefined" if inlist(OALicense, "Publisher-specific license type", "Publisher specific license type", "Undefined", "CC-BY on request")
	replace OALicense = "CC-BY preference, exceptions allowed" if inlist(OALicense, "CC BY, with some journals using a CC BY-NC license as default", "CC-BY in all titles except those that don’t currently offer it")
	
	* Intermediate frequency table
	tab OALicense, sort
	
	* Second pass: shorten the labels written above
	replace OALicense = "CC-BY preference w/ exceptions" if strpos(OALicense, "CC-BY preference, exceptions allowed") > 0
	replace OALicense = "Publisher-specific/Other" if inlist(OALicense, "Publisher-specific/Undefined", "CC-BY mandatory since third year of term; CC-BY preferred in the first two years of term")
	* NOTE(review): no rule in this block writes the value tested below, so this
	* line only has an effect if the raw data already contain it - confirm
	replace OALicense = "Publisher-specific/Other" if inlist(OALicense, "Publisher-specific/Undefined/Other")

	* Final frequency table
	tab OALicense, sort

	* Replace the long cost description by the short label Stable Cost.
	replace Cost = "Stable Cost" if Cost == "Agreement costs within the range of the previous spending level"

	* big3 is 1 for Elsevier, Springer Nature and Wiley, and 0 otherwise.
	generate big3 = inlist(Publisher, "Elsevier", "Springer Nature", "Wiley")
	tabulate big3

	* big5 additionally covers Taylor & Francis and SAGE.
	generate big5 = inlist(Publisher, "Elsevier", "Springer Nature", "Wiley", "Taylor & Francis", "SAGE")
	tabulate big5

	* Cost categories overall and within the two publisher groups
	tabulate Cost
	tabulate Cost if big3 == 1
	tabulate Cost if big5 == 1

	* Inspect how often each AgreementID occurs; count is a temporary helper.
	bysort AgreementID: generate count = _N
	tabulate count
	* The row with ID acs2023czelib whose publisher is Karger receives the
	* distinct ID acs2023czelib2 (presumably so that AgreementID is unique for
	* the later 1:1 merge - confirm).
	replace AgreementID = "acs2023czelib2" if Publisher == "Karger" & AgreementID == "acs2023czelib"
	drop count

	* Store the intermediate dataset in the main folder.
	cd "$main"
	save esac_data.dta, replace


	* Open the ESAC_march_match file and attach the freshly prepared data by
	* AgreementID in a one-to-one merge. The match indicator is discarded
	* without being inspected.
	use ESAC_march_match.dta, clear
	merge 1:1 AgreementID using esac_data
	drop _merge

	* Rows without an oldTA value are set to 0.
	replace oldTA = 0 if oldTA == .

	* Optional export of the merged list, with variable names in the first row
	if `excel_export' == 1 {
		export excel using "TA_list_Aug08.xlsx", firstrow(variables) replace
	}

	compress
	save esac_data.dta, replace

*** continue 
	
	use esac_data.dta, clear

	** gen country codes
	* europe starts at 1 and is set to 0 for the listed non-European countries.
	* Country names are compared after trimming surrounding blanks, so a padded
	* entry such as Australia with a leading blank and its clean spelling are
	* classified alike. The Country variable itself is left unchanged.
	gen europe = 1
	replace europe = 0 if inlist(strtrim(Country), "United States", "Australia", "Saudi Arabia", "Israel", "Hong Kong", "South Africa", "Canada", "Japan", "Turkey")
	replace europe = 0 if inlist(strtrim(Country), "China", "Colombia", "South Korea", "Qatar", "Palestine", "Singapore", "Taiwan")
	* Armenia is matched as a substring anywhere in Country
	replace europe = 0 if strpos(Country, "Armenia") > 0
	tab europe

	* country2: 0 = all other countries, 1 = United States, 2 = Europe, 3 = China
	gen country2 = 0
	replace country2 = 1 if strtrim(Country) == "United States"
	replace country2 = 2 if europe == 1
	replace country2 = 3 if strtrim(Country) == "China"

	* country3: 0 = all other countries, 1 = United States, 2 = China,
	* 3 = Germany, 4 = Netherlands, 5 = United Kingdom, 6 = Austria,
	* 7 = Hungary, 8 = remaining Europe.
	* Every European row first receives 8; the five named countries then
	* overwrite it with their own code.
	gen country3 = country2
	replace country3 = 8 if country2 == 2
	replace country3 = 2 if strtrim(Country) == "China"
	replace country3 = 3 if strtrim(Country) == "Germany"
	replace country3 = 4 if strtrim(Country) == "Netherlands"
	replace country3 = 5 if strtrim(Country) == "United Kingdom"
	replace country3 = 6 if strtrim(Country) == "Austria"
	replace country3 = 7 if strtrim(Country) == "Hungary"
	tab country3
	* Constant 1 on every row
	gen counter = 1

	* Harmonise the AccessCosts categories. The table shows the raw values.
	tab AccessCosts
	replace AccessCosts = "More than 50%" if AccessCosts == "more than 50%"
	* Entries containing one of these phrases are pooled as Other. The vs rule
	* also covers every entry that contains the longer phrase costs vs.
	replace AccessCosts = "Other" if strpos(AccessCosts, "no distinction") > 0
	replace AccessCosts = "Other" if strpos(AccessCosts, "Unknown") > 0
	replace AccessCosts = "Other" if strpos(AccessCosts, "vs") > 0
	* Variants of the 20 to 50 percent band, including the spelling with an en
	* dash. The 35% rule also covers entries containing 20%-35%.
	replace AccessCosts = "20%-50%" if strpos(AccessCosts, "20%-50%") > 0
	replace AccessCosts = "20%-50%" if strpos(AccessCosts, "20% – 50%") > 0
	replace AccessCosts = "20%-50%" if strpos(AccessCosts, "35%") > 0

	* ac2 prefixes each category with a rank so that it sorts in a meaningful
	* order. The label of category 2 used to end in a doubled percent sign;
	* code elsewhere that matches on the old literal must be adjusted.
	gen ac2 = AccessCosts
	replace ac2 = "1) 0%" if AccessCosts == "0%"
	replace ac2 = "2) 1%-5%" if AccessCosts == "1%-5%"
	replace ac2 = "3) 5%-20%" if AccessCosts == "5%-20%"
	replace ac2 = "4) 20%-50%" if AccessCosts == "20%-50%"
	replace ac2 = "5) >50%" if AccessCosts == "More than 50%"
	replace ac2 = "6) Undefined" if AccessCosts == "Other"
	* Cross-check of the new labels against the harmonised categories
	tab ac2 AccessCosts

	* Cut StartDate at the slashes into StartDate1, StartDate2, ... and read
	* the third piece as the numeric start year.
	split StartDate, parse("/")
	destring StartDate3, generate(startyear)

	* tacount is the number of rows per Publisher value.
	bysort Publisher: generate tacount = _N
	tabulate tacount


	* Build a grouping key from AgreementID. The ID is cut at every character 2
	* (presumably the first digit of the embedded year - confirm), so
	* AgreementID1 holds the text before the first 2. The trailing text ends up
	* in piece 3 or, when piece 3 is empty, in piece 4; piece 2 is the last
	* fallback.
	* NOTE(review): this assumes the split yields at least four pieces for some
	* ID; otherwise AgreementID4 does not exist and the next line stops with an
	* error.
	split AgreementID, parse(2)
	replace AgreementID3 = AgreementID4 if AgreementID3 == ""
	replace AgreementID3 = AgreementID2 if AgreementID3 == ""
	* Remove all digits from the trailing piece. The built-in regex function
	* replaces the former ereplace and sieve() call, both of which are
	* community-contributed commands that have to be installed separately.
	replace AgreementID3 = ustrregexra(AgreementID3, "[0-9]", "")
	drop AgreementID2 AgreementID4
	* Number of rows that share the same leading and trailing piece
	bysort AgreementID1 AgreementID3: gen count_ta_funder_cons = _N
	tab count_ta_funder_cons

	* Earliest start year within each key
	gen agr_id2 = AgreementID1 + AgreementID3
	egen lo_bp = min(startyear), by(agr_id2)

	* follow_up_ta: the key occurs more than once and this row starts after the
	* earliest one. Stata treats a missing value as larger than any number, so
	* rows without a start year are excluded explicitly instead of being
	* flagged by accident.
	gen follow_up_ta = 0
	replace follow_up_ta = 1 if count_ta_funder_cons > 1 & lo_bp < startyear & !missing(startyear)

	* followupta_exists: the key occurs more than once and this row is the
	* earliest one. The same guard keeps two missing values from comparing
	* as equal.
	gen followupta_exists = 0
	replace followupta_exists = 1 if count_ta_funder_cons > 1 & lo_bp == startyear & !missing(startyear)

	* Natural logarithms of duration, Size and the per-publisher row count
	generate loglength = ln(length)
	generate lsize = ln(Size)
	generate ltac = ln(tacount)

	* Integer-coded copies of Publisher and Country (presumably for use as
	* fixed effects, judging by the names - confirm)
	encode Publisher, generate(publFE)
	encode Country, generate(countryFE)

	* Remove the helper pieces and the two URL variables, then store the result.
	drop AgreementID1 AgreementID3 StartDate1 StartDate2 StartDate3 hv1 URL PageUrl
	compress
	save esac_data.dta, replace // final dataset
}
**

********************************************************************************
					 *** END OF PREPARATION SCRIPT ***
********************************************************************************
