# -*- coding: utf-8 -*-
"""
Scraping Script for the ESAC Database including contract details for the paper

>>Trapped in Transformative Agreements?  A Multifaceted Analysis of >1,000 Contracts<<

Authors: Laura Rothfritz, Ulrich Herb, W. Benedikt Schmal
Contact: wolfgang-benedikt.schmal@tu-ilmenau.de
Date: 29 August, 2024

"""
# Packages 
import os
# The spider writes its Excel output using a relative file name, so the file
# lands in the working directory set here. 'YOUR PATH' is a placeholder and
# must be replaced with an existing directory before running the script.
os.chdir('YOUR PATH') # adjust accordingly

import scrapy
from scrapy.crawler import CrawlerProcess
import pandas as pd
import time
import random
###

class EsacSpider(scrapy.Spider):
    """Scrape every agreement detail page linked from the ESAC agreement registry.

    ``parse`` walks the registry table on the start page and schedules one
    request per detail link.  ``parse_registry_page`` flattens each detail
    page into a dict of strings.  All dicts are collected on the spider
    instance and written to an Excel file when the spider closes.
    """

    name = "esac"
    allowed_domains = ["esac-initiative.org"]
    start_urls = ["https://esac-initiative.org/about/transformative-agreements/agreement-registry/"]

    # Throttling is delegated to Scrapy's downloader.  Calling time.sleep()
    # inside a callback blocks the Twisted reactor and stalls every in-flight
    # download.  With RANDOMIZE_DOWNLOAD_DELAY the wait between requests is
    # drawn from 0.5x-1.5x DOWNLOAD_DELAY, i.e. 2-6 seconds for a delay of 4.
    custom_settings = {
        'DOWNLOAD_DELAY': 4,
        'RANDOMIZE_DOWNLOAD_DELAY': True,
    }

    counter = 0

    def __init__(self, *args, **kwargs):
        """Initialise per-instance state (page counter and result buffer)."""
        super().__init__(*args, **kwargs)
        self.counter = 0
        # Created per instance: a class-level list would be shared by every
        # spider object in the same process.
        self.scraped_data = []

    @staticmethod
    def _cell_after_td(response, label):
        """Return the first text node of the cell that follows a <td> whose text contains *label*."""
        return response.xpath(f"//td[contains(text(),'{label}')]/following-sibling::td/text()").get('')

    @staticmethod
    def _cells_after_strong(response, label):
        """Select the text nodes of the cell that follows a <td> holding ``<strong>label</strong>``."""
        return response.xpath(f"//strong[text()='{label}']/parent::td/following-sibling::td/text()")

    def parse(self, response):
        """Yield one request per registry row that links to a detail page.

        Rows without a link in the seventh column are skipped, because an
        empty URL would make ``scrapy.Request`` raise ``ValueError``.
        """
        all_registries = response.xpath("//table[@id='tablepress-10']//tbody//tr")
        for registry in all_registries:
            href = registry.xpath("./td[7]/a/@href").get('').strip()
            if not href:
                self.logger.warning('Skipping registry row without a detail link')
                continue
            # Resolve against the listing page so relative links also work;
            # absolute links pass through unchanged.
            detail_url = response.urljoin(href)
            yield scrapy.Request(detail_url, callback=self.parse_registry_page, dont_filter=True, meta={'detail_url': detail_url})

    def parse_registry_page(self, response):
        """Extract the agreement fields from one detail page.

        The resulting dict is appended to ``self.scraped_data`` and also
        yielded as a Scrapy item.  Fields missing on the page become ``''``.
        """
        # Fall back to the response URL if the request carried no meta entry.
        registery_url = response.meta.get('detail_url', response.url)

        # The publisher name is either wrapped in an <h2> or given as plain cell text.
        publisher = response.xpath("//td[contains(text(),'Publisher')]/following-sibling::td/h2/text()").get('')
        if not publisher:
            publisher = self._cell_after_td(response, 'Publisher')
        print(f'Publisher: {publisher}')

        agreement_id = self._cell_after_td(response, 'Agreement ID')
        if '\n' in agreement_id:
            agreement_id = agreement_id.replace('\n', '').strip()
        print(f'Agreement Id: {agreement_id}')

        agreement_labeling = self._cell_after_td(response, 'Agreement labeling')
        agreement_published = self._cell_after_td(response, 'Has the agreement been')
        url = response.xpath("//td[text()='URL']/following-sibling::td/a/@href").get('')

        # The period cell is split on an en dash; without a dash both dates
        # end up equal to the whole cell text.
        agreement_period = self._cell_after_td(response, 'Agreement period')
        if agreement_period:
            period_parts = agreement_period.split("–")
            start_date = period_parts[0].strip()
            end_date = period_parts[-1].strip()
        else:
            start_date, end_date = '', ''

        consortia_institution = self._cell_after_td(response, 'Consortia / Institution')
        country = self._cell_after_td(response, 'Country')
        size = response.xpath("//strong[contains(text(),'SIZE')]/parent::td/following-sibling::td/text()").get('')
        comments_on_size_output = self._cells_after_strong(response, 'Comments on size/article output').get('')
        cost = self._cells_after_strong(response, 'COSTS').get('')

        # Joining an empty list already yields '', so no separate branch is needed.
        comments_on_cost_development = "".join(
            self._cells_after_strong(response, 'Comments on cost development').getall()
        )

        financial_shift = self._cells_after_strong(response, 'FINANCIAL SHIFT').get('')
        risk_sharing = self._cells_after_strong(response, 'RISK SHARING').get('')
        oa_coverage = self._cells_after_strong(response, 'OA COVERAGE').get('')
        fully_open_access_journals_covered = response.xpath("//td[text()='Are fully open access journals covered by the agreement?']/following-sibling::td/text()").get('')
        oa_license = self._cells_after_strong(response, 'OA LICENSE').get('')

        # Always produce a string: an unmatched field must be '' like every
        # other column, not an empty list.
        article_type_nodes = self._cells_after_strong(response, 'ARTICLE TYPES').getall()
        if article_type_nodes:
            article_types = ", ".join(article_type_nodes).replace("\n", '').strip()
        else:
            article_types = ''

        access_costs = self._cells_after_strong(response, 'ACCESS COSTS').get('')
        comments_on_access_costs = self._cells_after_strong(response, 'Comments on access costs').get('')
        access_coverage = self._cells_after_strong(response, 'ACCESS COVERAGE').get('')
        perpetual_access_rights = self._cells_after_strong(response, 'PERPETUAL ACCESS RIGHTS').get('')
        workflow_assessment = self._cells_after_strong(response, 'WORKFLOW ASSESSMENT').get('')
        comments_on_workflows = self._cells_after_strong(response, 'Comments on workflows').get('')
        overall_assessment_and_comments = self._cells_after_strong(response, 'OVERALL ASSESSMENT AND COMMENTS').get('')

        # NOTE: the misspelled keys ('Commnets ...', '... Acess ...') are kept
        # as-is so the column names of the exported spreadsheet do not change.
        item = {
            'Page Url': registery_url,
            'Publisher': publisher,
            'Agreement ID': agreement_id,
            'Agreement Labeling': agreement_labeling,
            'Agreement Published': agreement_published,
            'URL': url,
            'Start Date': start_date,
            'End Date': end_date,
            'Consortia/Institution': consortia_institution,
            'Country': country,
            'Size': size,
            'Commnets on size/article output': comments_on_size_output,
            'Cost': cost,
            'Comments on Cost Development': comments_on_cost_development,
            'Financial Shift': financial_shift,
            'Risk Sharing': risk_sharing,
            'OA Coverage': oa_coverage,
            'Fully Open Access Journals Covered': fully_open_access_journals_covered,
            'OA License': oa_license,
            'Article Types': article_types,
            'Access Costs': access_costs,
            'Comments on Acess Costs': comments_on_access_costs,
            'Access Coverage': access_coverage,
            'Perpetual Access Rights': perpetual_access_rights,
            'Workflow Assessment': workflow_assessment,
            'Comments on Workflows': comments_on_workflows,
            'Overall Assessment and Comments': overall_assessment_and_comments,
        }
        self.scraped_data.append(item)

        # Count pages that were actually parsed, and log at INFO so the
        # progress message is visible when LOG_LEVEL is 'INFO'.
        self.counter += 1
        if self.counter % 50 == 0:
            self.logger.info(f'> > > Scraped {self.counter} pages up to now < < <')

        yield item

    def closed(self, reason):
        """Write all collected items to an Excel file once the spider has finished."""
        df = pd.DataFrame(self.scraped_data)
        df.to_excel("ESAC_Initiative_Aug24.xlsx", index=False) # adjust file name!


if __name__ == "__main__":
    # Run the spider in-process, keeping Scrapy's log output at INFO level.
    crawler_settings = {'LOG_LEVEL': 'INFO'}
    process = CrawlerProcess(settings=crawler_settings)
    process.crawl(EsacSpider)
    process.start()


# # #   E N D   O F   S C R I P T   # # #