{
 "cells": [
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "# Retreive NWO and ZonMw funded publications from the OpenAire API\n",
    "From the OpenAlex we retreive all publications that attribute NWO and/or ZonMw funding.The retreived data is used for the following purposes:\n",
    "- URLs of the pdfs: a URL indicating the download location for the pdf\n",
    "- Datasets: variable indicating datasets related to the publication based on data from the Data Citation Corpus\n",
    "- Other metadata to be used for making various crosssections in the final analysis"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "# Add data from the OpenAire API"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 2,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Get file with NWO and/or ZonMw funded publications\n",
    "import os\n",
    "import pandas as pd\n",
    "current_directory = os.getcwd()\n",
    "parent_directory = os.path.dirname(current_directory)\n",
    "openalex_works_df = pd.read_pickle(parent_directory + '/OpenAlex/OpenAlex_works_df.pkl')"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 57,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "0.00%\n"
     ]
    }
   ],
   "source": [
    "import requests\n",
    "import xml.etree.ElementTree as ET\n",
    "import pandas as pd\n",
    "import time\n",
    "\n",
    "# Initialize a list to store the results\n",
    "datasetrows = []\n",
    "softwarerows = []\n",
    "\n",
    "# Your personal access token (replace this with your actual token)\n",
    "access_token = \" \"\n",
    "\n",
    "# Function to handle API requests with retries and exponential backoff\n",
    "def make_request_with_backoff(url, max_retries=5, backoff_factor=2):\n",
    "    retries = 0\n",
    "    wait_time = 1  # Start with 1 second wait time\n",
    "    \n",
    "    # Include the refresh token in the request headers\n",
    "    refresh_token = \" \"\n",
    "    response = requests.get(f\"https://services.openaire.eu/uoa-user-management/api/users/getAccessToken?refreshToken={refresh_token}\")\n",
    "    access_token = response.json()['access_token']\n",
    "\n",
    "    headers = {\n",
    "        \"Authorization\": f\"Bearer {access_token}\"\n",
    "    }\n",
    "    \n",
    "    while retries < max_retries:\n",
    "        response = requests.get(url, headers=headers)  # Pass the headers with the request\n",
    "        if response.status_code == 200:\n",
    "            return response\n",
    "        elif response.status_code == 429:  # Too many requests error\n",
    "            retries += 1\n",
    "            print(f\"Rate limit exceeded. Retrying in {wait_time} seconds...\")\n",
    "            time.sleep(wait_time)\n",
    "            wait_time *= backoff_factor  # Exponentially increase the wait time\n",
    "        else:\n",
    "            # If the status code is not 200 or 429, return None\n",
    "            print(f\"Failed to retrieve data from {url}. Status code: {response.status_code}\")\n",
    "            return None\n",
    "    return None  # Return None if all retries fail\n",
    "\n",
    "# Loop through each DOI in the DataFrame\n",
    "for i, doi in enumerate(openalex_works_df['doi']):\n",
    "    if i % 10 == 0:\n",
    "        print(\"{:.2%}\".format(i / len(openalex_works_df)))\n",
    "    \n",
    "    # Make the API request\n",
    "    response = make_request_with_backoff(f'https://api.openaire.eu/search/publications?doi={doi}')\n",
    "    \n",
    "    # Check if the request was successful\n",
    "    if response.status_code == 200:\n",
    "        root = ET.fromstring(response.content)\n",
    "        \n",
    "        # Loop through the XML tree to find instances of datasets and software\n",
    "        for instance in root.findall('.//rel'):\n",
    "            # Find the instancetype and its class\n",
    "            instancetype = instance.find('.//to')\n",
    "            # return attributes of instancetype\n",
    "\n",
    "            if instancetype is not None:\n",
    "                classname = instancetype.attrib.get('type', '')\n",
    "\n",
    "                # Extract internal DOI (doi_dedup___)\n",
    "                internal_doi = instancetype.text if instancetype.text else 'Unknown DOI'\n",
    "\n",
    "                # Proceed only if it's a dataset or software\n",
    "                if classname in ['dataset', 'software']:\n",
    "                    # Make the second API request to fetch additional information\n",
    "                    response1 = make_request_with_backoff(f'https://api-beta.openaire.eu/graph/researchProducts/{internal_doi}')\n",
    "                    \n",
    "                    if response1 is not None and response1.status_code == 200:\n",
    "                        \n",
    "                        response1_json = response1.json()\n",
    "\n",
    "                        if response1_json.get('type') == 'software':\n",
    "                            title = response1_json.get('mainTitle', 'Unknown Title')\n",
    "                            repository_url = response1_json.get('codeRepositoryUrl', 'No Repository URL')\n",
    "                            license_info = response1_json.get('instance', [{}])[0].get('license', 'No License Available')\n",
    "                            access_rights = response1_json.get('bestAccessRight', {}).get('label', 'Unknown Access Rights')\n",
    "                            \n",
    "                            # Get all instances of software URLs\n",
    "                            software_urls = []\n",
    "                            if 'instance' in response1_json:\n",
    "                                for instance1 in response1_json['instance']:\n",
    "                                    software_urls.extend(instance1.get('url', []))\n",
    "\n",
    "                            # Add software info to the rows\n",
    "                            softwarerows.append({\n",
    "                                'doi': doi,\n",
    "                                'internal_doi': internal_doi,\n",
    "                                'title': title,\n",
    "                                'repository_url': repository_url,\n",
    "                                'license': license_info,\n",
    "                                'access_rights': access_rights,\n",
    "                                'urls': software_urls,\n",
    "                                'type': 'Software'\n",
    "                            })\n",
    "                        \n",
    "                        elif response1_json.get('type') == 'dataset':\n",
    "                            author = response1_json.get('author', [{}])[0]\n",
    "                            title = response1_json.get('mainTitle', 'Unknown Title')\n",
    "                            description = response1_json.get('description', 'No Description Available')\n",
    "                            publication_date = response1_json.get('publicationDate', 'Unknown Date')\n",
    "                            publisher = response1_json.get('publisher', 'Unknown Publisher')\n",
    "                            license_info = response1_json.get('instance', [{}])[0].get('license', 'No License Available')\n",
    "                            subjects = response1_json.get('subjects', 'No Subjects Available')\n",
    "\n",
    "                            # Add dataset info to the rows\n",
    "                            datasetrows.append({\n",
    "                                'doi': doi,\n",
    "                                'internal_doi': internal_doi,\n",
    "                                'author': author,\n",
    "                                'title': title,\n",
    "                                'description': description,\n",
    "                                'publication_date': publication_date,\n",
    "                                'publisher': publisher,\n",
    "                                'license': license_info,\n",
    "                                'subjects': subjects,\n",
    "                                'type': 'Dataset'\n",
    "                            })\n",
    "\n",
    "                                                \n",
    "                    else:\n",
    "                        print(f\"Failed to retrieve additional data for DOI: {internal_doi}\")\n",
    "\n",
    "                    time.sleep(5)  # Respect API rate limits\n",
    "                    \n",
    "                    # Extract hostedby ID for datasets and software\n",
    "                    hostedby = instance.find('.//hostedby')\n",
    "                    hostedby_id = hostedby.attrib.get('id') if hostedby is not None else 'Unknown ID'\n",
    "\n",
    "                    # Extract URL\n",
    "                    url_element = instance.find('.//url')\n",
    "                    url = url_element.text if url_element is not None else 'No URL available'\n",
    "\n",
    "    else:\n",
    "        print(f\"Failed to retrieve data for DOI: {doi}\")\n",
    "    time.sleep(1)\n",
    "\n",
    "\n"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "dataset_OA = pd.DataFrame(datasetrows)\n",
    "software_OA = pd.DataFrame(softwarerows)"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "# Preparing the final data set \n",
    "From the OA API, now format the dataset to be cleaned and formatted so that the KPIs can easily be extracted, analysed and combined with results from other API and GROBID"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "For the dataset:\n",
    "- clean column names\n",
    "- extract author names\n",
    "- extract subjects\n",
    "- remove internal_doi column\n",
    "- remove duplicates"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 19,
   "metadata": {},
   "outputs": [],
   "source": [
    "# remove duplicates (doi in combination with internal_doi are the same)\n",
    "dataset_OA_dropped_duplicates = dataset_OA.drop_duplicates(subset=[\"doi\",\"internal_doi\"], keep=\"last\")\n",
    "dataset_OA_dropped_duplicates = dataset_OA_dropped_duplicates.reset_index(drop=True)\n",
    "\n",
    "# clean column names (doi to be doi of publication)\n",
    "dataset_OA_dropped_duplicates = dataset_OA_dropped_duplicates.rename(columns={\"doi\":\"doi_publication\"})\n",
    "# remove internal_doi column\n",
    "dataset_OA_dropped_duplicates = dataset_OA_dropped_duplicates.drop(columns=[\"internal_doi\"])"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 37,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Add empty columns for 'keywords' and 'SDG' to the DataFrame\n",
    "dataset_OA_dropped_duplicates['keywords'] = None\n",
    "dataset_OA_dropped_duplicates['SDG'] = None\n",
    "\n",
    "# Iterate over each row in the DataFrame and populate the new columns\n",
    "for index, row in dataset_OA_dropped_duplicates.iterrows():\n",
    "    # Get the 'subjects' value for the current row\n",
    "    subjects = row['subjects']\n",
    "    # if subjects is a list, continue\n",
    "    if type(subjects) != list:\n",
    "        continue\n",
    "    length = len(subjects)\n",
    "    keywords = []\n",
    "    # Iterate over each value in the 'subjects' list\n",
    "    for i in range(length):\n",
    "        # Get the 'scheme' and 'value' for the current 'subjects' value\n",
    "        item = subjects[i]['subject']\n",
    "        if item['scheme'] == 'keyword':\n",
    "            keywords.append(item['value'])\n",
    "        elif item['scheme'] == 'SDG':\n",
    "            dataset_OA_dropped_duplicates.at[index, 'SDG'] = item['value']\n",
    "    # Update the 'keywords' column for the current row\n",
    "    dataset_OA_dropped_duplicates.at[index, 'keywords'] = keywords\n",
    "            \n"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 53,
   "metadata": {},
   "outputs": [],
   "source": [
    "# only perform operation if it is a dictionary, only if fullName is in the dictionary\n",
    "dataset_OA_dropped_duplicates['authors'] = dataset_OA_dropped_duplicates['author'].apply(lambda x: x['fullName'] if isinstance(x, dict) and 'fullName' in x else None)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 60,
   "metadata": {},
   "outputs": [],
   "source": [
    "final_dataset = dataset_OA_dropped_duplicates.drop(columns=[\"author\", \"subjects\"])\n",
    "# replace 'No Licence Available' with None\n",
    "final_dataset['license'] = final_dataset['license'].replace('No License Available', None)\n",
    "final_dataset['publisher'] = final_dataset['publisher'].replace('Unknown Publisher', None)\n",
    "final_dataset['description'] = final_dataset['description'].replace('No Description Available', None)\n",
    "final_dataset['publication_date'] = final_dataset['publication_date'].replace('Unknown Date', None)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 63,
   "metadata": {},
   "outputs": [],
   "source": [
    "# save to pickle\n",
    "final_dataset.to_pickle('OpenAire_dataset_citations_df.pkl')"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "for software: \n",
    "- remove duplicates\n",
    "- remove internal_doi column\n",
    "- extract respository from repository url"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 98,
   "metadata": {},
   "outputs": [],
   "source": [
    "software_OA_drop_duplicates = software_OA.drop_duplicates(subset=[\"doi\",\"internal_doi\"], keep=\"last\")\n",
    "software_OA_drop_duplicates = software_OA_drop_duplicates.reset_index(drop=True)\n",
    "\n",
    "# clean column names (doi to be doi of publication)\n",
    "software_OA_drop_duplicates = software_OA_drop_duplicates.rename(columns={\"doi\":\"doi_publication\"})\n",
    "# remove internal_doi column\n",
    "software_OA_drop_duplicates = software_OA_drop_duplicates.drop(columns=[\"internal_doi\"])\n",
    "\n",
    "# No Licence Available -> None\n",
    "software_OA_drop_duplicates['license'] = software_OA_drop_duplicates['license'].replace('No License Available', None)\n",
    "# Unknown Access Rights -> None\n",
    "software_OA_drop_duplicates['access_rights'] = software_OA_drop_duplicates['access_rights'].replace('Unknown Access Rights', None)\n",
    "# No Repository URL -> None\n",
    "software_OA_drop_duplicates['repository_url'] = software_OA_drop_duplicates['repository_url'].replace('No Repository URL', None)\n",
    "\n",
    "\n"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 101,
   "metadata": {},
   "outputs": [],
   "source": [
    "# extract from the repository_url the repository name, between https:// and .com, it is not a list so we can use apply\n",
    "software_OA_drop_duplicates['repository'] = software_OA_drop_duplicates['repository_url'].apply(lambda x: x[8:x.find('.com')] if isinstance(x, str) else None)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 103,
   "metadata": {},
   "outputs": [],
   "source": [
    "# save to pickle\n",
    "software_OA_drop_duplicates.to_pickle('OpenAire_software_citations_df.pkl')"
   ]
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "base",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.9.18"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 2
}
