#!/opt/local/bin/python3.9

#%%
import sys
#import os
import numpy
import re
#from fuzzywuzzy import fuzz
#from importlib import reload
import glob
from pybtex.database.input import bibtex
import pybtex.errors
import requests
from bs4 import BeautifulSoup
import networkx as nx
from networkx.algorithms import community
import json
import datetime
import time
from unidecode import unidecode
from geopy.geocoders import Nominatim
import socks
import socket
from collections import OrderedDict
from progress.bar import ChargingBar


#%% some pre-settings
# Path of the JSON file that caches the affiliations found for each BibTeX key.
affiliationsFile = '../Data/affiliations.json'
# Module-level caches filled by get_author_affiliations():
#   countries:    affiliation id -> country name ('None' when the lookup failed)
#   affiliations: affiliation id -> affiliation name
countries = {}
affiliations = {}

# Define the API key and the base URL for the SCOPUS search API
# NOTE(review): the API key is committed in source; consider loading it from the
# environment or an untracked config file instead.
api_key = '4633b3e59961cda16424a39df3fac9ba'
base_url = 'https://api.elsevier.com/content/search/scopus'


# Set up the SOCKS proxy
# Sockets created through socket.socket after these two lines use the SOCKS5
# proxy configured here.
socks.set_default_proxy(socks.SOCKS5, "localhost", 1080)  # Change 1080 to your SSH tunnel port
socket.socket = socks.socksocket  # Override socket.socket with SOCKS proxy



def getCoord(country):
    """Return the coordinates Nominatim reports for a place name.

    Args:
        country: Free-text query handed to the geocoder (e.g. a country name).

    Returns:
        A ``(latitude, longitude)`` tuple taken from the first match.

    Raises:
        ValueError: If the geocoder finds no match for ``country``.
    """
    # Create a Nominatim geolocator object
    geolocator = Nominatim(user_agent="getCoordinates")

    location = geolocator.geocode(country)
    if location is None:
        # geocode() yields None when nothing matches; fail with a message that
        # names the query instead of an AttributeError on None.
        raise ValueError(f"Could not geocode {country!r}")

    return location.latitude, location.longitude



# Define a function to retrieve the authors of a given article together with
# the name and country of each of their affiliations
def get_author_affiliations(title, journal, year, doi):
    """Query Scopus for the authors of an article and their affiliations.

    When ``doi`` is the string "None", the article is first searched by title,
    journal and year to obtain its Scopus id; otherwise the abstract record is
    requested directly by DOI.  Newly seen affiliations are added to the
    module-level ``affiliations`` and ``countries`` caches (the country is
    resolved with ``geocode_location``).

    Args:
        title: Article title used for the Scopus search.
        journal: Journal name used for the Scopus search.
        year: Publication year used for the Scopus search.
        doi: DOI of the article, or the string "None" if unknown.

    Returns:
        A list with one dict per author::

            {'author_name': <indexed name>,
             'author_affiliations': [{'affiliation_name': ...,
                                      'author_country': ...}, ...]}

        The list is empty when no author was found.  The string "None" is
        returned (after printing "error") when the response cannot be
        processed.
    """
    headers = {'X-ELS-APIKey': api_key}
    # Only author names and affiliations are requested from the abstract record.
    abstract_params = {'field': 'authname,affiliation'}

    if doi == "None":
        # No DOI: search by title/journal/year to obtain the Scopus id first.
        query = f"TITLE(\"{title}~\") AND SRCTITLE({journal}~) AND PUBYEAR = {year}"
        search_params = {
            'query': query,
            'field': 'dc:identifier',
            'view': 'complete'
        }
        response = requests.get(base_url, headers=headers, params=search_params)
        # The identifier has the form "<prefix>:<id>"; keep the part after the
        # last colon.  Any malformed or empty result means "not found".
        try:
            scopus_id = response.json()['search-results']['entry'][0]['dc:identifier'].split(':')[-1]
        except Exception:
            scopus_id = "None"

        if scopus_id == "None":
            # Nothing matched, so there is no record to fetch.  Skip the
            # pointless request for ".../scopus_id/None" and report no authors.
            return []

        response = requests.get(f'https://api.elsevier.com/content/abstract/scopus_id/{scopus_id}', headers=headers, params=abstract_params)
    else:
        response = requests.get(f'https://api.elsevier.com/content/abstract/doi/{doi}', headers=headers, params=abstract_params)

    # Best-effort parsing: any failure below is reported as "None" for this
    # article.  ``Exception`` (not a bare except) keeps Ctrl-C working.
    try:
        result = BeautifulSoup(response.content, 'xml')

        # Register affiliations that are not cached yet.
        for aff in result.find_all('affiliation'):
            aff_id = aff.get('id')
            if aff_id in affiliations:
                continue
            try:
                aff_name = aff.find('affilname').text
                affiliations[aff_id] = aff_name
                countries[aff_id] = geocode_location(aff_name)
            except Exception:
                # Missing name or failed geocoding: remember the failure.
                countries[aff_id] = 'None'

        authors_info = []
        for author in result.find_all('author'):
            # Only the indexed name is used; other name parts are not read so
            # that a missing optional tag cannot discard the whole result.
            indexed_name = author.find('ce:indexed-name').text

            author_affiliations = []
            for aff_id_tag in author.find_all('dn:affiliation'):
                aff_id = aff_id_tag.get('id')
                if aff_id in affiliations:
                    author_affiliations.append({
                        'affiliation_name': affiliations[aff_id],
                        'author_country': countries[aff_id]
                    })

            authors_info.append({
                'author_name': indexed_name,
                'author_affiliations': author_affiliations
            })

        return authors_info
    except Exception:
        print("error")
        return "None"


# Define a function to geocode a location using the OpenStreetMap Nominatim API
def geocode_location(location):
    """Return the English country name Nominatim reports for a location.

    Args:
        location: Free-text place description (here: an affiliation name).

    Returns:
        The country name of the first search hit, or the string "None" when
        the request failed, returned no hit, or the hit carries no country.
    """
    search_url = 'https://nominatim.openstreetmap.org/search'
    params = {
        'q': location.rstrip(),
        'format': 'json',
        'addressdetails': 1,
        "accept-language": "en",
        'limit': 1
    }

    # Let requests build the query string so that characters such as '&' or
    # '#' in the location (e.g. "Texas A&M University") are percent-encoded
    # instead of splitting or truncating the query.
    response = requests.get(search_url, params=params)
    # Pause briefly after every request to throttle successive lookups.
    # NOTE(review): the public Nominatim usage policy asks for at most one
    # request per second - confirm that 0.21 s is acceptable for this server.
    time.sleep(.21)

    if not response.ok:
        return "None"

    # Parse the JSON response once and extract the country name
    try:
        hits = response.json()
    except ValueError:
        return "None"
    if not isinstance(hits, list) or not hits:
        return "None"

    return hits[0].get('address', {}).get('country', "None")



#%% import data

# import rp.bib
print("Read rp.bib")
pybtex.errors.set_strict_mode(True)
parser = bibtex.Parser()
bibdata = parser.parse_file("../rp.bib")
labels = sorted(bibdata.entries.keys())

# import available affiliations (results cached by earlier runs)
try:
    with open(affiliationsFile, 'r') as f:
        publicationsAffiliations = json.load(f)
except FileNotFoundError:
    # First run: no cache yet.
    publicationsAffiliations = {}
except (OSError, ValueError) as err:
    # Unreadable or corrupt cache: start from scratch as before, but say so,
    # because the file is overwritten when the results are saved.
    print(f"Could not read {affiliationsFile} ({err}); starting with an empty cache")
    publicationsAffiliations = {}

 
#%% retrieve affiliations for each paper
def save_affiliations(data, path):
    """Write ``data`` to ``path`` as indented JSON with alphabetically sorted keys."""
    with open(path, 'w') as f:
        json.dump(OrderedDict(sorted(data.items())), f, indent=4)


# Number of new look-ups performed in this run
cnt = 0
# Upper bound on new look-ups per run; set a small number (e.g. 31) for test runs
maxCnt = len(bibdata.entries)


bar = ChargingBar('Find affiliations', max=len(bibdata.entries), suffix='%(percent).1f%% - %(eta)ds')


try:
    for bib_id in labels:

        bar.next()

        # skip if bib entry is already in affiliation.json
        if bib_id in publicationsAffiliations:
            continue

        # stop once exactly maxCnt new entries have been processed
        if cnt >= maxCnt:
            break

        cnt += 1

        p = {}
        entry = bibdata.entries[bib_id]

        # "None" / "0000" mark fields that are missing in the bib entry;
        # if a doi is available it is used for the request
        doi = entry.fields.get('doi', "None")
        title = entry.fields.get('title', "None")
        year = entry.fields.get('year', "0000")
        journal = entry.fields.get('journal', "None")

        # Name of the last listed author ('none' if the entry has no usable
        # author).  Not used by the request below.
        try:
            for author in entry.persons['author']:
                firstname = author.first_names[0]
                lastname = author.last_names[0]
        except (KeyError, IndexError):
            firstname = 'none'
            lastname = 'none'

        au_affiliations = get_author_affiliations(title, journal, year, doi)

        publicationsAffiliations[bib_id] = au_affiliations

        # save intermediate results every 50 look-ups
        if cnt % 50 == 0:
            save_affiliations(publicationsAffiliations, affiliationsFile)
finally:
    # Also runs when the loop is interrupted, so that results gathered since
    # the last checkpoint are not lost.
    bar.finish()
    save_affiliations(publicationsAffiliations, affiliationsFile)
