#%%
import sys
import os
import numpy
import re
from importlib import reload
import glob
from pybtex.database.input import bibtex
import pybtex.errors
import requests
from bs4 import BeautifulSoup
import json
import datetime
import time
from collections import OrderedDict
from progress.bar import ChargingBar


#%% some pre-settings
username = "marwan@pik-potsdam.de" # required for crossref requests (use your email address)
subjectsFile = '../Data/subjects.json'  # JSON file holding the subjects retrieved so far

# wall-clock start of the script, used to report the total runtime
start_time = time.time()

# LaTeX markup -> plain-text replacements, meant to be passed to replace_all(),
# which applies the entries one after another in insertion order (so the
# order below matters). Backslashes are spelled out explicitly ("\\k"):
# sequences such as "\k" or "\H" are invalid string escapes that Python only
# tolerates with a warning. Note that "\r" really is a carriage return, while
# "\\r" is the two-character LaTeX command.
dic = {
    "{\\k c}": "c",
    "{\\k a}": "a",
    "{": "",
    "}": "",
    "\\r": "",
    "\r": "",
    "\\l": "",
    "\\H": "",
    "\\c": "",
    "\\o": "o",
    "\\k": "",
    "\\O": "O",
    "\\ae": "",
    "\\c ": "",
    "\\u ": "",
    "\\.": "",
    "\\. ": "",
    "\\^": "",
    "\\v ": "",
    "\\v": "",
    "\\` ": "",
    "\\' ": "",
    "\\`": "",
    "\\'": "",
    "\\\"": "",
    "\\~": "",
    "\\\"a": "a",
    "\\\"u": "u",
    "\\\"o": "o",
    "\\ss": "ss",
    "\\'a": "a",
    "\\'o": "o",
    "\\'e": "e",
    "\\'i": "i",
    "\\i": "i",
    "\\l ": "l",
    "\\\\": "",
}

def replace_all(text, dic):
    """Return `text` after applying every replacement listed in `dic`.

    The pairs are processed one after another in the mapping's iteration
    order, each on the result of the previous one, so an earlier
    replacement can affect what a later one matches.
    """
    cleaned = text
    for pattern, substitute in dic.items():
        cleaned = cleaned.replace(pattern, substitute)
    return cleaned


#%% define some functions
def get_subjects_semanticscholar(doi):
    """Return the fields of study Semantic Scholar lists for a paper.

    Parameters
    ----------
    doi : str
        DOI of the paper (without a "DOI:" prefix).

    Returns
    -------
    list
        The paper's fields of study, or an empty list if the request
        fails, the answer cannot be parsed, or no fields are listed.
    """
    # The Academic Graph API addresses papers as "DOI:<doi>" and only returns
    # the fields requested through the `fields` query parameter.
    url = f'https://api.semanticscholar.org/graph/v1/paper/DOI:{doi}'
    data = None
    try:
        # timeout: without one a stalled connection would block forever
        response = requests.get(url, params={'fields': 'fieldsOfStudy'}, timeout=30)
        if response.status_code == 200:
            data = response.json()
    except (requests.RequestException, ValueError):
        # network failure or non-JSON body: reported below like an HTTP error
        data = None

    if not isinstance(data, dict):
        print(f"Error fetching data for DOI: {doi}")
        return []

    # 'fieldsOfStudy' may be null for a paper -> normalise to an empty list
    return data.get('fieldsOfStudy') or []

def get_subjects_crossref(doi):
    """Return the subject list Crossref stores for a DOI.

    Parameters
    ----------
    doi : str
        DOI of the work to look up.

    Returns
    -------
    list
        Value of the `subject` field of the Crossref record, or an empty
        list if the request fails, the answer cannot be parsed, or the
        record has no such field.
    """
    url = f'https://api.crossref.org/works/{doi}'
    # Identify ourselves with the e-mail address configured at the top of the
    # script (module-level `username`); Crossref asks for this via `mailto`.
    params = {'mailto': username} if username else None
    data = None
    try:
        # timeout: without one a stalled connection would block the whole run
        response = requests.get(url, params=params, timeout=30)
        if response.status_code == 200:
            data = response.json()
    except (requests.RequestException, ValueError):
        # network failure or non-JSON body: reported below like an HTTP error
        data = None

    if not isinstance(data, dict):
        print(f"Error fetching data for DOI: {doi}")
        return []

    message = data.get('message')
    if not isinstance(message, dict):
        print(f"Error fetching data for DOI: {doi}")
        return []
    return message.get('subject', [])


#%% import data

# import rp.bib
# NOTE(review): strict mode presumably makes pybtex raise on malformed
# entries instead of only reporting them - confirm against the pybtex docs.
pybtex.errors.set_strict_mode(True)
parser = bibtex.Parser()
bibdata = parser.parse_file("../rp.bib")

# import available results (subjects stored by a previous run, if any)
try:
    with open(subjectsFile, 'r') as f:
        subject = json.load(f)
except FileNotFoundError:
    # first run: nothing has been retrieved yet
    subject = {}
except (OSError, ValueError) as err:
    # unreadable or corrupt file: start over, but say so instead of hiding it
    print(f"Could not read {subjectsFile} ({err}); starting with an empty subject list.")
    subject = {}

#%% retrieve subject for each paper
labels = sorted(bibdata.entries.keys())

bar = ChargingBar('Citations', max=len(bibdata.entries), suffix='%(percent).1f%% - %(eta)ds')


def _save_subjects(subjects, path):
    """Write the subject mapping to `path` as indented JSON, sorted by key."""
    with open(path, 'w') as f:
        json.dump(subjects, f, indent=4, sort_keys=True)


cnt = 0
try:
    for bib_id in labels:
        bar.next()

        # skip papers for which a non-empty subject list is already stored
        if subject.get(bib_id):
            continue

        # entries without a DOI cannot be looked up and are left untouched
        entry_fields = bibdata.entries[bib_id].fields
        if "doi" in entry_fields:
            doi = entry_fields["doi"]
            subject[bib_id] = get_subjects_crossref(doi)

        # checkpoint the results after every 50 processed entries
        cnt += 1
        if cnt % 50 == 0:
            _save_subjects(subject, subjectsFile)
finally:
    # Also reached on Ctrl-C or an unexpected error, so that subjects
    # retrieved since the last checkpoint are not lost.
    bar.finish()
    _save_subjects(subject, subjectsFile)


end_time = time.time()
script_time = end_time - start_time

print(f"Search/ download of subjects in {script_time} sec.")


