import numpy as np
import pandas as pd
import pdfplumber
# import spacy
import re
import os
import time
import json


#########################
from openai import OpenAI
#############################################
## Variables
# Prefer the OPENAI_API_KEY environment variable so the secret does not have to
# be committed to source control; the original placeholder is kept as the
# fallback so existing "paste your key here" usage keeps working.
api_key = os.environ.get("OPENAI_API_KEY") or 'API Key Here'
# Chat-completions endpoint URL (kept for reference; not used in this section).
ChatGPT_baseURL = 'https://api.openai.com/v1/chat/completions'
# Model name passed to the chat completion request in query_GPT().
GPT_MODEL = "gpt-4o"
############################################


# Startup banner: two lines written to stdout as soon as the module is executed.
print("all well", "start", sep="\n")

## Functions

def extract_text_from_pdf(pdf_path, output_txt_path):
    """Extract the text of every page of a PDF and save it as a .txt file.

    Args:
        pdf_path: Path of the PDF file to open with ``pdfplumber``.
        output_txt_path: Path of the UTF-8 text file to (over)write with the
            extracted text.

    Returns:
        The extracted text: each page's text followed by a newline, in page
        order. Pages that yield no text contribute just the newline.
    """
    page_texts = []
    with pdfplumber.open(pdf_path) as pdf:
        for page in pdf.pages:
            # extract_text() may return None for a page without a text layer
            # (e.g. scanned images, depending on the pdfplumber version);
            # treat that as an empty page instead of failing on None + str.
            page_texts.append(page.extract_text() or "")

    # One newline after every page, matching the original output layout.
    text = "".join(f"{page_text}\n" for page_text in page_texts)

    # Save extracted text to a .txt file
    with open(output_txt_path, "w", encoding="utf-8") as txt_file:
        txt_file.write(text)

    return text


def query_GPT(m):
    """Send ``m`` to the chat model as a single user message and return the reply.

    Relies on the module-level ``client`` and ``GPT_MODEL`` names being
    defined before the call.

    Args:
        m: The full prompt text, sent as the content of one "user" message.

    Returns:
        The message content of the first choice in the completion response.
    """
    completion = client.chat.completions.create(
        model=GPT_MODEL,
        messages=[{"role": "user", "content": m}],
        temperature=1,
    )
    first_choice = completion.choices[0]
    return first_choice.message.content


## Main
if __name__ == "__main__":
    # Assigned at module scope (only when run as a script), so query_GPT()
    # finds this object through its global `client` lookup.
    client = OpenAI(api_key=api_key)

    # Instruction text placed in front of each paper's extracted text before
    # it is sent to the model: the entity labels, their definitions, and the
    # requested JSON-only output format.
    prompt = '''
    Task Overview:
    Given the extracted text from a research paper, identify and extract metadata related to every sensor device used in this study. The extracted information should be categorized into predefined entity labels. Ensure that the information is extracted accurately and presented in a structured JSON format:
    Entity Labels and Their Definitions:
    model_name:	The term by which the instrument is known. This could be a trade name or an alias.
    model_id:	The unique identifier used to differentiate each model of an instrument made by certain manufacturer.
    version_number:	The current version of the instrument model. It differentiates instruments within the same model. The usually refers to a version of the hardware.
    mobility:	Whether the instrument can be moved around for measuring the species , A boolean indication (Yes/No).
    measured_entities:	List of all the types of measurement done by the instrument
    firmware_software_version:	Current firmware or software version of the model.
    instrument_type:	The category of instrument based on the species measured by the instrument.
    manufacturer:	The person, group, or organization that develops or produces the instrument.
    patent_number:	The serial number of the patent, if the instrument is patented.
    patent_issued_country:	The country issuing the patent.
    dimensions:	The size of the instrument in physical space. The dimension could have attributes of depth, height, and length. Each dimension includes a value and unit. (Use this if the dimensions aren’t available discretely, else use the below fields.)
    dimension_depth:	The depth or thickness of the instrument.
    dimension_height:	The vertical height of the instrument.
    dimension_length:	The horizontal length of the instrument.
    composition:	The description of the composition of combining parts or elements making up of the instrument.
    price:	The cost of the instrument. This could be a potential price or price range of the instrument, such as the manufacturer recommended price, actual price, or price range to purchase the instrument.
    price_type:	Whether the price/price range is the potential price or actual price to purchase the instrument.
    indoor_outdoor_use:	Whether the instrument is intended to be used inside a building or structure that is protected from the natural environment. Or, if the instrument can be used outdoors and can tolerate exposure to the natural environment.
    is_personal_device:	Whether or not the instrument is intended to be used to and track information for individuals.
    is_wearable_device:	Whether or not the instrument can be worn by individuals on their body or carried, and track information. A boolean indication (Yes/No)
    is_water_or_splash_proof:	Whether or not the instrument can tolerate exposure to water. A boolean indication (Yes/No).
    needs_power_source:	Whether or not the instrument needs a source of power for its normal function. If power is needed, the type of power should be listed. See "Source of Power."
    power_source:	The type of power that supports the instrument for its normal function/s.
    battery_operation_time_limit:	The duration of battery life, if the "source of power" is battery.
    battery_capacity:	The amount of electric charge the battery can deliver at the rated voltage.
    output_voltage:	The voltage released by the battery.
    is_rechargeable:	Whether or not the battery's electric charge can be restored by connecting the battery to a recharging device. A boolean indication (Yes/No).
    battery_type:	The category of battery, based on the chemical used in the battery's electrochemical cells.
    charger	If the battery is rechargeable, this element is used to describe the charger.
    time_to_full_charge:	The time taken to recharge the battery.
    has_display:	Whether or not the instrument is capable of displaying information. If yes, more information can be recorded in the following data element, such as how many monitors, and what type of monitors does it have.
    number_of_displays:	The number of displays with the instrument.
    display_type:	The category of the monitor used to display information.
    warranty_time:	The length of time covered by the instrument's warranty.
    warranty_condition:	The facts or conditions under which the warranty is valid.
    lifetime_of_device:	The duration of time during which the instrument is expected to function properly according to the manufacturer.
    recommended_maintenance_method:	The method suggested for maintaining the instrument.
    recommended_maintenance_frequency:	The frequency at which the maintenance should be repeated.
    Instructions:
    Extract relevant metadata from the provided text file, ensuring accuracy in categorization.
    If information is missing, return "N/A" for that field.
    Output the extracted metadata in a json format. Dont include additional information. I only need the Json structured metadata for the sensors mentioned in this paper. 
'''

    # Batch driver: for every PDF in `folder_path`, extract its text, query the
    # model with prompt + text, and record the raw response together with the
    # elapsed time in an Excel sheet.
    folder_path = 'Path to the folder containing the papers in PDF'
    output_xlsx_path = './Meta_data_json/all_papers_instrument.xlsx'
    # Create the output directory up front; otherwise the first to_excel()
    # call fails only after a (paid) model query has already been made.
    os.makedirs(os.path.dirname(output_xlsx_path), exist_ok=True)

    files_name = []
    gpt_res = []
    execution_time = []
    for file_name in os.listdir(folder_path):
        print(file_name)
        if not file_name.lower().endswith(".pdf"):  # Only PDF files are processed
            continue

        pdf_file_path = os.path.join(folder_path, file_name)
        # The extracted text is saved next to the PDF, same name with .txt.
        text_path = os.path.splitext(pdf_file_path)[0] + '.txt'

        # The recorded time covers both the PDF extraction and the model query.
        start_time = time.time()
        extracted_text = extract_text_from_pdf(pdf_file_path, text_path)
        m = prompt + ' The following is the text for the research paper: ' + "\n" + extracted_text
        res = query_GPT(m)
        stop_time = time.time()
        print(res)

        # NOTE: the raw response is stored as-is; no JSON extraction or
        # validation is performed on it here.
        files_name.append(file_name)
        gpt_res.append(res)
        execution_time.append(stop_time - start_time)

        # Rewrite the sheet after every paper so partial results survive a
        # failure on a later file.
        df = pd.DataFrame({
            "Filename": files_name,
            "GPT Response": gpt_res,
            "execution_time": execution_time,
        })
        df.to_excel(output_xlsx_path)
        print(df)

    # Build the final frame from the accumulated lists so `df` is defined even
    # when the folder contained no PDF files (the sheet then has headers only).
    df = pd.DataFrame({
        "Filename": files_name,
        "GPT Response": gpt_res,
        "execution_time": execution_time,
    })
    df.to_excel(output_xlsx_path)