import os
import pickle
import re
import time

from geopy.distance import geodesic
from geopy.geocoders import Nominatim

# Initialize the Nominatim geocoder used by get_location_data. The user_agent
# value names this project and gives a contact address.
geolocator = Nominatim(user_agent="Research Project Sociology (info@dummy.eu)")

# Load the geocoding cache (a dict mapping location name -> coordinates or
# None) from 'location_cache.pkl' in the current working directory.
# NOTE(security): pickle.load can execute arbitrary code from the file it
# reads; only use a cache file produced by this script.
try:
    with open('location_cache.pkl', 'rb') as f:
        location_cache = pickle.load(f)
except FileNotFoundError:
    # First run: no cache has been written yet.
    location_cache = {}
except (EOFError, pickle.UnpicklingError) as e:
    # An empty or truncated file (e.g. left behind by an interrupted write)
    # must not prevent the module from loading; start over with an empty
    # cache and let it be rebuilt.
    print(f"Ignoring unreadable location cache: {e}")
    location_cache = {}

# Function to get location data
def get_location_data(location_name):
    """Return ``(latitude, longitude)`` for *location_name*, or ``None``.

    Results are looked up in the module-level ``location_cache`` first. On a
    miss the name is sent to the module-level ``geolocator`` after a
    one-second pause, the outcome (including a ``None`` "not found" result)
    is stored in the cache, and the whole cache is written back to
    'location_cache.pkl'.

    Blank names (``None``, empty or whitespace-only) return ``None``
    immediately without a service request and without touching the cache.

    Exceptions raised by the geocoder or by the file write propagate to the
    caller.
    """
    # A blank name cannot be geocoded; avoid the delay and the request.
    if location_name is None or not location_name.strip():
        return None

    if location_name in location_cache:
        return location_cache[location_name]

    print(f"Attempting to geocode: {location_name}")  # Debug print statement
    time.sleep(1)  # Rate limiting
    location = geolocator.geocode(location_name)

    if location is not None:
        location_data = (location.latitude, location.longitude)
    else:
        location_data = None

    location_cache[location_name] = location_data  # Cache the result

    # Save the cache by writing a temporary file and renaming it over the
    # real one, so an interruption mid-write cannot leave a truncated
    # 'location_cache.pkl' behind.
    tmp_path = 'location_cache.pkl.tmp'
    with open(tmp_path, 'wb') as f:
        pickle.dump(location_cache, f)
    os.replace(tmp_path, 'location_cache.pkl')

    return location_data

# Function to read the input file and split it into event location and text
def read_file(file_path):
    """Read *file_path* and return ``(event_location, text)``.

    ``event_location`` is the first line with surrounding whitespace
    removed; ``text`` is everything after the first line, unchanged.
    An empty file yields ``('', '')``.

    The file is decoded as UTF-8; a leading byte-order mark, if present, is
    discarded so it does not end up inside the event location.

    Raises:
        OSError: if the file cannot be opened.
        UnicodeDecodeError: if the content is not valid UTF-8.
    """
    # 'utf-8-sig' reads plain UTF-8 identically but strips an optional BOM.
    with open(file_path, 'r', encoding='utf-8-sig') as f:
        first_line = f.readline()  # '' for an empty file
        text = f.read()  # The rest of the text
    return first_line.strip(), text

# Function to write the table to a new file
def write_table(file_path, table_data):
    """Write the rows of *table_data* to *file_path*, one row per line.

    The file is created or overwritten and encoded as UTF-8. Each row must
    be a string; a newline is appended to it. Any exception raised while
    opening or writing is caught and reported on standard output instead of
    being propagated. Returns ``None`` in every case.
    """
    try:
        with open(file_path, 'w', encoding='utf-8') as out:
            out.writelines(row + '\n' for row in table_data)
    except Exception as e:
        print(f"An error occurred while writing to the file: {e}")

# Pattern for "##+## Jahre <place>" sequences. Groups: the two two-digit
# numbers and the place name. The place name may contain any word character
# other than decimal digits and underscore (so ß and accented letters are
# accepted, not only ASCII and äöüÄÖÜ), plus whitespace and hyphens. It ends
# lazily before the first other character, a newline, or the end of the text.
_SEQUENCE_PATTERN = re.compile(
    r'(\d{2})\+(\d{2}) Jahre\s*((?:[^\W\d_]|[\s-])+?)(?=[^\wäöüÄÖÜ\s-]|$|\n)'
)


# Function to extract "##+##" sequences and create a table
def create_table_from_text(text, event_location):
    """Build tab-separated rows for every "##+## Jahre <place>" sequence.

    Args:
        text: The text to scan for sequences.
        event_location: Name of the event location; geocoded once via
            ``get_location_data``.

    Returns:
        A list of strings ``"<first>\\t<second>\\t<distance>"``, one per
        sequence in order of appearance. ``<distance>`` is the geodesic
        distance in kilometres between the event location and the place
        named in the sequence, or ``-1`` when either location could not be
        resolved.
    """
    table_data = []
    location_event = get_location_data(event_location)

    for first, second, raw_name in _SEQUENCE_PATTERN.findall(text):
        location_name = raw_name.rstrip()  # Remove any trailing spaces
        # The pattern can capture whitespace only; such a name cannot be
        # geocoded, so skip the lookup instead of querying the service.
        location_visitor = get_location_data(location_name) if location_name else None

        if location_visitor is not None and location_event is not None:
            distance = geodesic(location_event, location_visitor).kilometers
        else:
            distance = -1  # Set to -1 if location is not found

        table_data.append(f"{first}\t{second}\t{distance}")
    return table_data

def extract_c(input_file_path, output_file_path):
    """Run the whole pipeline: read the input, build the table, write it out.

    The input file is split by ``read_file`` into the event location and the
    remaining text, ``create_table_from_text`` turns those into table rows,
    and ``write_table`` stores the rows in *output_file_path*.

    The confirmation message is printed after ``write_table`` returns.
    ``write_table`` handles its own exceptions, so the message appears even
    if writing reported an error.
    """
    location, body = read_file(input_file_path)
    rows = create_table_from_text(body, location)
    write_table(output_file_path, rows)

    print(f"Table has been written to {output_file_path}")

# Example usage
# extract_c("input_file.txt", "output_file.txt")
