import pandas as pd
import numpy as np
import json
from sklearn.ensemble import GradientBoostingClassifier, RandomForestClassifier
from sklearn.model_selection import StratifiedKFold
from sklearn.metrics import precision_score, recall_score, accuracy_score, f1_score, confusion_matrix
from sklearn.preprocessing import StandardScaler
from sklearn.pipeline import Pipeline
import re

# Load the input table from CSV (point this at your own copy of the file)
df = pd.read_csv('../embed/similarity_results_all_simple.csv')

# Task-specific keyword list; edit these entries to suit your own task.
# Hyphenated and spaced spellings are listed separately because matching
# elsewhere in this script is plain substring search.
KEYWORDS = [
    'last mile',
    'truck drone',
    'parcel',
    'logistics',
    'drone delivery',
    'delivery services',
    'traveling salesman',
    'truck-drone',
    'last-mile',
]

print(f"Keywords used ({len(KEYWORDS)}): {KEYWORDS}")


# Read JSONL file to get TI and AB information
def load_jsonl(file_path):
    """Load a JSON Lines file into a list of parsed objects.

    Each non-empty line of the file is parsed as one JSON document. Blank or
    whitespace-only lines (for example a trailing newline at the end of the
    file) are skipped instead of being passed to the JSON parser.

    Args:
        file_path: Path of the UTF-8 encoded ``.jsonl`` file.

    Returns:
        A list with one parsed object per non-empty line, in file order.

    Raises:
        OSError: If the file cannot be opened.
        json.JSONDecodeError: If a non-empty line is not valid JSON.
    """
    data = []
    with open(file_path, 'r', encoding='utf-8') as f:
        for line in f:
            line = line.strip()
            if not line:
                # json.loads('') raises, so tolerate empty separator lines.
                continue
            data.append(json.loads(line))
    return data


# Load the per-record metadata (adjust the path to your actual data.jsonl file)
jsonl_data = load_jsonl('../data/data.jsonl')

# Index the records by their 'UT' value; records without a 'UT' key are
# ignored, and a later record with the same 'UT' replaces an earlier one.
id_to_data = {}
for record in jsonl_data:
    if 'UT' in record:
        id_to_data[record['UT']] = record
print(f"Successfully loaded {len(id_to_data)} records from JSONL file")


# Keyword feature extraction function
def extract_keyword_features(ti, ab, keywords):
    """Build keyword presence and count features for one title/abstract pair.

    Matching is a case-insensitive substring test. A falsy or NaN text is
    treated as an empty string. No manual weights are applied; the features
    are left for the downstream algorithm to weigh.

    Args:
        ti: Title text (may be missing).
        ab: Abstract text (may be missing).
        keywords: List of keyword strings to look for.

    Returns:
        A dict with, in this order: ``ti_contains_any`` and
        ``ab_contains_any`` (0/1 flags), ``ti_keyword_count`` and
        ``ab_keyword_count`` (number of distinct keywords found),
        ``total_keyword_count`` (their sum) and ``both_contain`` (1 only when
        both flags are 1).
    """
    def _as_lower_text(value):
        # Falsy values and NaN both mean "no text available".
        if not value or pd.isna(value):
            return ""
        return str(value).lower()

    title_text = _as_lower_text(ti)
    abstract_text = _as_lower_text(ab)
    needles = [kw.lower() for kw in keywords]

    # One pass per text yields the counts; the 0/1 flags follow from them.
    title_hits = sum(1 for needle in needles if needle in title_text)
    abstract_hits = sum(1 for needle in needles if needle in abstract_text)
    title_flag = int(title_hits > 0)
    abstract_flag = int(abstract_hits > 0)

    return {
        'ti_contains_any': title_flag,
        'ab_contains_any': abstract_flag,
        'ti_keyword_count': title_hits,
        'ab_keyword_count': abstract_hits,
        'total_keyword_count': title_hits + abstract_hits,
        'both_contain': int(title_flag == 1 and abstract_flag == 1),
    }


# All-zero feature row used when a CSV row has no matching JSONL record.
# It must carry every key that extract_keyword_features returns, in the same
# order: a missing key would become NaN in the feature DataFrame and then be
# fed to the classifiers as a feature value.
missing_record_features = {
    'ti_contains_any': 0,
    'ab_contains_any': 0,
    'ti_keyword_count': 0,
    'ab_keyword_count': 0,
    'total_keyword_count': 0,
    'both_contain': 0,
}

keyword_features_list = []

for ut in df['UT']:
    data_item = id_to_data.get(ut)
    if data_item is None:
        # No corresponding record found: fall back to the all-zero row
        # (copied so the rows never share one dict object).
        features = dict(missing_record_features)
    else:
        ti = data_item.get('TI', '')
        # The 'Qwen7B_summary' field is what this script uses as abstract text.
        ab = data_item.get('Qwen7B_summary', '')
        features = extract_keyword_features(ti, ab, KEYWORDS)
    keyword_features_list.append(features)

# Add keyword features to DataFrame. Reusing df's index makes the column-wise
# concat a strict row-by-row join even if df does not have a default index.
keyword_features_df = pd.DataFrame(keyword_features_list, index=df.index)
df = pd.concat([df, keyword_features_df], axis=1)

# Each derived "*_max" column is the row-wise maximum over one group of
# existing score columns; the groups are processed in this order.
max_feature_sources = {
    'summary_core_max': ['summary_core_0', 'summary_core_1', 'summary_core_2', 'summary_core_3'],
    'summary_core1_max': ['summary_core1_0', 'summary_core1_1', 'summary_core1_2'],
    'TI_core_max': ['TI_core_0', 'TI_core_1', 'TI_core_2', 'TI_core_3'],
    'TI_core1_max': ['TI_core1_0', 'TI_core1_1', 'TI_core1_2'],
}

# Base features start with the raw probability and grow by one max column per group
base_features = ['Probability_0']
for max_name, source_cols in max_feature_sources.items():
    df[max_name] = df[source_cols].max(axis=1)
    base_features.append(max_name)

# Names of the keyword feature columns
keyword_feature_names = keyword_features_df.columns.tolist()

# The three feature sets that are compared
features_prob_only = ['Probability_0']  # Probability_0 on its own
features_base = base_features  # every base feature
features_base_keywords = [*base_features, *keyword_feature_names]  # base + keyword features

# Target labels and record identifiers
y = df['True_Label']
ids = df['UT']

section_rule = "=" * 80
print("\n" + section_rule)
print("Performance comparison of methods")
print(section_rule)

# 1. Baseline that looks at the probability column only
print("\n1. Method considering only Probability_1")
print("-" * 40)

# Predict class 1 whenever Probability_0 is at most 0.5
prob1_predictions = df['Probability_0'].le(0.5).astype(int)

# Evaluate the four metrics in a fixed order: accuracy, precision, recall, F1
prob1_accuracy, prob1_precision, prob1_recall, prob1_f1 = (
    metric(y, prob1_predictions)
    for metric in (accuracy_score, precision_score, recall_score, f1_score)
)

for metric_label, metric_value in (
    ("Accuracy", prob1_accuracy),
    ("Precision", prob1_precision),
    ("Recall", prob1_recall),
    ("F1 Score", prob1_f1),
):
    print(f"{metric_label}: {metric_value:.4f}")

# Sections 2-5 run the same out-of-fold evaluation for two model types on two
# feature sets, so the shared procedure lives in the helpers below.


def _cross_validated_predictions(make_model, X, target, splitter):
    """Return out-of-fold predictions and positive-class probabilities.

    For every fold a fresh model from ``make_model()`` is fitted on the fold's
    training rows and used to score only the fold's held-out rows.

    Args:
        make_model: Zero-argument callable returning an unfitted classifier
            that provides ``fit``, ``predict`` and ``predict_proba``.
        X: DataFrame holding the feature columns.
        target: Series of labels, positionally aligned with ``X``.
        splitter: Object whose ``split(X, target)`` yields pairs of positional
            ``(train_idx, test_idx)`` index arrays.

    Returns:
        Tuple ``(predictions, probabilities)`` of float arrays of length
        ``len(X)``; rows never placed in a test fold keep the initial 0.
    """
    predictions = np.zeros(len(X))
    probabilities = np.zeros(len(X))
    for train_idx, test_idx in splitter.split(X, target):
        model = make_model()
        model.fit(X.iloc[train_idx], target.iloc[train_idx])
        X_test = X.iloc[test_idx]
        predictions[test_idx] = model.predict(X_test)
        # Column 1 of predict_proba is used as the positive-class probability
        probabilities[test_idx] = model.predict_proba(X_test)[:, 1]
    return predictions, probabilities


def _report_metrics(target, predictions):
    """Print accuracy, precision, recall and F1, and return them in that order."""
    scores = (
        accuracy_score(target, predictions),
        precision_score(target, predictions),
        recall_score(target, predictions),
        f1_score(target, predictions),
    )
    for label, value in zip(("Accuracy", "Precision", "Recall", "F1 Score"), scores):
        print(f"{label}: {value:.4f}")
    return scores


def _new_gradient_boosting():
    """Create the Gradient Boosting classifier used in every GB fold."""
    return GradientBoostingClassifier(random_state=42)


def _new_random_forest():
    """Create the Random Forest classifier used in every RF fold."""
    return RandomForestClassifier(n_estimators=100, random_state=42)


# The same seeded splitter is reused for every experiment
cv = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)

# 2. Gradient Boosting method using all base features
print("\n2. Gradient Boosting method using all base features")
print("-" * 40)

gb_base_predictions, gb_base_probabilities = _cross_validated_predictions(
    _new_gradient_boosting, df[features_base], y, cv
)
gb_base_accuracy, gb_base_precision, gb_base_recall, gb_base_f1 = _report_metrics(
    y, gb_base_predictions
)

# 3. Gradient Boosting method using base features + keyword features
print("\n3. Gradient Boosting method using base features + keyword features")
print("-" * 40)

gb_keyword_predictions, gb_keyword_probabilities = _cross_validated_predictions(
    _new_gradient_boosting, df[features_base_keywords], y, cv
)
gb_keyword_accuracy, gb_keyword_precision, gb_keyword_recall, gb_keyword_f1 = _report_metrics(
    y, gb_keyword_predictions
)

# 4. Random Forest method using all base features
print("\n4. Random Forest method using all base features")
print("-" * 40)

rf_base_predictions, rf_base_probabilities = _cross_validated_predictions(
    _new_random_forest, df[features_base], y, cv
)
rf_base_accuracy, rf_base_precision, rf_base_recall, rf_base_f1 = _report_metrics(
    y, rf_base_predictions
)

# 5. Random Forest method using base features + keyword features
print("\n5. Random Forest method using base features + keyword features")
print("-" * 40)

rf_keyword_predictions, rf_keyword_probabilities = _cross_validated_predictions(
    _new_random_forest, df[features_base_keywords], y, cv
)
rf_keyword_accuracy, rf_keyword_precision, rf_keyword_recall, rf_keyword_f1 = _report_metrics(
    y, rf_keyword_predictions
)

# Output confusion matrix (for the Random Forest with base + keyword features)
print("\nConfusion Matrix:")
print(confusion_matrix(y, rf_keyword_predictions))

# Summary table comparing all five methods
print("\n" + "=" * 80)
print("Performance Comparison Summary")
print("=" * 80)

metric_columns = ['Accuracy', 'Precision', 'Recall', 'F1 Score']

# One row per method: label followed by accuracy, precision, recall, F1
summary_rows = [
    ('Method considering only Probability_1',
     prob1_accuracy, prob1_precision, prob1_recall, prob1_f1),
    ('GB with base features',
     gb_base_accuracy, gb_base_precision, gb_base_recall, gb_base_f1),
    ('GB with base + keyword features',
     gb_keyword_accuracy, gb_keyword_precision, gb_keyword_recall, gb_keyword_f1),
    ('RF with base features',
     rf_base_accuracy, rf_base_precision, rf_base_recall, rf_base_f1),
    ('RF with base + keyword features',
     rf_keyword_accuracy, rf_keyword_precision, rf_keyword_recall, rf_keyword_f1),
]
comparison_df = pd.DataFrame(summary_rows, columns=['Method', *metric_columns])

# Best accuracy first; sort while the values are still numeric
comparison_df = comparison_df.sort_values('Accuracy', ascending=False)

# Render every metric as a four-decimal string for display and export
for metric_column in metric_columns:
    comparison_df[metric_column] = comparison_df[metric_column].map('{:.4f}'.format)

print(comparison_df.to_string(index=False))

# Write per-row predictions and the summary table to disk
print("\nSaving results to files...")

# New output columns, in the order they are appended to the copy of df
prediction_columns = {
    'Prob1_Prediction': prob1_predictions,
    'GB_Base_Prediction': gb_base_predictions,
    'GB_Base_Probability': gb_base_probabilities,
    'GB_Keyword_Prediction': gb_keyword_predictions,
    'GB_Keyword_Probability': gb_keyword_probabilities,
    'RF_Base_Prediction': rf_base_predictions,
    'RF_Base_Probability': rf_base_probabilities,
    'RF_Keyword_Prediction': rf_keyword_predictions,
    'RF_Keyword_Probability': rf_keyword_probabilities,
}

results_df = df.copy()
for column_name, column_values in prediction_columns.items():
    results_df[column_name] = column_values

# Complete per-row results
results_df.to_csv('keyword_as_feature_comparison.csv', index=False, encoding='utf-8-sig')

# Performance comparison table
comparison_df.to_csv('performance_comparison_keyword_feature.csv', index=False, encoding='utf-8-sig')

print("Results saved to keyword_as_feature_comparison.csv and performance_comparison_keyword_feature.csv")

# Print every example that the Probability_0 threshold gets wrong but the
# Random Forest (base + keyword features) gets right.
print("\nExamples where Probability_0 prediction is incorrect but RF (base + keywords) prediction is correct:")

# Boolean mask: baseline prediction differs from the label AND RF prediction matches it
error_prob_correct_rf = (prob1_predictions != y) & (rf_keyword_predictions == y)

# Loop-invariant lookup tables, built once.
# For each "*_max" feature: the source columns (in order) and the term each
# column is reported as in the printed output.
score_groups = [
    ('summary_core_max', {
        'summary_core_0': 'logistics',
        'summary_core_1': 'supply chain',
        'summary_core_2': 'package delivery',
        'summary_core_3': 'warehouse',
    }),
    ('summary_core1_max', {
        'summary_core1_0': 'drone',
        'summary_core1_1': 'Unmanned Aerial Vehicle',
        'summary_core1_2': 'Unmanned Aerial System',
    }),
    ('TI_core_max', {
        'TI_core_0': 'logistics',
        'TI_core_1': 'supply chain',
        'TI_core_2': 'package delivery',
        'TI_core_3': 'warehouse',
    }),
    ('TI_core1_max', {
        'TI_core1_0': 'drone',
        'TI_core1_1': 'Unmanned Aerial Vehicle',
        'TI_core1_2': 'Unmanned Aerial System',
    }),
]
# Keywords paired with their lower-cased form for case-insensitive matching
keyword_pairs = [(kw, kw.lower()) for kw in KEYWORDS]

for idx in np.flatnonzero(np.asarray(error_prob_correct_rf)):
    row = df.iloc[idx]
    ut = row['UT']

    # An empty dict stands in for a missing record so every lookup below
    # falls back to its default instead of touching a stale variable.
    record = id_to_data.get(ut, {})
    title = record.get('TI', 'N/A')
    abstract = record.get('Qwen7B_summary', 'N/A')  # the summary field serves as the abstract

    # idx is a position, so use positional access on the Series as well
    prob0_pred = "True" if prob1_predictions.iloc[idx] == 1 else "False"
    true_label = y.iloc[idx]
    total_kw_count = row['total_keyword_count']

    # Recompute which keywords matched; dict.fromkeys de-duplicates while
    # keeping KEYWORDS order, so the printed list is identical on every run.
    ti_lower = str(record.get('TI', '')).lower()
    ab_lower = str(record.get('Qwen7B_summary', '')).lower()
    matched_keywords = list(dict.fromkeys(
        kw for kw, kw_lower in keyword_pairs
        if kw_lower in ti_lower or kw_lower in ab_lower
    ))

    print(f"\nExample ID: {ut}")
    print(f"Title: {title}")
    print(f"Abstract: {abstract}")
    print(f"MPNet prediction: {prob0_pred}")
    print(f"True label: {true_label}")
    print(f"total_keyword_count: {total_kw_count}")
    print(f"Matched keywords: {', '.join(matched_keywords) if matched_keywords else 'None'}")

    # For each max feature, report its value and the term of the column it came from
    for max_name, column_terms in score_groups:
        source_cols = list(column_terms)
        # The row slice is object-dtype when df mixes text and numeric
        # columns, so convert to a float array before taking the argmax.
        scores = row[source_cols].to_numpy(dtype=float)
        best_col = source_cols[int(np.argmax(scores))]
        print(f"{max_name}: {row[max_name]:.4f} (max from {column_terms[best_col]})")

print("\nAnalysis complete!")