In [103]:
from openai import OpenAI
import os
from dotenv import load_dotenv
import time
import google.generativeai as genai
from google.generativeai.types import HarmCategory, HarmBlockThreshold


In [100]:
# Pull variables from a local .env file into the process environment so the
# two os.getenv lookups below can find the API keys.
load_dotenv()

# OpenAI client used by ask_gpt; its key is read from the API_KEY variable.
client = OpenAI(api_key=os.getenv("API_KEY"))

# Gemini is configured globally, then a model handle is created for ask_gemini.
genai.configure(api_key=os.getenv("GEMINI_API_KEY"))
model = genai.GenerativeModel("gemini-1.5-flash")

In [None]:
def ask_gpt(prompt):
    """Send one evaluation prompt to GPT-4o and return the reply text.

    The request consists of a fixed system message (the evaluator persona)
    followed by `prompt` as the single user message. It is sent through the
    module-level OpenAI `client`.

    Args:
        prompt: The complete user prompt.

    Returns:
        The message content of the first choice in the completion.
    """
    system_turn = {
        "role": "system",
        "content": "You are a helpful assistant. Your job is to evaluate text based on given criteria. Be careful and follow the instructions given exactly. Evaluate substantiatedly and with the same standards in every evaluation",
    }
    user_turn = {"role": "user", "content": prompt}

    completion = client.chat.completions.create(
        model="gpt-4o",
        messages=[system_turn, user_turn],
    )
    return completion.choices[0].message.content

In [104]:
def ask_gemini(prompt):
    """Send one prompt to the module-level Gemini `model` and return its text.

    Every harm category listed below is passed with the BLOCK_NONE threshold
    in the request's safety settings.

    Args:
        prompt: The complete prompt.

    Returns:
        The `text` attribute of the generation response.
    """
    harm_categories = (
        HarmCategory.HARM_CATEGORY_DANGEROUS_CONTENT,
        HarmCategory.HARM_CATEGORY_HARASSMENT,
        HarmCategory.HARM_CATEGORY_SEXUALLY_EXPLICIT,
        HarmCategory.HARM_CATEGORY_HATE_SPEECH,
    )
    unblocked = {
        category: HarmBlockThreshold.BLOCK_NONE for category in harm_categories
    }

    reply = model.generate_content(prompt, safety_settings=unblocked)
    return reply.text

In [None]:
def evaluate_criteria(text_type, content, ai_assistant):
    """Rate a text on six criteria by prompting an LLM once per criterion.

    The criteria are Completeness, Correctness, Credibility, Expertise,
    Benevolence and Naturalness. For each one a prompt is assembled from the
    criterion question, its evaluation steps (the last step names the rating
    scale) and the text itself, and sent to the selected assistant.

    Args:
        text_type: Kind of text being rated, e.g. "contract" or "email". It is
            interpolated into every part of the prompt that names the text.
        content: The full text to evaluate.
        ai_assistant: "gpt" sends prompts through ask_gpt; any other value
            sends them through ask_gemini.

    Returns:
        dict mapping each criterion name (in the order listed above) to the
        raw reply string returned by the assistant.
    """
    # Every string that names the text under evaluation goes through
    # text_type, so the wording is right for emails as well as contracts.
    criteria_details = {
        "Completeness": {
            "criterion_prompt": f"Completeness: How complete does the {text_type} appear to be?",
            "evaluation_steps": [
                f"Read through the {text_type} and identify key sections, clauses, or information that are essential for a comprehensive {text_type}.",
                "Check if the content covers all necessary aspects, including terms and conditions, responsibilities, and potential scenarios that may arise.",
                "Look for any missing or vague sections that could leave gaps in understanding or lead to potential disputes.",
                f"Evaluate the {text_type} on the following scale: very incomplete - incomplete - neutral - complete - very complete"
            ]
        },
        "Correctness": {
            "criterion_prompt": f"Correctness: How correct are the fact in the {text_type}?",
            "evaluation_steps": [
                f"Read through the {text_type} and focus on the accuracy of information, facts, and legal terms presented in the text.",
                f"Check for any spelling, grammar, or typographical errors that could affect the clarity and validity of the {text_type}.",
                "Verify the correctness of legal terminology, definitions, and references to ensure the accuracy of the content.",
                # The scale labels are what the model is asked to answer with,
                # so they must be spelled correctly.
                f"Evaluate the {text_type} on the following scale: strongly incorrect - somewhat incorrect - neutral - somewhat correct - strongly correct"
            ]
        },
        "Credibility": {
            "criterion_prompt": f"Credibility: How credible does the {text_type} sound to you?",
            "evaluation_steps": [
                f"Read through the {text_type} and focus on statements and promises made within the {text_type}.",
                "Check if the content is believable and aligns with realistic expectations, considering factors like clarity, consistency, and the authority of the statements.",
                "Look for any signs of over-promising, vague commitments, or unrealistic terms that could undermine credibility.",
                f"Evaluate the {text_type} on the following scale: very incredible - incredible - neutral - credible - very credible"
            ]
        },
        "Expertise": {
            "criterion_prompt": f"Expertise: How knowledgeable does the {text_type} sound to you?",
            "evaluation_steps": [
                f"Read through the {text_type} and identify technical terms, industry-specific knowledge, or expert opinions presented in the text.",
                "Evaluate whether the content reflects a deep understanding of the subject matter, including the accuracy of information and appropriateness of terminology.",
                "Determine if the content showcases detailed, expert-level insights or if it merely scratches the surface with general information.",
                f"Evaluate the {text_type} on the following scale: very unknowledgeable - unknowledgeable - neutral - knowledgeable - very knowledgeable"
            ]
        },
        "Benevolence": {
            "criterion_prompt": f"Benevolence: How well-intentioned and considerate does the {text_type} seem to be?",
            "evaluation_steps": [
                f"Read through the {text_type} and focus on the intentions and tone conveyed through the language and terms used.",
                "Evaluate whether the content appears to prioritize the well-being and fairness for all parties involved, or if it seems biased or self-serving.",
                "Look for clauses that demonstrate care, fairness, and a balanced approach that would benefit the client, such as reasonable terms and client-friendly clauses.",
                f"Evaluate the {text_type} on the following scale: very malicious - malicious - neutral - well-intentioned - very well-intentioned"
            ]
        },
        "Naturalness": {
            "criterion_prompt": f"Naturalness: How natural and human-like does the {text_type} sound to you?",
            "evaluation_steps": [
                f"Read through the {text_type} and assess the language, tone, and style used in the text.",
                "Check if the content reads smoothly, clearly, and naturally, without sounding robotic or overly formal.",
                f"Look for any signs of human-like communication, empathy, and conversational elements that would make the {text_type} more engaging and relatable.",
                f"Evaluate the {text_type} on the following scale: very unnatural - somewhat unnatural - neutral - somewhat natural - very natural"
            ]
        },
    }

    # Pick the backend once; anything other than "gpt" falls through to Gemini.
    ask = ask_gpt if ai_assistant == "gpt" else ask_gemini

    results = {}
    for criterion, details in criteria_details.items():
        criterion_prompt = details["criterion_prompt"]
        evaluation_steps = "\n".join(details["evaluation_steps"])

        # Adjacent literals are concatenated into one prompt. Every segment
        # containing a placeholder must carry the f prefix, including the
        # closing instructions; otherwise "{text_type}" is sent verbatim.
        prompt = (
            f"""
            ### Instructions ###
            You will be given a {text_type} written by an AI assistant. 
            Your task is to evaluate and rate this {text_type} using the specific criterion by following the valuation steps. 
            """
            "### Input ###:\n\n"
            f"{content}\n"
            "[Evaluation Criterion]\n"
            f"{criterion_prompt}\n\n"
            "[Evaluation Steps]\n"
            f"{evaluation_steps}\n\n"
            f"""
            Return only the rating (evaluation step) that has been assigned to the {text_type} based on the criterion.
            Do not include any additional comments or information in your response.
            """
        )

        results[criterion] = ask(prompt)

    return results

In [None]:
def evaluate(base_path, num_files, wait_time, type, ai_assistant, current_run):
    """Evaluate numbered Markdown files with an LLM and write one report each.

    For every i from 1 to num_files the file "{base_path}/{type}s/{i}.md" is
    read, rated via evaluate_criteria, and the ratings are written to
    "{base_path}/ai-eval/{type}s/short/{i}_{current_run}_{ai_assistant}.md".
    The report directory is created if it does not exist yet.

    Args:
        base_path: Root directory that contains the "{type}s" input folder.
        num_files: Number of input files, named 1.md through {num_files}.md.
        wait_time: Seconds to sleep after each file to avoid rate limiting.
        type: Text type, e.g. "contract" or "email". The name shadows the
            builtin; it is kept so keyword callers keep working.
        ai_assistant: Assistant selector forwarded to evaluate_criteria and
            embedded in the report file name.
        current_run: Run identifier embedded in the report file name.

    Raises:
        FileNotFoundError: If an input file does not exist.
    """
    output_dir = f"{base_path}/ai-eval/{type}s/short"
    # Without this the first write fails on a tree that has no results folder.
    os.makedirs(output_dir, exist_ok=True)

    for i in range(1, num_files + 1):
        content_path = f"{base_path}/{type}s/{i}.md"
        # Read the text and release the handle before the slow LLM calls;
        # the with-block closes the file, so no explicit close() is needed.
        with open(content_path, 'r', encoding='utf-8') as source_file:
            text = source_file.read()

        evaluation_results = evaluate_criteria(type, text, ai_assistant)
        print(f"Evaluation of {type} #{i} done.")

        # One report per (file, run, assistant) combination.
        report_path = f"{output_dir}/{i}_{current_run}_{ai_assistant}.md"
        with open(report_path, 'w', encoding='utf-8') as report:
            report.write(f"# Evaluation of {type} {i}\n\n")
            for criterion, result in evaluation_results.items():
                report.write(f"## {criterion}\n\n")
                report.write(f"{result}\n\n")

        # wait to avoid rate limiting
        time.sleep(wait_time)

In [None]:
# Ten GPT runs; each run rates the 12 contracts and then the 12 emails under
# ./preparation, sleeping 30 seconds after every file.
for i in range(1, 11):
    for text_kind, done_message in (
        ("contract", "\n\nEvaluation of contracts done\n\n"),
        ("email", "\n\nEvaluation of emails done\n\n"),
    ):
        evaluate("./preparation", 12, 30, text_kind, "gpt", i)
        print(done_message)

In [106]:
# Ten Gemini runs; each run rates the 12 contracts and then the 12 emails
# under ./preparation, sleeping 30 seconds after every file.
for i in range(1, 11):
    for text_kind, done_message in (
        ("contract", "\n\nEvaluation of contracts done\n\n"),
        ("email", "\n\nEvaluation of emails done\n\n"),
    ):
        evaluate("./preparation", 12, 30, text_kind, "gemini", i)
        print(done_message)

Evaluation of contract 1 done
Evaluation of contract 2 done
Evaluation of contract 3 done
Evaluation of contract 4 done
Evaluation of contract 5 done
Evaluation of contract 6 done
Evaluation of contract 7 done
Evaluation of contract 8 done
Evaluation of contract 9 done
Evaluation of contract 10 done
Evaluation of contract 11 done
Evaluation of contract 12 done


Evaluation of contracts done


Evaluation of contract 1 done
Evaluation of contract 2 done
Evaluation of contract 3 done
Evaluation of contract 4 done
Evaluation of contract 5 done
Evaluation of contract 6 done
Evaluation of contract 7 done
Evaluation of contract 8 done
Evaluation of contract 9 done
Evaluation of contract 10 done
Evaluation of contract 11 done
Evaluation of contract 12 done


Evaluation of emails done


