import os
from gaussia.metrics.context import Context
from gaussia.metrics.conversational import Conversational
from gaussia.core.retriever import Retriever
from gaussia.schemas.common import Dataset, Batch
from langchain_openai import ChatOpenAI
# 1. Define your retriever
class CustomerServiceRetriever(Retriever):
    """Retriever that serves a fixed, in-memory customer-service eval dataset."""

    def load_dataset(self) -> list[Dataset]:
        """Build and return the single evaluation session.

        Returns:
            A one-element list containing the scripted e-commerce session.
        """
        # Two scripted Q/A turns, each paired with a ground-truth reference answer.
        turns = [
            Batch(
                qa_id="q1",
                query="I want to return a product I bought last week.",
                assistant="I'd be happy to help with your return! You can return items within 30 days of purchase. Please visit our Returns Portal with your order number ready.",
                ground_truth_assistant="Items can be returned within 30 days. Use the Returns Portal.",
            ),
            Batch(
                qa_id="q2",
                query="What's your phone number for support?",
                assistant="You can reach our support team at 1-800-EXAMPLE. We're available Monday through Friday, 9 AM to 5 PM EST.",
                ground_truth_assistant="Support: 1-800-EXAMPLE, Mon-Fri 9-5 EST.",
            ),
        ]
        session = Dataset(
            session_id="cs-eval-001",
            assistant_id="customer-service-bot",
            language="english",
            context="You are a helpful customer service assistant for an e-commerce store.",
            conversation=turns,
        )
        return [session]
# 2. Initialize the judge model
# temperature=0.0 keeps the judge's scoring as deterministic as possible.
openai_api_key = os.getenv("OPENAI_API_KEY")
judge = ChatOpenAI(model="gpt-4o-mini", api_key=openai_api_key, temperature=0.0)
# 3. Run Context metric
print("=== Context Evaluation ===")
# Context.run loads the dataset via the retriever and scores it with `judge`.
context_results = Context.run(
    CustomerServiceRetriever,
    model=judge,
    use_structured_output=True,
    verbose=True,
)
for result in context_results:
    print(f"Context Awareness: {result.context_awareness:.2f}")
# 4. Run Conversational metric
print("\n=== Conversational Evaluation ===")
# Same retriever and judge model, scored against conversational-quality metrics.
conv_results = Conversational.run(
    CustomerServiceRetriever,
    model=judge,
    use_structured_output=True,
    verbose=True,
)
for result in conv_results:
    # Metric objects expose aggregate stats; report the mean on a 0-10 scale.
    print(f"Quality: {result.conversational_quality_maxim.mean:.1f}/10")
    print(f"Sensibleness: {result.conversational_sensibleness.mean:.1f}/10")