The first step is to create a retriever that loads your conversation data. A retriever is a class that inherits from `Retriever` and implements the `load_dataset()` method.
from gaussia.core.retriever import Retriever
from gaussia.schemas.common import Dataset, Batch


class MyRetriever(Retriever):
    """Custom retriever to load your AI conversation data."""

    def load_dataset(self) -> list[Dataset]:
        """Return the conversation data to evaluate.

        Returns:
            A list holding one ``Dataset`` (session ``evaluation-session-1``)
            whose ``conversation`` contains two ``Batch`` entries. Each batch
            pairs a user query with the assistant's answer and a ground-truth
            answer.
        """
        return [
            Dataset(
                session_id="evaluation-session-1",
                assistant_id="my-assistant-v1",
                language="english",
                context="You are a helpful customer service assistant.",
                conversation=[
                    Batch(
                        qa_id="q1",
                        query="What are your return policies?",
                        assistant="Our return policy allows returns within 30 days...",
                        ground_truth_assistant="Returns are accepted within 30 days with receipt.",
                    ),
                    Batch(
                        qa_id="q2",
                        query="How can I track my order?",
                        assistant="You can track your order by logging into your account...",
                        ground_truth_assistant="Log into your account and visit Order History.",
                    ),
                ],
            )
        ]
Each metric returns a list of results. The structure depends on the metric type:
Context
Toxicity
Conversational
# Context results: `metrics` is the list of results returned by the Context
# metric. Print one summary per session, then one line per interaction.
for metric in metrics:
    print(f"Session: {metric.session_id}")
    print(f"Score: {metric.context_awareness}")  # 0-1 scale
    # Per-interaction breakdown, formatted to two decimals.
    for interaction in metric.interactions:
        print(f" [{interaction.qa_id}] {interaction.context_awareness:.2f}")
# Toxicity results: `metrics` is the list of results returned by the Toxicity
# metric. Print the cluster profiling for every session.
for metric in metrics:
    print(f"Session: {metric.session_id}")
    print(f"Cluster Profiling: {metric.cluster_profiling}")
    # group_profiling may be empty/None; only read the DIDT score when present.
    if metric.group_profiling:
        gp = metric.group_profiling.frequentist
        print(f"DIDT Score: {gp.DIDT}")
# Conversational results: `metrics` is the list of results returned by the
# Conversational metric. Each score's mean is printed with one decimal place
# and a "/10" suffix.
for metric in metrics:
    print(f"Session: {metric.session_id}")
    print(f"Quality Maxim: {metric.conversational_quality_maxim.mean:.1f}/10")
    print(f"Sensibleness: {metric.conversational_sensibleness.mean:.1f}/10")
Here’s a complete example that evaluates an AI assistant using multiple metrics:
import os

from gaussia.metrics.context import Context
from gaussia.metrics.conversational import Conversational
from gaussia.core.retriever import Retriever
from gaussia.schemas.common import Dataset, Batch
from langchain_openai import ChatOpenAI


# 1. Define your retriever
class CustomerServiceRetriever(Retriever):
    """Retriever that supplies one hard-coded customer-service session."""

    def load_dataset(self) -> list[Dataset]:
        """Return the conversation data to evaluate.

        Returns:
            A list holding one ``Dataset`` (session ``cs-eval-001``) whose
            ``conversation`` contains two ``Batch`` entries, each with a
            query, the assistant's answer, and a ground-truth answer.
        """
        return [
            Dataset(
                session_id="cs-eval-001",
                assistant_id="customer-service-bot",
                language="english",
                context="You are a helpful customer service assistant for an e-commerce store.",
                conversation=[
                    Batch(
                        qa_id="q1",
                        query="I want to return a product I bought last week.",
                        assistant="I'd be happy to help with your return! You can return items within 30 days of purchase. Please visit our Returns Portal with your order number ready.",
                        ground_truth_assistant="Items can be returned within 30 days. Use the Returns Portal.",
                    ),
                    Batch(
                        qa_id="q2",
                        query="What's your phone number for support?",
                        assistant="You can reach our support team at 1-800-EXAMPLE. We're available Monday through Friday, 9 AM to 5 PM EST.",
                        ground_truth_assistant="Support: 1-800-EXAMPLE, Mon-Fri 9-5 EST.",
                    ),
                ],
            )
        ]


# 2. Initialize the judge model
# The API key is read from the OPENAI_API_KEY environment variable.
judge = ChatOpenAI(
    model="gpt-4o-mini",
    api_key=os.getenv("OPENAI_API_KEY"),
    temperature=0.0,
)

# 3. Run Context metric
# Note: the retriever class itself (not an instance) is passed to run().
print("=== Context Evaluation ===")
context_results = Context.run(
    CustomerServiceRetriever,
    model=judge,
    use_structured_output=True,
    verbose=True,
)
for m in context_results:
    print(f"Context Awareness: {m.context_awareness:.2f}")

# 4. Run Conversational metric
print("\n=== Conversational Evaluation ===")
conv_results = Conversational.run(
    CustomerServiceRetriever,
    model=judge,
    use_structured_output=True,
    verbose=True,
)
for m in conv_results:
    print(f"Quality: {m.conversational_quality_maxim.mean:.1f}/10")
    print(f"Sensibleness: {m.conversational_sensibleness.mean:.1f}/10")