Skip to main content

Quickstart

This guide will help you get started with Gaussia and run your first AI evaluation.

Prerequisites

  • Python 3.11 or higher
  • uv (recommended) or pip

Installation

# Install core package
uv add gaussia

# Install with specific metric dependencies
uv add "gaussia[toxicity]"

# Install a judge-model provider (used by the examples below)
uv add langchain-openai

Step 1: Create a Retriever

The first step is to create a retriever that loads your conversation data. A retriever is a class that inherits from Retriever and implements the load_dataset() method.
from gaussia.core.retriever import Retriever
from gaussia.schemas.common import Dataset, Batch

class MyRetriever(Retriever):
    """Retriever that supplies a hand-written example dataset.

    Implements the single hook required by ``Retriever`` —
    ``load_dataset()`` — and returns static demo content.
    """

    def load_dataset(self) -> list[Dataset]:
        """Return one Dataset containing a two-turn support dialogue."""
        turns = [
            Batch(
                qa_id="q1",
                query="What are your return policies?",
                assistant="Our return policy allows returns within 30 days...",
                ground_truth_assistant="Returns are accepted within 30 days with receipt.",
            ),
            Batch(
                qa_id="q2",
                query="How can I track my order?",
                assistant="You can track your order by logging into your account...",
                ground_truth_assistant="Log into your account and visit Order History.",
            ),
        ]
        session = Dataset(
            session_id="evaluation-session-1",
            assistant_id="my-assistant-v1",
            language="english",
            context="You are a helpful customer service assistant.",
            conversation=turns,
        )
        return [session]

Step 2: Run a Metric

Once you have a retriever, you can run any metric. Here’s an example using the Context metric:
from gaussia.metrics.context import Context
from langchain_openai import ChatOpenAI

# Initialize a judge model (any LangChain-compatible model)
judge = ChatOpenAI(model="gpt-4o-mini", temperature=0.0)

# Score every session returned by the retriever with the Context metric
metrics = Context.run(
    MyRetriever,
    model=judge,
    use_structured_output=True,
    verbose=True,
)

# Per-session summary, then one line per interaction
for result in metrics:
    print(f"Session: {result.session_id}  ({result.n_interactions} interactions)")
    print(f"  Context awareness: {result.context_awareness:.2f}")

    for turn in result.interactions:
        if turn.context_awareness >= 0.8:
            status = "✅"
        else:
            status = "❌"
        print(f"  {status} [{turn.qa_id}] {turn.context_awareness:.2f}")

Step 3: Analyze Results

Each metric returns a list of results. The structure depends on the metric type:
# Walk each session-level result, then its per-interaction scores
for session_result in metrics:
    print(f"Session: {session_result.session_id}")
    # context_awareness is reported on a 0-1 scale
    print(f"Score: {session_result.context_awareness}")

    for qa in session_result.interactions:
        print(f"  [{qa.qa_id}] {qa.context_awareness:.2f}")

Complete Example

Here’s a complete example that evaluates an AI assistant using multiple metrics:
import os
from gaussia.metrics.context import Context
from gaussia.metrics.conversational import Conversational
from gaussia.core.retriever import Retriever
from gaussia.schemas.common import Dataset, Batch
from langchain_openai import ChatOpenAI

# 1. Define your retriever
class CustomerServiceRetriever(Retriever):
    """Supplies a fixed e-commerce customer-service session for evaluation."""

    def load_dataset(self) -> list[Dataset]:
        """Return a single two-turn support conversation."""
        conversation = [
            Batch(
                qa_id="q1",
                query="I want to return a product I bought last week.",
                assistant="I'd be happy to help with your return! You can return items within 30 days of purchase. Please visit our Returns Portal with your order number ready.",
                ground_truth_assistant="Items can be returned within 30 days. Use the Returns Portal.",
            ),
            Batch(
                qa_id="q2",
                query="What's your phone number for support?",
                assistant="You can reach our support team at 1-800-EXAMPLE. We're available Monday through Friday, 9 AM to 5 PM EST.",
                ground_truth_assistant="Support: 1-800-EXAMPLE, Mon-Fri 9-5 EST.",
            ),
        ]
        return [
            Dataset(
                session_id="cs-eval-001",
                assistant_id="customer-service-bot",
                language="english",
                context="You are a helpful customer service assistant for an e-commerce store.",
                conversation=conversation,
            )
        ]

# 2. Initialize the judge model
judge_model = ChatOpenAI(
    model="gpt-4o-mini",
    api_key=os.getenv("OPENAI_API_KEY"),
    temperature=0.0,
)

# 3. Run Context metric
print("=== Context Evaluation ===")
for result in Context.run(
    CustomerServiceRetriever,
    model=judge_model,
    use_structured_output=True,
    verbose=True,
):
    print(f"Context Awareness: {result.context_awareness:.2f}")

# 4. Run Conversational metric
print("\n=== Conversational Evaluation ===")
for result in Conversational.run(
    CustomerServiceRetriever,
    model=judge_model,
    use_structured_output=True,
    verbose=True,
):
    print(f"Quality:      {result.conversational_quality_maxim.mean:.1f}/10")
    print(f"Sensibleness: {result.conversational_sensibleness.mean:.1f}/10")

What’s Next?

Metrics Overview

Learn about all available metrics

Generators

Generate synthetic test datasets

Core Concepts

Understand the architecture

Statistical Modes

Frequentist vs Bayesian approaches