Evaluate an Agent

This notebook serves as an end-to-end example of how to trace and evaluate an agent, using a "talk-to-your-data" agent as the running example.

The notebook shows examples of:

  • Manually instrumenting an agent using Phoenix decorators

  • Evaluating function calling accuracy using LLM as a Judge

  • Evaluating function calling accuracy by comparing to ground truth

  • Evaluating SQL query generation

  • Evaluating Python code generation

  • Evaluating the path of an agent

Install Dependencies, Import Libraries, Set API Keys

!pip install -q openai "arize-phoenix>=8.8.0" "arize-phoenix-otel>=0.8.0" openinference-instrumentation-openai python-dotenv duckdb "openinference-instrumentation>=0.1.21"
import dotenv

dotenv.load_dotenv()

import json
import os
from getpass import getpass

import duckdb
import pandas as pd
from IPython.display import Markdown
from openai import OpenAI
from openinference.instrumentation import (
    suppress_tracing,
)
from openinference.instrumentation.openai import OpenAIInstrumentor
from opentelemetry.trace import StatusCode
from pydantic import BaseModel, Field
from tqdm import tqdm

from phoenix.otel import register
if os.getenv("OPENAI_API_KEY") is None:
    os.environ["OPENAI_API_KEY"] = getpass("Enter your OpenAI API key: ")

client = OpenAI()
model = "gpt-4o-mini"
project_name = "talk-to-your-data-agent"

Enable Phoenix Tracing

Sign up for a free instance of Phoenix Cloud to get your API key. If you'd prefer, you can instead self-host Phoenix.

if os.getenv("PHOENIX_API_KEY") is None:
    os.environ["PHOENIX_API_KEY"] = getpass("Enter your Phoenix API key: ")

os.environ["PHOENIX_COLLECTOR_ENDPOINT"] = "https://app.phoenix.arize.com/"
os.environ["PHOENIX_CLIENT_HEADERS"] = f"api_key={os.getenv('PHOENIX_API_KEY')}"
tracer_provider = register(
    project_name=project_name,
)

OpenAIInstrumentor().instrument(tracer_provider=tracer_provider)

tracer = tracer_provider.get_tracer(__name__)

Prepare dataset

Your agent will interact with a local database. Start by loading in that data:

store_sales_df = pd.read_parquet(
    "https://storage.googleapis.com/arize-phoenix-assets/datasets/unstructured/llm/llama-index/Store_Sales_Price_Elasticity_Promotions_Data.parquet"
)
store_sales_df.head()

Define the tools

Now you can define your agent tools.

Tool 1: Database Lookup

SQL_GENERATION_PROMPT = """
Generate an SQL query based on a prompt. Do not reply with anything besides the SQL query.
The prompt is: {prompt}

The available columns are: {columns}
The table name is: {table_name}
"""


def generate_sql_query(prompt: str, columns: list, table_name: str) -> str:
    """Generate an SQL query based on a prompt"""
    formatted_prompt = SQL_GENERATION_PROMPT.format(
        prompt=prompt, columns=columns, table_name=table_name
    )

    response = client.chat.completions.create(
        model=model,
        messages=[{"role": "user", "content": formatted_prompt}],
    )

    return response.choices[0].message.content


@tracer.tool()
def lookup_sales_data(prompt: str) -> str:
    """Implementation of sales data lookup from parquet file using SQL"""
    try:
        table_name = "sales"
        # Read the parquet file into a DuckDB table
        duckdb.sql(f"CREATE TABLE IF NOT EXISTS {table_name} AS SELECT * FROM store_sales_df")

        print(store_sales_df.columns)
        print(table_name)
        sql_query = generate_sql_query(prompt, store_sales_df.columns, table_name)
        sql_query = sql_query.strip()
        sql_query = sql_query.replace("```sql", "").replace("```", "")

        with tracer.start_as_current_span(
            "execute_sql_query", openinference_span_kind="chain"
        ) as span:
            span.set_input(value=sql_query)

            # Execute the SQL query
            result = duckdb.sql(sql_query).df()
            span.set_output(value=str(result))
            span.set_status(StatusCode.OK)
        return result.to_string()
    except Exception as e:
        return f"Error accessing data: {str(e)}"
example_data = lookup_sales_data("Show me all the sales for store 1320 on November 1st, 2021")
example_data

Tool 2: Data Visualization

class VisualizationConfig(BaseModel):
    chart_type: str = Field(..., description="Type of chart to generate")
    x_axis: str = Field(..., description="Name of the x-axis column")
    y_axis: str = Field(..., description="Name of the y-axis column")
    title: str = Field(..., description="Title of the chart")


@tracer.chain()
def extract_chart_config(data: str, visualization_goal: str) -> dict:
    """Generate chart visualization configuration

    Args:
        data: String containing the data to visualize
        visualization_goal: Description of what the visualization should show

    Returns:
        Dictionary containing line chart configuration
    """
    prompt = f"""Generate a chart configuration based on this data: {data}
    The goal is to show: {visualization_goal}"""

    response = client.beta.chat.completions.parse(
        model=model,
        messages=[{"role": "user", "content": prompt}],
        response_format=VisualizationConfig,
    )

    try:
        # Extract the parsed chart config (a VisualizationConfig instance) from the structured response
        content = response.choices[0].message.parsed

        # Return structured chart config
        return {
            "chart_type": content.chart_type,
            "x_axis": content.x_axis,
            "y_axis": content.y_axis,
            "title": content.title,
            "data": data,
        }
    except Exception:
        return {
            "chart_type": "line",
            "x_axis": "date",
            "y_axis": "value",
            "title": visualization_goal,
            "data": data,
        }


@tracer.chain()
def create_chart(config: VisualizationConfig) -> str:
    """Create a chart based on the configuration"""
    prompt = f"""Write python code to create a chart based on the following configuration.
    Only return the code, no other text.
    config: {config}"""

    response = client.chat.completions.create(
        model=model,
        messages=[{"role": "user", "content": prompt}],
    )

    code = response.choices[0].message.content
    code = code.replace("```python", "").replace("```", "")
    code = code.strip()

    return code


@tracer.tool()
def generate_visualization(data: str, visualization_goal: str) -> str:
    """Generate a visualization based on the data and goal"""
    config = extract_chart_config(data, visualization_goal)
    code = create_chart(config)
    return code
# code = generate_visualization(example_data, "A line chart of sales over each day in november.")
@tracer.tool()
def run_python_code(code: str) -> str:
    """Execute Python code in a restricted environment"""
    # Create restricted globals/locals dictionaries with plotting libraries
    restricted_globals = {
        "__builtins__": {
            "print": print,
            "len": len,
            "range": range,
            "sum": sum,
            "min": min,
            "max": max,
            "int": int,
            "float": float,
            "str": str,
            "list": list,
            "dict": dict,
            "tuple": tuple,
            "set": set,
            "round": round,
            "__import__": __import__,
            "json": __import__("json"),
        },
        "plt": __import__("matplotlib.pyplot"),
        "pd": __import__("pandas"),
        "np": __import__("numpy"),
        "sns": __import__("seaborn"),
    }

    try:
        # Execute code in restricted environment
        exec_locals = {}
        exec(code, restricted_globals, exec_locals)

        # If the executed code bound a `plt` object (e.g. via `import matplotlib.pyplot as plt`),
        # return it so the caller can work with the figure
        if "plt" in exec_locals:
            return exec_locals["plt"]

        return "Code executed successfully"

    except Exception as e:
        return f"Error executing code: {str(e)}"

Tool 3: Data Analysis

@tracer.tool()
def analyze_sales_data(prompt: str, data: str) -> str:
    """Implementation of AI-powered sales data analysis"""
    # Construct prompt based on analysis type and data subset
    prompt = f"""Analyze the following data: {data}
    Your job is to answer the following question: {prompt}"""

    response = client.chat.completions.create(
        model=model,
        messages=[{"role": "user", "content": prompt}],
    )

    analysis = response.choices[0].message.content
    return analysis if analysis else "No analysis could be generated"
# analysis = analyze_sales_data("What is the most popular product SKU?", example_data)
# analysis

Tool Schema:

You'll need to pass your tool descriptions into your agent router. The following code allows you to easily do so:

# Define tools/functions that can be called by the model
tools = [
    {
        "type": "function",
        "function": {
            "name": "lookup_sales_data",
            "description": "Look up data from Store Sales Price Elasticity Promotions dataset",
            "parameters": {
                "type": "object",
                "properties": {
                    "prompt": {
                        "type": "string",
                        "description": "The unchanged prompt that the user provided.",
                    }
                },
                "required": ["prompt"],
            },
        },
    },
    {
        "type": "function",
        "function": {
            "name": "analyze_sales_data",
            "description": "Analyze sales data to extract insights",
            "parameters": {
                "type": "object",
                "properties": {
                    "data": {
                        "type": "string",
                        "description": "The lookup_sales_data tool's output.",
                    },
                    "prompt": {
                        "type": "string",
                        "description": "The unchanged prompt that the user provided.",
                    },
                },
                "required": ["data", "prompt"],
            },
        },
    },
    {
        "type": "function",
        "function": {
            "name": "generate_visualization",
            "description": "Generate Python code to create data visualizations",
            "parameters": {
                "type": "object",
                "properties": {
                    "data": {
                        "type": "string",
                        "description": "The lookup_sales_data tool's output.",
                    },
                    "visualization_goal": {
                        "type": "string",
                        "description": "The goal of the visualization.",
                    },
                },
                "required": ["data", "visualization_goal"],
            },
        },
    },
    # {
    #     "type": "function",
    #     "function": {
    #         "name": "run_python_code",
    #         "description": "Run Python code in a restricted environment",
    #         "parameters": {
    #             "type": "object",
    #             "properties": {
    #                 "code": {"type": "string", "description": "The Python code to run."}
    #             },
    #             "required": ["code"]
    #         }
    #     }
    # }
]

# Dictionary mapping function names to their implementations
tool_implementations = {
    "lookup_sales_data": lookup_sales_data,
    "analyze_sales_data": analyze_sales_data,
    "generate_visualization": generate_visualization,
    # "run_python_code": run_python_code
}

Agent logic

With the tools defined, you're ready to define the main routing and tool call handling steps of your agent.

@tracer.chain()
def handle_tool_calls(tool_calls, messages):
    for tool_call in tool_calls:
        function = tool_implementations[tool_call.function.name]
        function_args = json.loads(tool_call.function.arguments)
        result = function(**function_args)

        messages.append({"role": "tool", "content": result, "tool_call_id": tool_call.id})
    return messages
def start_main_span(messages):
    print("Starting main span with messages:", messages)

    with tracer.start_as_current_span("AgentRun", openinference_span_kind="agent") as span:
        span.set_input(value=messages)
        ret = run_agent(messages)
        print("Main span completed with return value:", ret)
        span.set_output(value=ret)
        span.set_status(StatusCode.OK)
        return ret


def run_agent(messages):
    print("Running agent with messages:", messages)
    if isinstance(messages, str):
        messages = [{"role": "user", "content": messages}]
        print("Converted string message to list format")

    # Check and add system prompt if needed
    if not any(
        isinstance(message, dict) and message.get("role") == "system" for message in messages
    ):
        system_prompt = {
            "role": "system",
            "content": "You are a helpful assistant that can answer questions about the Store Sales Price Elasticity Promotions dataset.",
        }
        messages.append(system_prompt)
        print("Added system prompt to messages")

    while True:
        # Router call span
        print("Starting router call span")
        with tracer.start_as_current_span(
            "router_call",
            openinference_span_kind="chain",
        ) as span:
            span.set_input(value=messages)

            response = client.chat.completions.create(
                model=model,
                messages=messages,
                tools=tools,
            )

            messages.append(response.choices[0].message.model_dump())
            tool_calls = response.choices[0].message.tool_calls
            print("Received response with tool calls:", bool(tool_calls))
            span.set_status(StatusCode.OK)

            if tool_calls:
                # Tool calls span
                print("Processing tool calls")
                messages = handle_tool_calls(tool_calls, messages)
                span.set_output(value=tool_calls)
            else:
                print("No tool calls, returning final response")
                span.set_output(value=response.choices[0].message.content)

                return response.choices[0].message.content

Run the agent

Your agent is now good to go! Let's try it out with some example questions:

ret = start_main_span([{"role": "user", "content": "Create a line chart showing sales in 2021"}])
print(Markdown(ret))
agent_questions = [
    "What was the most popular product SKU?",
    "What was the total revenue across all stores?",
    "Which store had the highest sales volume?",
    "Create a bar chart showing total sales by store",
    "What percentage of items were sold on promotion?",
    "Plot daily sales volume over time",
    "What was the average transaction value?",
    "Create a box plot of transaction values",
    "Which products were frequently purchased together?",
    "Plot a line graph showing the sales trend over time with a 7-day moving average",
]

for question in tqdm(agent_questions, desc="Processing questions"):
    try:
        ret = start_main_span([{"role": "user", "content": question}])
    except Exception as e:
        print(f"Error processing question: {question}")
        print(e)
        continue

Evaluating the agent

So your agent looks like it's working, but how can you measure its performance?

OpenAIInstrumentor().uninstrument()  # Uninstrument the OpenAI client so LLM-as-a-Judge evaluation calls aren't captured in the same project.
import nest_asyncio

import phoenix as px
from phoenix.evals import TOOL_CALLING_PROMPT_TEMPLATE, OpenAIModel, llm_classify
from phoenix.experiments import evaluate_experiment, run_experiment
from phoenix.experiments.evaluators import create_evaluator
from phoenix.experiments.types import Example
from phoenix.trace import SpanEvaluations
from phoenix.trace.dsl import SpanQuery

nest_asyncio.apply()
px_client = px.Client()
eval_model = OpenAIModel(model="gpt-4o-mini")

Function Calling Evals using LLM as a Judge

This first evaluation uses another LLM as a judge to grade your agent's router choices.

It follows a standard pattern:

  1. Export traces from Phoenix

  2. Prepare those exported traces in a dataframe with the correct columns

  3. Use llm_classify to run a standard template across each row of that dataframe and produce an eval label

  4. Upload the results back into Phoenix

query = (
    SpanQuery()
    .where(
        "span_kind == 'LLM'",
    )
    .select(question="input.value", output_messages="llm.output_messages")
)

# The Phoenix Client can take this query and return the dataframe.
tool_calls_df = px.Client().query_spans(query, project_name=project_name, timeout=None)
tool_calls_df.dropna(subset=["output_messages"], inplace=True)


def get_tool_call(outputs):
    if outputs[0].get("message").get("tool_calls"):
        return (
            outputs[0]
            .get("message")
            .get("tool_calls")[0]
            .get("tool_call")
            .get("function")
            .get("name")
        )
    else:
        return "No tool used"


tool_calls_df["tool_call"] = tool_calls_df["output_messages"].apply(get_tool_call)
tool_calls_df.head()
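
The get_tool_call helper above assumes the nested structure Phoenix exports for the llm.output_messages span attribute. A hypothetical entry that mirrors only the keys the helper reads looks like this:

# Hypothetical structure, mirroring only the keys get_tool_call accesses:
example_output_messages = [
    {
        "message": {
            "role": "assistant",
            "tool_calls": [{"tool_call": {"function": {"name": "lookup_sales_data"}}}],
        }
    }
]
assert get_tool_call(example_output_messages) == "lookup_sales_data"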
tool_call_eval = llm_classify(
    dataframe=tool_calls_df,
    template=TOOL_CALLING_PROMPT_TEMPLATE.template.replace(
        "{tool_definitions}",
        "generate_visualization, lookup_sales_data, analyze_sales_data, run_python_code",
    ),
    rails=["correct", "incorrect"],
    model=eval_model,
    provide_explanation=True,
)

tool_call_eval["score"] = tool_call_eval.apply(
    lambda x: 1 if x["label"] == "correct" else 0, axis=1
)

tool_call_eval.head()
px.Client().log_evaluations(
    SpanEvaluations(eval_name="Tool Calling Eval", dataframe=tool_call_eval),
)

You should now see eval labels in Phoenix.

Function Calling Evals using Ground Truth

The above example works; however, if you have ground-truth labeled data, you can use it to get an even more accurate measure of your router's performance by running an experiment.

Experiments also follow a standard step-by-step process in Phoenix:

  1. Create a dataset of test cases, and optionally, expected outputs

  2. Create a task to run on each test case - usually this invokes your agent or a specific step of it

  3. Create evaluator(s) to run on each output of your task

  4. Visualize results in Phoenix

import uuid

id = str(uuid.uuid4())

agent_tool_responses = {
    "What was the most popular product SKU?": "lookup_sales_data, analyze_sales_data",
    "What was the total revenue across all stores?": "lookup_sales_data, analyze_sales_data",
    "Which store had the highest sales volume?": "lookup_sales_data, analyze_sales_data",
    "Create a bar chart showing total sales by store": "generate_visualization, lookup_sales_data, run_python_code",
    "What percentage of items were sold on promotion?": "lookup_sales_data, analyze_sales_data",
    "Plot daily sales volume over time": "generate_visualization, lookup_sales_data, run_python_code",
    "What was the average transaction value?": "lookup_sales_data, analyze_sales_data",
    "Create a box plot of transaction values": "generate_visualization, lookup_sales_data, run_python_code",
    "Which products were frequently purchased together?": "lookup_sales_data, analyze_sales_data",
    "Plot a line graph showing the sales trend over time with a 7-day moving average": "generate_visualization, lookup_sales_data, run_python_code",
}


tool_calling_df = pd.DataFrame(agent_tool_responses.items(), columns=["question", "tool_calls"])
dataset = px_client.upload_dataset(
    dataframe=tool_calling_df,
    dataset_name=f"tool_calling_ground_truth_{id}",
    input_keys=["question"],
    output_keys=["tool_calls"],
)

For your task, you can run just the router call of your agent:

def run_router_step(example: Example) -> str:
    messages = [
        {
            "role": "system",
            "content": "You are a helpful assistant that can answer questions about the Store Sales Price Elasticity Promotions dataset.",
        }
    ]
    messages.append({"role": "user", "content": example.input.get("question")})

    response = client.chat.completions.create(
        model=model,
        messages=messages,
        tools=tools,
    )
    tool_calls = []
    for tool_call in response.choices[0].message.tool_calls:
        tool_calls.append(tool_call.function.name)
    return tool_calls

Your evaluator can also be simple, since you have expected outputs. If you didn't have those expected outputs, you could instead use an LLM as a Judge here, or even basic code:

def tools_match(expected: str, output: str) -> bool:
    expected_tools = expected.get("tool_calls").split(", ")
    return expected_tools == output
experiment = run_experiment(
    dataset,
    run_router_step,
    evaluators=[tools_match],
    experiment_name="Tool Calling Eval",
    experiment_description="Evaluating the tool calling step of the agent",
)
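
Note that tools_match requires an exact, order-sensitive match against the comma-separated ground truth. If you only care about which tools were called, not the order, a set-based variant is a reasonable alternative (a minimal sketch, not part of the original notebook):

def tools_match_unordered(expected, output) -> bool:
    """Order-insensitive variant: checks only which tools were called."""
    expected_tools = set(expected.get("tool_calls").split(", "))
    return expected_tools == set(output)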

Tool Evals

The next piece of your agent to evaluate is its tools. Each tool is usually evaluated differently - we've included some examples below. If you need other ideas, Phoenix's built-in evaluators can give you a sense of other metrics to use.

Evaluating our SQL generation tool

# Ideally, replace these generated examples with a human-annotated set of ground-truth data

db_lookup_questions = [
    "What was the most popular product SKU?",
    "Which store had the highest total sales value?",
    "How many items were sold on promotion?",
    "What was the average quantity sold per transaction?",
    "Which product class code generated the most revenue?",
    "What day of the week had the highest sales volume?",
    "How many unique stores made sales?",
    "What was the highest single transaction value?",
    "Which products were frequently sold together?",
    "What's the trend in sales over time?",
]

expected_results = []

# Iterate over a copy so questions can be removed from the original list safely on error
for question in tqdm(db_lookup_questions[:], desc="Processing SQL lookup questions"):
    try:
        with suppress_tracing():
            expected_results.append(lookup_sales_data(question))
    except Exception as e:
        print(f"Error processing question: {question}")
        print(e)
        db_lookup_questions.remove(question)

# Create a DataFrame with the questions
questions_df = pd.DataFrame({"question": db_lookup_questions, "expected_result": expected_results})

display(questions_df)
dataset = px_client.upload_dataset(
    dataframe=questions_df,
    dataset_name=f"sales_db_lookup_questions_{id}",
    input_keys=["question"],
    output_keys=["expected_result"],
)
def run_sql_query(example: Example) -> str:
    with suppress_tracing():
        return lookup_sales_data(example.input.get("question"))
def evaluate_sql_result(output: str, expected: str) -> bool:
    # Extract just the numbers from both strings
    result_nums = "".join(filter(str.isdigit, output))
    expected_nums = "".join(filter(str.isdigit, expected.get("expected_result")))
    return result_nums == expected_nums
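
The comparison above strips everything except digits, so differences in whitespace, column names, or float formatting between the two result strings don't cause false mismatches. For example:

# Illustrative string (not real query output): only the digits survive.
sample = "   Total_Revenue\n0   1.327264e+07"
print("".join(filter(str.isdigit, sample)))  # -> 0132726407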
experiment = run_experiment(
    dataset,
    run_sql_query,
    evaluators=[evaluate_sql_result],
    experiment_name="SQL Query Eval",
    experiment_description="Evaluating the SQL query generation step of the agent",
)

Evaluating our Python code generation tool

# Ideally, replace these generated examples with a human-annotated set of ground-truth data

code_generation_questions = [
    "Create a bar chart showing total sales by store",
    "Plot daily sales volume over time",
    "Plot a line graph showing the sales trend over time with a 7-day moving average",
    "Create a histogram of quantities sold per transaction",
    "Generate a pie chart showing sales distribution across product classes",
    "Create a stacked bar chart showing promotional vs non-promotional sales by store",
    "Generate a heatmap of sales by day of week and store number",
    "Plot a line chart comparing sales trends between top 5 stores",
]

example_data = []
chart_configs = []
for question in tqdm(code_generation_questions[:], desc="Processing code generation questions"):
    try:
        with suppress_tracing():
            example_data.append(lookup_sales_data(question))
            chart_configs.append(json.dumps(extract_chart_config(example_data[-1], question)))
    except Exception as e:
        print(f"Error processing question: {question}")
        print(e)
        code_generation_questions.remove(question)

code_generation_df = pd.DataFrame(
    {
        "question": code_generation_questions,
        "example_data": example_data,
        "chart_configs": chart_configs,
    }
)

dataset = px_client.upload_dataset(
    dataframe=code_generation_df,
    dataset_name=f"code_generation_questions_{id}",
    input_keys=["question", "example_data", "chart_configs"],
)
def run_code_generation(example: Example) -> str:
    with suppress_tracing():
        chart_config = extract_chart_config(
            data=example.input.get("example_data"), visualization_goal=example.input.get("question")
        )
        code = generate_visualization(
            visualization_goal=example.input.get("question"), data=example.input.get("example_data")
        )

    return {"code": code, "chart_config": chart_config}

In this case, you don't have ground truth data to compare against. Instead, you can use a simple code evaluator: try to run the generated code and catch any errors.

def code_is_runnable(output: str) -> bool:
    """Check if the code is runnable"""
    output = output.get("code")
    output = output.strip()
    output = output.replace("```python", "").replace("```", "")
    try:
        exec(output)
        return True
    except Exception:
        return False
def evaluate_chart_config(output: str, expected: str) -> bool:
    return output.get("chart_config") == expected.get("chart_config")
experiment = run_experiment(
    dataset,
    run_code_generation,
    evaluators=[code_is_runnable, evaluate_chart_config],
    experiment_name="Code Generation Eval",
    experiment_description="Evaluating the code generation step of the agent",
)

Evaluating the agent path and convergence

Finally, the last piece of your agent to evaluate is its path. Evaluating the path helps you understand how efficiently your agent executes: does it call the same tool multiple times? Does it skip steps it shouldn't and have to backtrack later? Convergence, or path, evals can tell you this.

Convergence evals operate slightly differently. The one you'll use below relies on knowing the minimum number of steps the agent needs for a given type of query. Instead of attaching an evaluator up front, you'll run the experiment first and then, after it completes, attach a second evaluator to calculate convergence. A run's convergence score is the optimal path length divided by that run's path length; for example, if the optimal path is 5 messages and a run takes 10, it scores 0.5.

The workflow is as follows:

  1. Create a dataset of the same type of question, phrased in different ways - the agent should take the same path for each, but you'll often find it doesn't.

  2. Create a task that runs the agent on each question, while tracking the number of steps it takes.

  3. Run the experiment without an evaluator.

  4. Calculate the minimum number of steps taken to complete the task.

  5. Create an evaluator that compares the number of steps each run takes against that minimum.

  6. Run this evaluator on your experiment from step 3.

  7. View your results in Phoenix

# Ideally, replace these generated examples with a human-annotated set of ground-truth data

convergence_questions = [
    "What was the average quantity sold per transaction?",
    "What is the mean number of items per sale?",
    "Calculate the typical quantity per transaction",
    "Show me the average number of units sold in each transaction",
    "What's the mean transaction size in terms of quantity?",
    "On average, how many items were purchased per transaction?",
    "What is the average basket size per sale?",
    "Calculate the mean number of products per purchase",
    "What's the typical number of units per order?",
    "Find the average quantity of items in each transaction",
    "What is the average number of products bought per purchase?",
    "Tell me the mean quantity of items in a typical transaction",
    "How many items does a customer buy on average per transaction?",
    "What's the usual number of units in each sale?",
    "Calculate the average basket quantity per order",
    "What is the typical amount of products per transaction?",
    "Show the mean number of items customers purchase per visit",
    "What's the average quantity of units per shopping trip?",
    "How many products do customers typically buy in one transaction?",
    "What is the standard basket size in terms of quantity?",
]

convergence_df = pd.DataFrame({"question": convergence_questions})

dataset = px_client.upload_dataset(
    dataframe=convergence_df, dataset_name="convergence_questions", input_keys=["question"]
)
def format_message_steps(messages):
    """
    Convert a list of message objects into a readable format that shows the steps taken.

    Args:
        messages (list): A list of message objects containing role, content, tool calls, etc.

    Returns:
        str: A readable string showing the steps taken.
    """
    steps = []
    for message in messages:
        role = message.get("role")
        if role == "user":
            steps.append(f"User: {message.get('content')}")
        elif role == "system":
            steps.append("System: Provided context")
        elif role == "assistant":
            if message.get("tool_calls"):
                for tool_call in message["tool_calls"]:
                    tool_name = tool_call["function"]["name"]
                    steps.append(f"Assistant: Called tool '{tool_name}'")
            else:
                steps.append(f"Assistant: {message.get('content')}")
        elif role == "tool":
            steps.append(f"Tool response: {message.get('content')}")

    return "\n".join(steps)
def run_agent_and_track_path(example: Example) -> str:
    print("Starting main span with messages:", example.input.get("question"))
    messages = [{"role": "user", "content": example.input.get("question")}]
    ret = run_agent_messages(messages)
    return {"path_length": len(ret), "messages": format_message_steps(ret)}


def run_agent_messages(messages):
    print("Running agent with messages:", messages)
    if isinstance(messages, str):
        messages = [{"role": "user", "content": messages}]
        print("Converted string message to list format")

    # Check and add system prompt if needed
    if not any(
        isinstance(message, dict) and message.get("role") == "system" for message in messages
    ):
        system_prompt = {
            "role": "system",
            "content": "You are a helpful assistant that can answer questions about the Store Sales Price Elasticity Promotions dataset.",
        }
        messages.append(system_prompt)
        print("Added system prompt to messages")

    while True:
        # Router call span
        print("Starting router")

        response = client.chat.completions.create(
            model=model,
            messages=messages,
            tools=tools,
        )

        messages.append(response.choices[0].message.model_dump())
        tool_calls = response.choices[0].message.tool_calls
        print("Received response with tool calls:", bool(tool_calls))

        if tool_calls:
            # Tool calls span
            print("Processing tool calls")
            tool_calls = response.choices[0].message.tool_calls
            messages = handle_tool_calls(tool_calls, messages)
        else:
            print("No tool calls, returning final response")
            return messages
experiment = run_experiment(
    dataset,
    run_agent_and_track_path,
    experiment_name="Convergence Eval",
    experiment_description="Evaluating the convergence of the agent",
)
experiment.as_dataframe()
outputs = experiment.as_dataframe()["output"].to_dict().values()
optimal_path_length = min(
    output.get("path_length")
    for output in outputs
    if output and output.get("path_length") is not None
)
print(f"The optimal path length is {optimal_path_length}")
@create_evaluator(name="Convergence Eval", kind="CODE")
def evaluate_path_length(output: str) -> float:
    if output and output.get("path_length"):
        return optimal_path_length / float(output.get("path_length"))
    else:
        return 0
experiment = evaluate_experiment(experiment, evaluators=[evaluate_path_length])

Advanced - Combining all the evals into our experiment

As an optional final step, you can combine all the evaluators and experiments above into a single experiment. This requires some more advanced data wrangling, but gives you a single report on your agent's performance.

Build a version of our agent that tracks all the necessary information for evals

def process_messages(messages):
    tool_calls = []
    tool_responses = []
    final_output = None

    for i, message in enumerate(messages):
        # Extract tool calls
        if "tool_calls" in message and message["tool_calls"]:
            for tool_call in message["tool_calls"]:
                tool_name = tool_call["function"]["name"]
                tool_input = tool_call["function"]["arguments"]
                tool_calls.append(tool_name)

                # Prepare tool response structure with tool name, input, and call id
                tool_responses.append(
                    {
                        "tool_name": tool_name,
                        "tool_input": tool_input,
                        "tool_call_id": tool_call["id"],
                        "tool_response": None,
                    }
                )

        # Extract tool responses and match each one to its originating tool call
        if message["role"] == "tool" and "tool_call_id" in message:
            for tool_response in tool_responses:
                if tool_response.get("tool_call_id") == message["tool_call_id"]:
                    tool_response["tool_response"] = message["content"]

        # Extract final output
        if (
            message["role"] == "assistant"
            and not message.get("tool_calls")
            and not message.get("function_call")
        ):
            final_output = message["content"]

    result = {
        "tool_calls": tool_calls,
        "tool_responses": tool_responses,
        "final_output": final_output,
        "unchanged_messages": messages,
        "path_length": len(messages),
    }

    return result
def run_agent_and_track_path_combined(example: Example) -> str:
    print("Starting main span with messages:", example.input.get("question"))
    messages = [{"role": "user", "content": example.input.get("question")}]
    ret = run_agent_messages_combined(messages)
    return process_messages(ret)


def run_agent_messages_combined(messages):
    print("Running agent with messages:", messages)
    if isinstance(messages, str):
        messages = [{"role": "user", "content": messages}]
        print("Converted string message to list format")

    # Check and add system prompt if needed
    if not any(
        isinstance(message, dict) and message.get("role") == "system" for message in messages
    ):
        system_prompt = {
            "role": "system",
            "content": "You are a helpful assistant that can answer questions about the Store Sales Price Elasticity Promotions dataset.",
        }
        messages.append(system_prompt)
        print("Added system prompt to messages")

    while True:
        # Router call span
        print("Starting router")

        response = client.chat.completions.create(
            model=model,
            messages=messages,
            tools=tools,
        )

        messages.append(response.choices[0].message.model_dump())
        tool_calls = response.choices[0].message.tool_calls
        print("Received response with tool calls:", bool(tool_calls))

        if tool_calls:
            # Tool calls span
            print("Processing tool calls")
            tool_calls = response.choices[0].message.tool_calls
            messages = handle_tool_calls(tool_calls, messages)
        else:
            print("No tool calls, returning final response")
            return messages
generate_sql_query("What was the most popular product SKU?", store_sales_df.columns, "sales")
overall_experiment_questions = [
    {
        "question": "What was the most popular product SKU?",
        "sql_result": "   SKU_Coded  Total_Qty_Sold 0    6200700         52262.0",
    },
    {
        "question": "What was the total revenue across all stores?",
        "sql_result": "   Total_Revenue 0   1.327264e+07",
    },
    {
        "question": "Which store had the highest sales volume?",
        "sql_result": "   Store_Number  Total_Sales_Volume 0          2970             59322.0",
    },
    {
        "question": "Create a bar chart showing total sales by store",
        "sql_result": "    Store_Number    Total_Sales 0            880  420302.088397 1           1650  580443.007953 2           4180  272208.118542 3            550  229727.498752 4           1100  497509.528013 5           3300  619660.167018 6           3190  335035.018792 7           2970  836341.327191 8           3740  359729.808228 9           2530  324046.518720 10          4400   95745.620250 11          1210  508393.767785 12           330  370503.687331 13          2750  453664.808068 14          1980  242290.828499 15          1760  350747.617798 16          3410  410567.848126 17           990  378433.018639 18          4730  239711.708869 19          4070  322307.968330 20          3080  495458.238811 21          2090  309996.247965 22          1320  592832.067579 23          2640  308990.318559 24          1540  427777.427815 25          4840  389056.668316 26          2860  132320.519487 27          2420  406715.767402 28           770  292968.918642 29          3520  145701.079372 30           660  343594.978075 31          3630  405034.547846 32          2310  412579.388504 33          2200  361173.288199 34          1870  401070.997685",
    },
    {
        "question": "What percentage of items were sold on promotion?",
        "sql_result": "   Promotion_Percentage 0              0.625596",
    },
    {
        "question": "What was the average transaction value?",
        "sql_result": "   Average_Transaction_Value 0                  19.018132",
    },
    {
        "question": "Create a line chart showing sales in 2021",
        "sql_result": "  sale_month  total_quantity_sold  total_sales_value 0 2021-11-01              43056.0      499984.428193 1 2021-12-01              75724.0      910982.118423",
    },
]

for question_dict in overall_experiment_questions:
    question_dict["sql_generated"] = generate_sql_query(
        question_dict["question"], store_sales_df.columns, "sales"
    )

print(overall_experiment_questions[6])

overall_experiment_df = pd.DataFrame(overall_experiment_questions)

dataset = px_client.upload_dataset(
    dataframe=overall_experiment_df,
    dataset_name="overall_experiment_questions_all",
    input_keys=["question"],
    output_keys=["sql_result"],
)

The populated questions look like the following (your generated SQL may vary slightly):
[
    {
        "question": "What was the most popular product SKU?",
        "sql_result": "   SKU_Coded  Total_Qty_Sold 0    6200700         52262.0",
        "sql_generated": "```sql\nSELECT SKU_Coded, SUM(Qty_Sold) AS Total_Qty_Sold\nFROM sales\nGROUP BY SKU_Coded\nORDER BY Total_Qty_Sold DESC\nLIMIT 1;\n```",
    },
    {
        "question": "What was the total revenue across all stores?",
        "sql_result": "   Total_Revenue 0   1.327264e+07",
        "sql_generated": "```sql\nSELECT SUM(Total_Sale_Value) AS Total_Revenue\nFROM sales;\n```",
    },
    {
        "question": "Which store had the highest sales volume?",
        "sql_result": "   Store_Number  Total_Sales_Volume 0          2970             59322.0",
        "sql_generated": "```sql\nSELECT Store_Number, SUM(Total_Sale_Value) AS Total_Sales_Volume\nFROM sales\nGROUP BY Store_Number\nORDER BY Total_Sales_Volume DESC\nLIMIT 1;\n```",
    },
    {
        "question": "Create a bar chart showing total sales by store",
        "sql_result": "    Store_Number    Total_Sales 0            880  420302.088397 1           1650  580443.007953 2           4180  272208.118542 3            550  229727.498752 4           1100  497509.528013 5           3300  619660.167018 6           3190  335035.018792 7           2970  836341.327191 8           3740  359729.808228 9           2530  324046.518720 10          4400   95745.620250 11          1210  508393.767785 12           330  370503.687331 13          2750  453664.808068 14          1980  242290.828499 15          1760  350747.617798 16          3410  410567.848126 17           990  378433.018639 18          4730  239711.708869 19          4070  322307.968330 20          3080  495458.238811 21          2090  309996.247965 22          1320  592832.067579 23          2640  308990.318559 24          1540  427777.427815 25          4840  389056.668316 26          2860  132320.519487 27          2420  406715.767402 28           770  292968.918642 29          3520  145701.079372 30           660  343594.978075 31          3630  405034.547846 32          2310  412579.388504 33          2200  361173.288199 34          1870  401070.997685",
        "sql_generated": "```sql\nSELECT Store_Number, SUM(Total_Sale_Value) AS Total_Sales\nFROM sales\nGROUP BY Store_Number;\n```",
    },
    {
        "question": "What percentage of items were sold on promotion?",
        "sql_result": "   Promotion_Percentage 0              0.625596",
        "sql_generated": "```sql\nSELECT \n    (SUM(CASE WHEN On_Promo = 'Yes' THEN 1 ELSE 0 END) * 100.0) / COUNT(*) AS Promotion_Percentage\nFROM \n    sales;\n```",
    },
    {
        "question": "What was the average transaction value?",
        "sql_result": "   Average_Transaction_Value 0                  19.018132",
        "sql_generated": "```sql\nSELECT AVG(Total_Sale_Value) AS Average_Transaction_Value\nFROM sales;\n```",
    },
    {
        "question": "Create a line chart showing sales in 2021",
        "sql_result": "  sale_month  total_quantity_sold  total_sales_value 0 2021-11-01              43056.0      499984.428193 1 2021-12-01              75724.0      910982.118423",
        "sql_generated": "```sql\nSELECT MONTH(Sold_Date) AS Month, SUM(Total_Sale_Value) AS Total_Sales\nFROM sales\nWHERE YEAR(Sold_Date) = 2021\nGROUP BY MONTH(Sold_Date)\nORDER BY MONTH(Sold_Date);\n```",
    },
]
CLARITY_LLM_JUDGE_PROMPT = """
In this task, you will be presented with a query and an answer. Your objective is to evaluate the clarity
of the answer in addressing the query. A clear response is one that is precise, coherent, and directly
addresses the query without introducing unnecessary complexity or ambiguity. An unclear response is one
that is vague, disorganized, or difficult to understand, even if it may be factually correct.

Your response should be a single word: either "clear" or "unclear," and it should not include any other
text or characters. "clear" indicates that the answer is well-structured, easy to understand, and
appropriately addresses the query. "unclear" indicates that the answer is ambiguous, poorly organized, or
not effectively communicated. Please carefully consider the query and answer before determining your
response.

After analyzing the query and the answer, you must write a detailed explanation of your reasoning to
justify why you chose either "clear" or "unclear." Avoid stating the final label at the beginning of your
explanation. Your reasoning should include specific points about how the answer does or does not meet the
criteria for clarity.

[BEGIN DATA]
Query: {query}
Answer: {response}
[END DATA]
Please analyze the data carefully and provide an explanation followed by your response.

EXPLANATION: Provide your reasoning step by step, evaluating the clarity of the answer based on the query.
LABEL: "clear" or "unclear"
"""

ENTITY_CORRECTNESS_LLM_JUDGE_PROMPT = """
In this task, you will be presented with a query and an answer. Your objective is to determine whether all
the entities mentioned in the answer are correctly identified and accurately match those in the query. An
entity refers to any specific person, place, organization, date, or other proper noun. Your evaluation
should focus on whether the entities in the answer are correctly named and appropriately associated with
the context in the query.

Your response should be a single word: either "correct" or "incorrect," and it should not include any
other text or characters. "correct" indicates that all entities mentioned in the answer match those in the
query and are properly identified. "incorrect" indicates that the answer contains errors or mismatches in
the entities referenced compared to the query.

After analyzing the query and the answer, you must write a detailed explanation of your reasoning to
justify why you chose either "correct" or "incorrect." Avoid stating the final label at the beginning of
your explanation. Your reasoning should include specific points about how the entities in the answer do or
do not match the entities in the query.

[BEGIN DATA]
Query: {query}
Answer: {response}
[END DATA]
Please analyze the data carefully and provide an explanation followed by your response.

EXPLANATION: Provide your reasoning step by step, evaluating whether the entities in the answer are
correct and consistent with the query.
LABEL: "correct" or "incorrect"
"""
# Preview the tool calling judge template with the tool definitions substituted in
TOOL_CALLING_PROMPT_TEMPLATE.template.replace("{tool_definitions}", json.dumps(tools))
def function_calling_eval(input: str, output: str) -> float:
    function_calls = output.get("tool_calls")
    if function_calls:
        eval_df = pd.DataFrame(
            {"question": [input.get("question")] * len(function_calls), "tool_call": function_calls}
        )

        tool_call_eval = llm_classify(
            dataframe=eval_df,
            template=TOOL_CALLING_PROMPT_TEMPLATE.template.replace(
                "{tool_definitions}", json.dumps(tools).replace("{", '"').replace("}", '"')
            ),
            rails=["correct", "incorrect"],
            model=eval_model,
            provide_explanation=True,
        )

        tool_call_eval["score"] = tool_call_eval.apply(
            lambda x: 1 if x["label"] == "correct" else 0, axis=1
        )
        return tool_call_eval["score"].mean()
    else:
        return 0


def code_is_runnable(output: str) -> bool:
    """Check if the code is runnable"""
    generated_code = output.get("tool_responses")
    if not generated_code:
        return True

    # Find the first generate_visualization response
    generated_code = next(
        (r for r in generated_code if r.get("tool_name") == "generate_visualization"), None
    )
    if not generated_code:
        return True

    # Get the first response
    generated_code = generated_code.get("tool_response", "")
    generated_code = generated_code.strip()
    generated_code = generated_code.replace("```python", "").replace("```", "")
    try:
        exec(generated_code)
        return True
    except Exception:
        return False


def evaluate_sql_result(output, expected) -> bool:
    sql_result = output.get("tool_responses")
    if not sql_result:
        return True

    # Find first lookup_sales_data response
    sql_result = next((r for r in sql_result if r.get("tool_name") == "lookup_sales_data"), None)
    if not sql_result:
        return True

    # Get the first response
    sql_result = sql_result.get("tool_response", "")

    # Extract just the numbers from both strings
    result_nums = "".join(filter(str.isdigit, sql_result))
    expected_nums = "".join(filter(str.isdigit, expected.get("sql_result")))
    return result_nums == expected_nums


def evaluate_clarity(output: str, input: str) -> bool:
    df = pd.DataFrame({"query": [input.get("question")], "response": [output.get("final_output")]})
    response = llm_classify(
        dataframe=df,
        template=CLARITY_LLM_JUDGE_PROMPT,
        rails=["clear", "unclear"],
        model=eval_model,
        provide_explanation=True,
    )
    return response["label"] == "clear"


def evaluate_entity_correctness(output: str, input: str) -> bool:
    df = pd.DataFrame({"query": [input.get("question")], "response": [output.get("final_output")]})
    response = llm_classify(
        dataframe=df,
        template=ENTITY_CORRECTNESS_LLM_JUDGE_PROMPT,
        rails=["correct", "incorrect"],
        model=eval_model,
        provide_explanation=True,
    )
    return response["label"] == "correct"
def run_overall_experiment(example: Example) -> str:
    with suppress_tracing():
        return run_agent_and_track_path_combined(example)


experiment = run_experiment(
    dataset,
    run_overall_experiment,
    evaluators=[
        function_calling_eval,
        evaluate_sql_result,
        evaluate_clarity,
        evaluate_entity_correctness,
        code_is_runnable,
    ],
    experiment_name="Overall Experiment",
    experiment_description="Evaluating the overall experiment",
)

Congratulations! 🎉

You've now evaluated every aspect of your agent. If you've made it this far, you're now an expert in evaluating agent routers, tools, and paths!
