Quickstart: Datasets & Experiments

Phoenix helps you run experiments over your AI and LLM applications to evaluate and iteratively improve their performance. This quickstart shows you how to get up and running quickly.

Launch Phoenix

Using Phoenix Cloud

Grab your API key from the Keys option on the left bar.
In your code, set your endpoint and API key:

import os

# Phoenix Cloud API key, copied from the "Keys" option in the left bar.
PHOENIX_API_KEY = "ADD YOUR API KEY"

# Set the auth header and the collector endpoint as environment variables.
os.environ.update(
    {
        "PHOENIX_CLIENT_HEADERS": f"api_key={PHOENIX_API_KEY}",
        "PHOENIX_COLLECTOR_ENDPOINT": "https://app.phoenix.arize.com",
    }
)

// Phoenix Cloud API key, copied from the "Keys" option in the left bar.
const PHOENIX_API_KEY = "ADD YOUR API KEY";

// Set the auth header and the collector endpoint as environment variables.
Object.assign(process.env, {
  PHOENIX_CLIENT_HEADERS: `api_key=${PHOENIX_API_KEY}`,
  PHOENIX_COLLECTOR_ENDPOINT: "https://app.phoenix.arize.com"
});

Using Self-hosted Phoenix

In your code, set your endpoint:

import os

# Set the collector endpoint; replace the placeholder with your self-hosted Phoenix endpoint.
os.environ["PHOENIX_COLLECTOR_ENDPOINT"] = "Your Phoenix Endpoint"

// Set the collector endpoint; replace the placeholder with your self-hosted Phoenix endpoint.
process.env["PHOENIX_COLLECTOR_ENDPOINT"] = "Your Phoenix Endpoint";

Datasets

Upload a dataset.

import pandas as pd
import phoenix as px

# One-row example dataset: a question (input), a reference answer (expected
# output), and a metadata dict.
df = pd.DataFrame(
    [
        {
            "question": "What is Paul Graham known for?",
            "answer": "Co-founding Y Combinator and writing on startups and technology.",
            "metadata": {"topic": "tech"},
        }
    ]
)

# Upload the DataFrame, naming which columns hold inputs, expected outputs,
# and metadata.
phoenix_client = px.Client()
dataset = phoenix_client.upload_dataset(
    dataframe=df,
    dataset_name="test-dataset",
    input_keys=["question"],
    output_keys=["answer"],
    metadata_keys=["metadata"],
)

import { createClient } from "@arizeai/phoenix-client";
import { createDataset } from "@arizeai/phoenix-client/datasets";

// Create example data: each example carries an input, a reference output,
// and free-form metadata.
const examples = [
  {
    input: { question: "What is Paul Graham known for?" },
    output: {
      answer: "Co-founding Y Combinator and writing on startups and technology."
    },
    metadata: { topic: "tech" }
  }
];

// Initialize Phoenix client
const client = createClient();

// Upload dataset and keep its ID for use when running experiments
const { datasetId } = await createDataset({
  client,
  name: "test-dataset",
  examples: examples
});

Tasks

Create a task to evaluate.

from openai import OpenAI
from phoenix.experiments.types import Example

# Client shared by the task below.
# NOTE(review): no credentials are passed here, so presumably they come from
# the environment (e.g. OPENAI_API_KEY) — confirm.
openai_client = OpenAI()

# Prompt sent for every example; {question} is filled in with str.format.
task_prompt_template = "Answer in a few words: {question}"


def task(example: Example) -> str:
    """Answer one dataset example's question with a single chat completion.

    Args:
        example: Dataset example whose ``input["question"]`` value is inserted
            into the prompt template.

    Returns:
        The text of the first completion choice, or an empty string when the
        response carries no text content.
    """
    question = example.input["question"]
    message_content = task_prompt_template.format(question=question)
    response = openai_client.chat.completions.create(
        model="gpt-4o", messages=[{"role": "user", "content": message_content}]
    )
    # ``content`` can be None; normalise to "" so the declared ``str`` return
    # type always holds.
    return response.choices[0].message.content or ""

import { OpenAI } from "openai";
import { type RunExperimentParams } from "@arizeai/phoenix-client/experiments";

// Initialize OpenAI client
const openai = new OpenAI({
  apiKey: process.env.OPENAI_API_KEY
});

// Prompt sent for every example; the {question} placeholder is substituted below.
const taskPromptTemplate = "Answer in a few words: {question}";

/**
 * Experiment task: answers one dataset example's question with a single
 * chat completion.
 *
 * @param example - Dataset example; `example.input.question` is inserted into the prompt.
 * @returns The text of the first completion choice, or "" when there is none.
 */
const task: RunExperimentParams["task"] = async (example) => {
  // Use a fallback when the question is missing or empty; otherwise coerce to a string.
  const rawQuestion = example.input.question;
  const question = rawQuestion ? String(rawQuestion) : "No question provided";

  // A replacer function inserts the question verbatim. A plain string
  // replacement would expand `$&`, `$1`, `$$`, ... sequences occurring in it.
  const messageContent = taskPromptTemplate.replace("{question}", () => question);

  const response = await openai.chat.completions.create({
    model: "gpt-4o",
    messages: [{ role: "user", content: messageContent }]
  });

  return response.choices[0]?.message?.content ?? "";
};

Evaluators

Use pre-built evaluators to grade task output with code...

from phoenix.experiments.evaluators import ContainsAnyKeyword

# Pre-built code evaluator, configured with the keywords to look for.
# NOTE(review): per its name it presumably passes when any one keyword appears
# in the task output — confirm the matching rules (e.g. case sensitivity).
contains_keyword = ContainsAnyKeyword(keywords=["Y Combinator", "YC"])

import { asEvaluator } from "@arizeai/phoenix-client/experiments";

// Code-based evaluator: scores 1.0 when the stringified output contains any
// of the keywords (compared case-insensitively), otherwise 0.0.
const containsKeyword = asEvaluator({
  name: "contains_keyword",
  kind: "CODE",
  evaluate: async ({ output }) => {
    const keywords = ["Y Combinator", "YC"];
    const haystack = String(output).toLowerCase();

    // Stop at the first keyword found in the lower-cased output.
    let contains = false;
    for (const keyword of keywords) {
      if (haystack.includes(keyword.toLowerCase())) {
        contains = true;
        break;
      }
    }

    if (contains) {
      return {
        score: 1.0,
        label: "contains_keyword",
        metadata: { keywords },
        explanation: `Output contains one of the keywords: ${keywords.join(", ")}`
      };
    }

    return {
      score: 0.0,
      label: "missing_keyword",
      metadata: { keywords },
      explanation: `Output does not contain any of the keywords: ${keywords.join(", ")}`
    };
  }
});

or LLMs.

from phoenix.experiments.evaluators import ConcisenessEvaluator
from phoenix.evals.models import OpenAIModel

# Model wrapper handed to the evaluator below.
model = OpenAIModel(model="gpt-4o")
# Pre-built LLM-based evaluator, constructed with the model above.
conciseness = ConcisenessEvaluator(model=model)

import { asEvaluator } from "@arizeai/phoenix-client/experiments";
import { OpenAI } from "openai";

const openai = new OpenAI({
  apiKey: process.env.OPENAI_API_KEY
});

/**
 * LLM-based evaluator for conciseness.
 *
 * Asks the model for a number between 0.0 and 1.0 and reports it as the score.
 * An empty reply is read as "0". A reply that cannot be parsed as a number
 * falls back to a neutral 0.5. The label and explanation are derived from the
 * final score, so they always agree with the value reported.
 */
const conciseness = asEvaluator({
  name: "conciseness",
  kind: "LLM",
  evaluate: async ({ output }) => {
    const prompt = `
      Rate the following text on a scale of 0.0 to 1.0 for conciseness (where 1.0 is perfectly concise).
      
      TEXT: ${output}
      
      Return only a number between 0.0 and 1.0.
    `;

    const response = await openai.chat.completions.create({
      model: "gpt-4o",
      messages: [{ role: "user", content: prompt }]
    });

    const scoreText = response.choices[0]?.message?.content?.trim() || "0";
    const parsed = Number.parseFloat(scoreText);
    const parseable = !Number.isNaN(parsed);

    // Clamp to the range the prompt asks for; use 0.5 when nothing could be parsed.
    const score = parseable ? Math.min(1, Math.max(0, parsed)) : 0.5;

    return {
      score,
      label: score > 0.7 ? "concise" : "verbose",
      metadata: {},
      explanation: parseable
        ? `Conciseness score: ${score}`
        : `Could not parse a score from the model reply; defaulting to ${score}`
    };
  }
});

Define custom evaluators with code...

from typing import Any, Dict


def jaccard_similarity(output: str, expected: Dict[str, Any]) -> float:
    """Score the word overlap between the task output and the reference answer.

    Both texts are lower-cased and split on whitespace; the score is the size
    of the intersection of the two word sets divided by the size of their union.

    Args:
        output: Text produced by the task.
        expected: Mapping whose ``"answer"`` entry holds the reference text.

    Returns:
        A value in [0.0, 1.0]. Two empty texts score 1.0.
    """
    # https://en.wikipedia.org/wiki/Jaccard_index
    # split() with no argument splits on any run of whitespace and never
    # yields empty tokens, unlike split(" ").
    actual_words = set(output.lower().split())
    expected_words = set(expected["answer"].lower().split())
    all_words = actual_words | expected_words
    if not all_words:
        # Both texts are empty: nothing differs, and this avoids dividing by zero.
        return 1.0
    return len(actual_words & expected_words) / len(all_words)

import { asEvaluator } from "@arizeai/phoenix-client/experiments";

/**
 * Custom Jaccard similarity evaluator.
 *
 * Lower-cases the output and the expected answer, splits both on whitespace,
 * and scores |intersection| / |union| of the two word sets. Two empty texts
 * score 1. A missing or non-string expected answer is treated as empty text.
 */
const jaccardSimilarity = asEvaluator({
  name: "jaccard_similarity",
  kind: "CODE",
  evaluate: async ({ output, expected }) => {
    // Split on any run of whitespace and drop empty tokens, so repeated
    // spaces, tabs, and newlines do not add a bogus "" word.
    const tokenize = (text: string): Set<string> =>
      new Set(
        text
          .toLowerCase()
          .split(/\s+/)
          .filter((word) => word.length > 0)
      );

    const actualWords = tokenize(String(output));
    const expectedAnswer = expected?.answer;
    const expectedWords = tokenize(
      typeof expectedAnswer === "string" ? expectedAnswer : ""
    );

    const wordsInCommon = new Set(
      [...actualWords].filter((word) => expectedWords.has(word))
    );

    const allWords = new Set([...actualWords, ...expectedWords]);
    // An empty union means both texts are empty; report 1 rather than 0 / 0 = NaN.
    const score = allWords.size === 0 ? 1 : wordsInCommon.size / allWords.size;

    return {
      score,
      label: score > 0.5 ? "similar" : "dissimilar",
      metadata: {
        actualWordsCount: actualWords.size,
        expectedWordsCount: expectedWords.size,
        commonWordsCount: wordsInCommon.size,
        allWordsCount: allWords.size
      },
      explanation: `Jaccard similarity: ${score}`
    };
  }
});

or LLMs.

from phoenix.experiments.evaluators import create_evaluator
from typing import Any, Dict

# Prompt for the LLM judge; the three placeholders are filled in with str.format.
eval_prompt_template = """
Given the QUESTION and REFERENCE_ANSWER, determine whether the ANSWER is accurate.
Output only a single word (accurate or inaccurate).

QUESTION: {question}

REFERENCE_ANSWER: {reference_answer}

ANSWER: {answer}

ACCURACY (accurate / inaccurate):
"""


@create_evaluator(kind="llm")  # need the decorator or the kind will default to "code"
def accuracy(input: Dict[str, Any], output: str, expected: Dict[str, Any]) -> float:
    """Ask the model whether the task output matches the reference answer.

    Args:
        input: Example input; ``input["question"]`` is placed in the prompt.
        output: Text produced by the task.
        expected: Example reference output; ``expected["answer"]`` is placed
            in the prompt.

    Returns:
        1.0 when the model's verdict is "accurate", otherwise 0.0.
    """
    message_content = eval_prompt_template.format(
        question=input["question"], reference_answer=expected["answer"], answer=output
    )
    response = openai_client.chat.completions.create(
        model="gpt-4o", messages=[{"role": "user", "content": message_content}]
    )
    # ``content`` can be None. Also tolerate surrounding whitespace, casing, and
    # trailing punctuation, so a reply such as "Accurate." still counts.
    verdict = (response.choices[0].message.content or "").strip().lower().rstrip(".!")
    return 1.0 if verdict == "accurate" else 0.0

import { asEvaluator } from "@arizeai/phoenix-client/experiments";
import { OpenAI } from "openai";

const openai = new OpenAI({
  apiKey: process.env.OPENAI_API_KEY
});

/**
 * LLM-based accuracy evaluator.
 *
 * Builds a prompt from the example's question, the reference answer, and the
 * task output, then asks the model for a one-word verdict. Scores 1.0 when the
 * verdict is "accurate" and 0.0 otherwise.
 */
const accuracy = asEvaluator({
  name: "accuracy",
  kind: "LLM",
  evaluate: async ({ input, output, expected }) => {
    const question = input.question
      ? String(input.question)
      : "No question provided";
    const referenceAnswer = expected?.answer
      ? String(expected.answer)
      : "No reference answer provided";

    const evalPromptTemplate = `
      Given the QUESTION and REFERENCE_ANSWER, determine whether the ANSWER is accurate.
      Output only a single word (accurate or inaccurate).
      
      QUESTION: {question}
      
      REFERENCE_ANSWER: {reference_answer}
      
      ANSWER: {answer}
      
      ACCURACY (accurate / inaccurate):
    `;

    const substitutions: Record<string, string> = {
      question,
      reference_answer: referenceAnswer,
      answer: String(output)
    };

    // Fill every placeholder in one pass with a replacer function. Chained
    // string replacements would re-scan text that was just inserted (e.g. a
    // question containing "{answer}") and would expand `$&`-style sequences.
    const messageContent = evalPromptTemplate.replace(
      /\{(question|reference_answer|answer)\}/g,
      (placeholder, key: string) => substitutions[key] ?? placeholder
    );

    const response = await openai.chat.completions.create({
      model: "gpt-4o",
      messages: [{ role: "user", content: messageContent }]
    });

    // Ignore casing, surrounding whitespace, and trailing punctuation, so a
    // reply such as "Accurate." still counts.
    const verdict = (response.choices[0]?.message?.content ?? "")
      .trim()
      .toLowerCase()
      .replace(/[.!]+$/, "");
    const isAccurate = verdict === "accurate";

    return {
      score: isAccurate ? 1.0 : 0.0,
      label: isAccurate ? "accurate" : "inaccurate",
      metadata: {},
      explanation: `LLM determined the answer is ${isAccurate ? "accurate" : "inaccurate"}`
    };
  }
});

Experiments

Run an experiment and evaluate the results.

from phoenix.experiments import run_experiment

# Run `task` against `dataset` under the name "initial-experiment" and
# evaluate the results with the two custom evaluators.
experiment = run_experiment(
    dataset,
    task,
    experiment_name="initial-experiment",
    evaluators=[jaccard_similarity, accuracy],
)

import { runExperiment } from "@arizeai/phoenix-client/experiments";

// Run `task` against the uploaded dataset under the name "initial-experiment"
// and evaluate the results with the two custom evaluators
const experiment = await runExperiment({
  client,
  experimentName: "initial-experiment",
  dataset: { datasetId }, // Use the dataset ID from earlier
  task,
  evaluators: [jaccardSimilarity, accuracy]
});

// Print the experiment's ID
console.log("Initial experiment completed with ID:", experiment.id);

Run more evaluators after the fact.

from phoenix.experiments import evaluate_experiment

# Apply two more evaluators to the existing experiment; `experiment` is
# rebound to the value evaluate_experiment returns.
experiment = evaluate_experiment(experiment, evaluators=[contains_keyword, conciseness])

import { evaluateExperiment } from "@arizeai/phoenix-client/experiments";

// Add more evaluations to an existing experiment
const updatedEvaluation = await evaluateExperiment({
  client,
  experiment, // Use the existing experiment object
  evaluators: [containsKeyword, conciseness]
});

// Logs the ID of the experiment that was passed in; `updatedEvaluation` is not read here
console.log("Additional evaluations completed for experiment:", experiment.id);

And iterate 🚀

Dry Run

Sometimes we may want to do a quick sanity check on the task function or the evaluators before unleashing them on the full dataset. Both run_experiment() and evaluate_experiment() accept a dry_run= parameter for this purpose: it executes the task and evaluators on a small subset without sending data to the Phoenix server. Setting dry_run=True selects one sample from the dataset, and setting it to a number, e.g. dry_run=3, selects that many samples. The sampling is deterministic, so you can keep re-running it for debugging purposes.

Previous: Overview: Datasets & Experiments | Next: How-to: Datasets

Last updated 4 days ago

Was this helpful?