Integrating Phoenix Evaluators in Your Custom Experiment
I have a very simple experiment, but I want to improve it so that it makes the best use of Phoenix’s evals and prompt tooling. I’m concerned there are important things I’m missing out on by not using the LLMEvaluator class. The docs state:
Note: All arize-phoenix-evals Evaluators are drop-in compatible with experiments.
then they link to https://arize.com/docs/phoenix/datasets-and-experiments/how-to-experiments/using-evaluators. I find it very confusing that there aren’t any examples of running a phoenix.evals.evaluators.Evaluator implementation with phoenix.experiments.run_experiment, and I can’t figure out why my custom LLMEvaluator isn’t working (I’ve included a rough sketch of what I attempted at the very bottom of this post). Can you help me find an example of how to implement a custom LLMEvaluator and run it with run_experiment?

It’s also confusing that the docs on datasets and experiments discuss evals, and then that section is followed by another section dedicated to Evaluation that uses different modules, with no reference to the experiments framework. What am I missing?

This is the experiment I’m working on:
import json
from pathlib import Path
from typing import Any
import llm
from phoenix.client import Client
from phoenix.experiments import run_experiment
from pydantic import BaseModel, Field
from snakemake.script import snakemake
from ankihub_llm.base import get_documents_from_file_with_llm
from ankihub_llm.experiments.common.common import rebuild_phoenix_dataset_from_file
from ankihub_llm.experiments.common.tracing import setup_tracing
setup_tracing()
def fetch_prompt() -> str:
    # Pull the configured version of the pdf_text_extraction prompt from Phoenix
    # and return its system message text, rendered for the OpenAI SDK.
    client = Client()
    version_id: str = snakemake.config.get("prompt_version")
    prompt_version = client.prompts.get(prompt_version_id=version_id)
    return prompt_version.format(sdk="openai")["messages"][0]["content"]
PROMPT = fetch_prompt()
EXTRACTION_MODEL = snakemake.config["extraction_model_name"]
JUDGE_MODEL = snakemake.config["judge_model_name"]
class Alignment(BaseModel):
aligned: bool = Field(
description="Whether the extracted text strictly follows the system instructions"
)
explanation: str = Field(description="An explanation for the alignment decision")
def llm_alignment_evaluator(
    output: dict[str, Any] | None, input=None, metadata=None
) -> tuple[bool, str]:
    # output can be None if the task failed for this example, so guard before indexing.
    if output is None:
        return False, "No task output to evaluate."
    pdf_url = input["pdf_url"]
    pages = output["pages"]
text = "\n\n".join(p.get("markdown", "") for p in pages)
if not text.strip():
return False, "No extracted text to evaluate."
judge = llm.get_model(JUDGE_MODEL)
# TODO Pull this prompt from phoenix
prompt = (
"You are judging if the extracted text strictly follows the system instructions. Examine the attached PDF and the extracted text.\n"
f"Extracted pages: {len(pages)}\n"
f"<instructions>\n{PROMPT}\n</instructions>\n\n"
f"<extracted>\n{text}\n</extracted>\n"
)
    # Ask the judge for a structured Alignment verdict, attaching the source PDF for comparison
resp = judge.prompt(
prompt, schema=Alignment, attachments=[llm.Attachment(url=pdf_url)]
)
evaluation = json.loads(resp.text())
return bool(evaluation.get("aligned", False)), str(
evaluation.get("explanation", "")
)
def phoenix_task(example):
    # Run extraction for one dataset example and return the pages in a JSON-serializable shape
    pdf_url: str = example.input["pdf_url"]
documents = get_documents_from_file_with_llm(
pdf_url=pdf_url, prompt=PROMPT, model_name=EXTRACTION_MODEL
)
return {"pages": [{"markdown": d.page_content, **d.metadata} for d in documents]}
def main() -> None:
output_path = Path(snakemake.output.results)
dataset = rebuild_phoenix_dataset_from_file(snakemake.input.dataset)
experiment = run_experiment(
dataset=dataset,
task=phoenix_task,
evaluators=[llm_alignment_evaluator],
experiment_name="009_llm_text_extraction",
experiment_description="Extract PDF text with system prompt and judge alignment",
experiment_metadata={},
concurrency=4,
timeout=120,
)
df = experiment.as_dataframe()
output_path.parent.mkdir(parents=True, exist_ok=True)
    # orient="records" writes one JSON object per row; pandas rejects index=False
    # with the default "columns" orient.
    df.to_json(output_path, orient="records", indent=2)
if __name__ == "__main__":
main()
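For reference, here is roughly the phoenix.evals-based evaluator I attempted before falling back to the plain function above. I’m reconstructing it from memory, so the import paths and constructor arguments may well be exactly the part I’m getting wrong; the rails and template text are just placeholders:

# Rough sketch of what I tried: wrap a classification template in phoenix.evals'
# LLMEvaluator and pass it straight to run_experiment, as the "drop-in compatible"
# note seems to suggest. The constructor kwargs below are my best guess.
from phoenix.evals import LLMEvaluator, OpenAIModel
from phoenix.evals.templates import ClassificationTemplate

alignment_template = ClassificationTemplate(
    rails=["aligned", "not_aligned"],
    template=(
        "You are judging if the extracted text strictly follows the system instructions.\n"
        "<instructions>\n{instructions}\n</instructions>\n\n"
        "<extracted>\n{extracted}\n</extracted>\n"
        "Answer with exactly one label: aligned or not_aligned."
    ),
)

# Assumes the judge is an OpenAI model; JUDGE_MODEL comes from the snakemake config above.
alignment_llm_evaluator = LLMEvaluator(
    model=OpenAIModel(model=JUDGE_MODEL),
    template=alignment_template,
)

# ...which I then passed in place of the plain function:
# run_experiment(dataset=dataset, task=phoenix_task, evaluators=[alignment_llm_evaluator], ...)

Passing this in place of llm_alignment_evaluator is where things fall apart for me, so a pointer to a working example of this pattern (or to how the template variables are supposed to map onto the experiment’s input and output) would be really helpful.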