getting an issue where I've defined a custom evaluator and am returning an EvaluationResult object but get an error because a dict is expected
"""
Accuracy evaluation using structured AI output.
"""
from typing import Any, Dict
from phoenix.experiments.evaluators import create_evaluator
from phoenix.experiments.types import EvaluationResult
from src.ai.prompt_runner import PromptRunner
@create_evaluator(kind="llm")
async def accuracy(input: Dict[str, Any], output: str, expected: str) -> Dict[str, Any]:
"""
Evaluate accuracy using structured AI output.
Args:
input: Input data containing doc_path
output: Generated summary string from the task
expected: Expected summary string
Returns:
Dict with label, explanation, and score
"""
prompt_runner = PromptRunner()
# Handle case where expected might be a dict or string
reference_answer = (
expected
if isinstance(expected, str)
else expected.get("expected_summary", str(expected))
)
eval_result = await prompt_runner.evaluate_accuracy(
input_text=input["doc_path"],
reference_answer=reference_answer,
answer=output,
)
return EvaluationResult(
label=eval_result.label,
explanation=eval_result.explanation,
score=eval_result.score,
)code where it errors out:
# Fragment pasted from the experiment runner at the point where the error is
# raised. Indentation was lost in the paste, so nesting below is not reliable.
# `result` is the evaluator's return value; the runner reads it as a mapping
# via result.get(...), which is why returning an EvaluationResult instance
# instead of a dict fails with AttributeError (see error text below).
if result:
# Filter out None values for OpenTelemetry attributes
attributes: dict[str, Any] = {}
if (score := result.get("score")) is not None:
attributes["evaluation.score"] = score
if (label := result.get("label")) is not None:
attributes["evaluation.label"] = label
if attributes:
span.set_attributes(attributes)
span.set_attribute(OPENINFERENCE_SPAN_KIND, EVALUATOR)
span.set_status(status)error:
AttributeError: 'EvaluationResult' object has no attribute 'get'

Before this, I was returning a dict rather than an EvaluationResult object, but I ended up with a different issue.
You are getting `AttributeError: 'EvaluationResult' object has no attribute 'get'` because Phoenix expects your evaluator to return a dictionary (or something dict-like), not an instance of the EvaluationResult class. The EvaluationResult type is a type alias for a dictionary, not a class to instantiate directly. You should return a dict, e.g., {"label": ..., "explanation": ..., "score": ...} instead of EvaluationResult(...) (see docs, see create_evaluator usage). So, change your return statement to:

    return {
        "label": eval_result.label,
        "explanation": eval_result.explanation,
        "score": eval_result.score,
    }

Would you like more detail or code context? Sources:
- create_evaluator expects dict return
- see docs
💡 Hint: Mention RunLLM in the thread for followups.
This error occurs if your evaluator returns a dict that does not match the expected structure. Phoenix expects a dict with specific keys: at minimum, "score" (float or int), and optionally "label" (str), "explanation" (str), and "metadata" (dict). If your dict is missing required keys or has unexpected types, you'll get this error (see implementation). Ensure your return value is like:

    return {
        "score": float(eval_result.score),
        "label": str(eval_result.label),
        "explanation": str(eval_result.explanation),
    }

and that all values are the correct types. Would you like a full example or more troubleshooting steps? Sources:
- Evaluator return type and error handling
💡 Hint: Mention RunLLM in the thread for followups.
If your dict has the correct keys and types but you still get `ValueError: Unsupported evaluation result type: <class 'dict'>`, it's likely because Phoenix's evaluator expects only the keys "score", "label", and "explanation" — no extra keys or nested structures (see code). Also, ensure you are not returning a nested dict or tuple, and that your function is not wrapped in another decorator that changes the return type. If the structure is correct and the error persists, double-check that you are not returning a dict inside a tuple or list, and that your function is not returning None or another type in some cases. If possible, print the exact value being returned right before the return statement to confirm. Would you like a minimal working example or help debugging further? Sources:
- Evaluator return type and error handling
💡 Hint: Mention RunLLM in the thread for followups.
no extra keys provided RunLLM am i using the right set of imports?
from phoenix.experiments.evaluators import create_evaluator
from phoenix.client.experiments import run_experiment
from phoenix.client import Client, AsyncClient
# The failing call: the new phoenix.client run_experiment/AsyncClient are used
# with an evaluator built via the legacy phoenix.experiments decorator — an
# API mix the bot answer fused onto the closing paren below points out.
run_experiment(
    dataset=dataset,
    task=task,
    experiment_name="initial-experiment",
    evaluators=[accuracy],
    client=AsyncClient(),
)Your imports are inconsistent: you are mixing phoenix.experiments.evaluators (old API) with phoenix.client.experiments (new API). For the new Phoenix client, you should use from phoenix.client.experiments import create_evaluator instead of from phoenix.experiments.evaluators import create_evaluator (see docs).Update your import to: LBKSfrom phoenix.client.experiments import create_evaluator, run_experimentLBKSWould you like a full minimal example for the new client?Sources:
- phoenix.client.experiments API
馃挕聽Hint: Mention RunLLM in the thread for followups.
Hey Pranav P.. I'm not able to reproduce this issue, but please try the snippet below. If you encounter the same issue, you might try upgrading your version of arize-phoenix-evals to the latest. Please be aware that your code snippet and the code snippet below use the legacy client and legacy phoenix.experiments module, which is in the process of being deprecated.
from typing import Any, Dict
from openai import OpenAI
from phoenix import Client
from phoenix.experiments import run_experiment
from phoenix.experiments.types import EvaluationResult
# Instantiate the legacy Phoenix client and an OpenAI client.
client = Client()
openai_client = OpenAI()

# Upload a single-example quickstart dataset: one input question, one
# expected answer, plus row metadata.
dataset = client.upload_dataset(
    dataset_name="experiment-quickstart-dataset",
    inputs=[{"question": "What is Paul Graham known for?"}],
    # Fixed typo in the reference answer ("techology" -> "technology") so the
    # word-overlap evaluator scores real model output correctly.
    outputs=[{"answer": "Co-founding Y Combinator and writing on startups and technology."}],
    metadata=[{"topic": "tech"}],
)
def jaccard_similarity(output: str, expected: Dict[str, Any]) -> EvaluationResult:
    """Score word overlap between the actual and expected answers.

    Computes the Jaccard index (|intersection| / |union|) over the
    lowercased, space-split word sets of the two answers.

    Args:
        output: Answer string produced by the task.
        expected: Dataset output row; the reference answer is under "answer".

    Returns:
        An EvaluationResult carrying the Jaccard score, a label, and an
        explanation. (The original annotation said ``-> float``, which
        contradicted the EvaluationResult actually returned.)
    """
    # https://en.wikipedia.org/wiki/Jaccard_index
    actual_words = set(output.lower().split(" "))
    expected_words = set(expected["answer"].lower().split(" "))
    words_in_common = actual_words.intersection(expected_words)
    all_words = actual_words.union(expected_words)
    return EvaluationResult(
        score=len(words_in_common) / len(all_words),
        label="good",
        explanation="The answer is accurate and contains the expected keywords.",
    )
task_prompt_template = "Answer in a few words: {question}"


def task(input: Any) -> str:
    """Answer one dataset row's question with gpt-4o and return the text."""
    # Fill the shared prompt template with this row's question.
    prompt = task_prompt_template.format(question=input["question"])
    completion = openai_client.chat.completions.create(
        model="gpt-4o",
        messages=[{"role": "user", "content": prompt}],
    )
    # The API may return None content; normalize to an empty string.
    answer = completion.choices[0].message.content
    return answer if answer else ""
# Run the task over every dataset row, then apply each evaluator to the
# (input, output, expected) triples; uses the legacy phoenix.experiments API.
experiment = run_experiment(
    dataset=dataset,
    task=task,
    experiment_name="initial-experiment",
    evaluators=[jaccard_similarity],
)
really confusing having the same methods importable from different packages
Thanks Pranav P.. We definitely feel your pain on this. cc Dustin N. Elizabeth H.
