Experiments - evaluation question
Hi, I see that in most examples llm_classify is used for classification eval. I tried to use examples from the experiments folder but got stuck. How can I recreate the functionality of llm_classify to output categorical labels so that Phoenix will calculate precision/recall/F1 for me? This is my current code; I would appreciate any pointers on how to convert it to a classification eval.
```
# Imports used below
from datetime import datetime
from typing import Any, Dict

import phoenix as px
from openai import OpenAI
from phoenix.experiments import run_experiment
from phoenix.experiments.evaluators import create_evaluator

# Setup OpenAI and Phoenix Client
URL = "https://XXXXXXX.com"
phoenix_client = px.Client(endpoint=URL)
openai_client = OpenAI(base_url="https://XXXXXX/v1", api_key="XXXXX")

# Upload Dataset
now = datetime.now().strftime("%Y-%m-%d %H:%M:%S")
dataset = phoenix_client.upload_dataset(
    dataframe=df,
    dataset_name=f"sentiment-analysis-{now}",
    input_keys=["query"],
    output_keys=["ground_truth"],
)

emotions_unique = ", ".join(df["ground_truth"].unique())
print(emotions_unique)

prompt_template = """
Classify the emotion present in the text below. You should only respond with the name of the emotion, no other words.
The emotion must be one of the provided values.
Input
=======
[Text]: {text}
[Provided Values]: {emotions}
"""

def make_emotion_task(prompt_template: str, model_name: str):
    def task(input: Dict[str, Any]) -> str:
        formatted_prompt = prompt_template.format(text=input["query"], emotions=emotions_unique)
        response = openai_client.chat.completions.create(
            model=model_name,
            messages=[{"role": "user", "content": formatted_prompt}],
        )
        assert response.choices
        return response.choices[0].message.content.strip()
    return task

# Instantiate task
task = make_emotion_task(prompt_template, model_name="meta-llama/Meta-Llama-3-8B-Instruct")

# Evaluation prompt for model-as-a-judge
eval_prompt = """
Your task is to evaluate whether the predicted emotion below describes the supplied input text.
We are also including the correct emotion as a piece of data.
Begin Data:
[input text]: {input}
[correct emotion]: {expected}
[predicted emotion]: {output}
It's possible that the predicted emotion is another word for the correct emotion, and the two are
roughly equivalent. If the two emotions are equivalent, respond with the word 'correct'. If they
are not equivalent, respond with the word 'incorrect'. Do not include any other words in your
response.
"""

# Evaluator function
@create_evaluator(kind="llm")
def llm_as_a_judge_eval(input: Dict[str, Any], output: str, expected: Dict[str, Any]) -> float:
    message_content = eval_prompt.format(
        input=input["query"],
        expected=expected["ground_truth"],
        output=output,
    )
    response = openai_client.chat.completions.create(
        model="meta-llama/Meta-Llama-3-8B-Instruct",
        messages=[{"role": "user", "content": message_content}],
    )
    assert response.choices
    return 1.0 if response.choices[0].message.content.strip().lower() == "correct" else 0.0

@create_evaluator(kind="llm")
def classification_eval(input, output, expected) -> str:
    predicted = output.strip().lower()
    actual = expected["ground_truth"].strip().lower()
    return "correct" if predicted == actual else "incorrect"

experiment = run_experiment(
    dataset=dataset,
    task=task,
    experiment_name="sentiment-experiment",
    experiment_description="baseline experiment",
    experiment_metadata={"vendor": "OpenAI", "model": "Meta-Llama-3-8B-Instruct"},
    evaluators=[classification_eval],
    dry_run=False,
)
```
To recreate llm_classify-style categorical evaluation and have Phoenix compute precision/recall/F1, you should use the llm_classify function directly, which is designed for this purpose. llm_classify takes your dataframe, a prompt template, a model, and a list of possible classes (rails), and outputs a dataframe with predicted labels. You can then compare these predictions to your ground truth to compute classification metrics. Your current approach with a custom evaluator returns "correct"/"incorrect" but does not produce the per-class predictions needed for precision/recall/F1 by class. Here's how you can adapt your code to use llm_classify for categorical classification:
```
from phoenix.evals import llm_classify, OpenAIModel

# Define your rails (unique emotion classes)
rails = list(df['ground_truth'].unique())

# Use llm_classify to get predictions
classification_df = llm_classify(
    dataframe=df,
    template=prompt_template,
    model=OpenAIModel(model_name="meta-llama/Meta-Llama-3-8B-Instruct"),
    rails=rails,
    verbose=True
)

# Now, classification_df["label"] contains the predicted class for each row.
```
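Once you have classification_df, one way to get precision/recall/F1 is scikit-learn's classification_report. This is a sketch of my own (not from the thread), assuming the rows of classification_df stay aligned with df and that scikit-learn is installed:
```
# Sketch: compute per-class precision/recall/F1 from llm_classify output.
from sklearn.metrics import classification_report

y_true = df["ground_truth"].str.lower()
# Rows the judge model couldn't map onto a rail may come back unlabeled,
# so fill them with a placeholder before comparing.
y_pred = classification_df["label"].fillna("unparseable").str.lower()

print(classification_report(y_true, y_pred, zero_division=0))
```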
💡 Hint: Mention RunLLM in the thread for followups.
RunLLM I modified the code to this one:
```
rails = ["correct", "incorrect"]

# Use llm_classify to get predictions
classification_df = llm_classify(
    data=df,
    template=eval_prompt,
    model=OpenAIModel(model="meta-llama/Meta-Llama-3-8B-Instruct", api_key="EMPTY"),
    rails=rails,
    verbose=True
)
```
But got an error: retries exhausted after 1 attempts: Missing template variable: 'input'
The error occurs because your dataframe must have columns matching all template variables used in your eval_prompt. Your template uses {input}, {expected}, and {output}, but your dataframe likely does not have columns named input, expected, and output. All template variable names must appear as column names in the dataframe for llm_classify to work properly. To fix this, rename or create columns in your dataframe to match the template variables, e.g., input, expected, and output. Would you like step-by-step guidance on how to adjust your dataframe?
Sources:
- Slack: Missing template variable 'input'
- GitHub Issue #4019
💡 Hint: Mention RunLLM in the thread for followups.
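As an illustration of that fix (my own sketch, not from the thread): assuming the dataframe has query and ground_truth columns plus a hypothetical prediction column holding the model's outputs, you could rename the columns so they match the template variables before calling llm_classify:
```
from phoenix.evals import llm_classify, OpenAIModel

# Rename columns so they line up with {input}, {expected}, {output} in eval_prompt.
# "prediction" is a hypothetical column name -- use whatever column holds your
# model's predicted emotion.
eval_df = df.rename(
    columns={
        "query": "input",            # -> {input}
        "ground_truth": "expected",  # -> {expected}
        "prediction": "output",      # -> {output}
    }
)

classification_df = llm_classify(
    data=eval_df,
    template=eval_prompt,
    model=OpenAIModel(model="meta-llama/Meta-Llama-3-8B-Instruct", api_key="EMPTY"),
    rails=["correct", "incorrect"],
)
```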
hey SimpleStudent - no doubt using llm_classify with experiments is a bit tough and it might be easier just to use an LLM SDK right now. I don't currently have the bandwidth to repro your use-case but maybe I can get someone to take a closer look in the next few days. cc Donny L.
Got it - thanks
Would appreciate any examples you could share with me that achieve something similar
Hey SimpleStudent - let me get John G. and Donny L. to lean in here. The most relevant example is probably here: https://arize.com/docs/phoenix/cookbook/datasets-and-experiments/summarization
Hey SimpleStudent - I'd actually recommend looking into this example: https://github.com/Arize-ai/phoenix/blob/main/tutorials/experiments/agents-cookbook.ipynb . The "Create an Experiment" section shows an example of using LLM classify in the evaluator of an experiment
The key here, and where your issue is coming from, is that your df must have column names that match the variables in your eval_prompt. Based on the error you sent, your prompt is looking for a column named input that doesn't exist in your dataframe.
As Mikyo said, llm_classify and experiments don't exactly work together super simply right now. llm_classify is designed to evaluate a dataframe, while experiments send individual rows through your evaluator functions one at a time. What you generally need to do is create a one-row dataframe inside your experiment evaluator and send that through llm_classify. The notebook I shared uses that approach.
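For what it's worth, a rough sketch of that one-row-dataframe pattern (my own illustration, not the exact code from the notebook; it reuses the eval_prompt defined earlier and assumes the same judge model) might look like:
```
import pandas as pd
from phoenix.evals import llm_classify, OpenAIModel
from phoenix.experiments.evaluators import create_evaluator

judge_model = OpenAIModel(model="meta-llama/Meta-Llama-3-8B-Instruct", api_key="EMPTY")

@create_evaluator(kind="llm")
def llm_classify_eval(input, output, expected) -> float:
    # Build a one-row dataframe whose columns match the template variables
    # ({input}, {expected}, {output}) in eval_prompt.
    row = pd.DataFrame(
        [{"input": input["query"], "expected": expected["ground_truth"], "output": output}]
    )
    result = llm_classify(
        data=row,
        template=eval_prompt,
        model=judge_model,
        rails=["correct", "incorrect"],
    )
    # llm_classify returns a dataframe with a "label" column.
    return 1.0 if result["label"].iloc[0] == "correct" else 0.0
```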
Thank you for the help - to be fair, that seems to overcomplicate the issue a bit, so I will write the logic myself and use Phoenix just for the visualization. Given a dataframe with the following structure and the classification report, is there a way to upload those results to Phoenix just for visualization? John G.
SimpleStudent - apologies for the delay here, that makes sense. In order to upload the results to Phoenix, you'd either need to start from a dataset and use an experiment, or start from traces that you pull down from Phoenix, evaluate, and push back up. We don't have a way to simply upload metrics at this time.
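A minimal sketch of that second path (pull spans down, run your own evaluation, log the results back), assuming the app is already traced in Phoenix; the eval_name and placeholder scoring here are illustrative only:
```
import phoenix as px
from phoenix.trace import SpanEvaluations

client = px.Client(endpoint=URL)

# Pull spans down from Phoenix as a dataframe indexed by span id.
spans_df = client.get_spans_dataframe()

# Run your own evaluation logic and build a dataframe (same span-id index)
# with "label" and/or "score" columns. Placeholder values shown here.
evals_df = spans_df[[]].copy()
evals_df["label"] = "correct"
evals_df["score"] = 1.0

# Push the evaluations back up so they appear alongside the traces.
client.log_evaluations(SpanEvaluations(eval_name="emotion-accuracy", dataframe=evals_df))
```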
