Thank you for the help - to be fair, that seems like it's overcomplicating the issue a bit, so I will write the logic myself and use Phoenix just for the visualization itself. Given a dataframe with the following structure and the classification report, is there a way to upload those results to Phoenix just for visualization? John G.
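A minimal sketch of what that could look like, reusing the px.Client / upload_dataset calls that appear further down in this thread; the "predicted" column and the dataframe contents are placeholders, not something from the original messages:
# Sketch only: push the results dataframe to Phoenix as a dataset so it can be
# browsed in the UI. Column names ("query", "ground_truth", "predicted") are
# placeholders for whatever the real dataframe contains.
from datetime import datetime

import pandas as pd
import phoenix as px

df = pd.DataFrame({
    "query": ["I love this!", "This is awful."],
    "ground_truth": ["joy", "anger"],
    "predicted": ["joy", "sadness"],
})

phoenix_client = px.Client(endpoint="https://XXXXXXX.com")
now = datetime.now().strftime("%Y-%m-%d %H:%M:%S")
phoenix_client.upload_dataset(
    dataframe=df,
    dataset_name=f"classification-results-{now}",
    input_keys=["query"],
    output_keys=["ground_truth", "predicted"],
)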
Hi guys, does anyone have an example to share on how to use llm_classify with experiments, or how to use an LLM SDK to reproduce the same functionality?
Would appreciate any examples you could share with me that achieve something similar.
Got it - thanks
RunLLM I modified the code to this:
rails = ["correct", "incorrect"]
# Use llm_classify to get predictions
classification_df = llm_classify(
    data=df,
    template=eval_prompt,
    model=OpenAIModel(model="meta-llama/Meta-Llama-3-8B-Instruct", api_key="EMPTY"),
    rails=rails,
    verbose=True
)
But I got an error: retries exhausted after 1 attempts: Missing template variable: 'input
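For what it's worth, llm_classify fills the template variables from the dataframe's column names, so that error suggests the dataframe has no "input" column while eval_prompt references {input}, {expected} and {output}. A rough sketch of one workaround, assuming df still has the query / ground_truth columns from the dataset upload plus a hypothetical "predicted" column holding the task output, and reusing eval_prompt from the code below:
from phoenix.evals import OpenAIModel, llm_classify

# Sketch only: rename columns so they line up with the {input} / {expected} /
# {output} variables in eval_prompt. "predicted" is a placeholder for wherever
# the task's output was stored.
eval_df = df.rename(
    columns={
        "query": "input",
        "ground_truth": "expected",
        "predicted": "output",
    }
)

classification_df = llm_classify(
    data=eval_df,
    template=eval_prompt,
    model=OpenAIModel(model="meta-llama/Meta-Llama-3-8B-Instruct", api_key="EMPTY"),
    rails=["correct", "incorrect"],
    verbose=True,
)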
Experiments - evaluation question
Hi, I see that in most examples llm_classify is used for classification eval. I tried to use the examples from the experiment folder but got stuck. How can I recreate the functionality of llm_classify to output categorical labels so that Phoenix will calculate precision/recall/F1 for me? This is my current code; I would appreciate any pointers on how to convert it to a classification eval.
# Imports (paths below are an assumption for a recent arize-phoenix install)
from datetime import datetime
from typing import Any, Dict

import phoenix as px
from openai import OpenAI
from phoenix.experiments import run_experiment
from phoenix.experiments.evaluators import create_evaluator

# Setup OpenAI and Phoenix Client
URL = "https://XXXXXXX.com"
phoenix_client = px.Client(endpoint=URL)
openai_client = OpenAI(base_url="https://XXXXXX/v1", api_key="XXXXX")
# Upload Dataset
now = datetime.now().strftime("%Y-%m-%d %H:%M:%S")
dataset = phoenix_client.upload_dataset(
    dataframe=df,
    dataset_name=f"sentiment-analysis-{now}",
    input_keys=["query"],
    output_keys=["ground_truth"])
emotions_unique = ", ".join(df['ground_truth'].unique())
print(emotions_unique)
prompt_template = """
Classify the emotion present in the text below. You should only respond with the name of the emotion, no other words.
The emotion must be one of the provided values.
Input
=======
[Text]: {text}
[Provided Values]: {emotions}
"""
def make_emotion_task(prompt_template: str, model_name: str):
    def task(input: Dict[str, Any]) -> str:
        formatted_prompt = prompt_template.format(text=input["query"], emotions=emotions_unique)
        response = openai_client.chat.completions.create(
            model=model_name,
            messages=[{"role": "user", "content": formatted_prompt}]
        )
        assert response.choices
        return response.choices[0].message.content.strip()
    return task
# Instantiate task
task = make_emotion_task(prompt_template, model_name="meta-llama/Meta-Llama-3-8B-Instruct")
# Evaluation prompt for model-as-a-judge
eval_prompt = """
Your task is to evaluate whether the predicted emotion below describes the supplied input text.
We are also including the correct emotion as a piece of data.
Begin Data:
[input text]: {input}
[correct emotion]: {expected}
[predicted emotion]: {output}
It's possible that the predicted emotion is another word for the correct emotion, and the two are
roughly equivalent. If the two emotions are equivalent, respond with the word 'correct'. If they
are not equivalent, respond with the word 'incorrect'. Do not include any other words in your
response
"""
# Evaluator function
@create_evaluator(kind="llm")
def llm_as_a_judge_eval(input: Dict[str, Any], output: str, expected: Dict[str, Any]) -> float:
    message_content = eval_prompt.format(
        input=input["query"],
        expected=expected["ground_truth"],
        output=output
    )
    response = openai_client.chat.completions.create(
        model="meta-llama/Meta-Llama-3-8B-Instruct",
        messages=[{"role": "user", "content": message_content}]
    )
    assert response.choices
    return 1.0 if response.choices[0].message.content.strip().lower() == "correct" else 0.0
@create_evaluator(kind="llm")
def classification_eval(input, output, expected) -> str:
    # Exact-match comparison; returns a categorical label per example
    predicted = output.strip().lower()
    actual = expected["ground_truth"].strip().lower()
    return "correct" if predicted == actual else "incorrect"
experiment = run_experiment(
    dataset=dataset,
    task=task,
    experiment_name="sentiment-experiment",
    experiment_description="baseline experiment",
    experiment_metadata={"vendor": "OpenAI", "model": "Meta-Llama-3-8B-Instruct"},
    evaluators=[classification_eval],
    dry_run=None
)
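Not a Phoenix-native answer, but since the task output and the ground truth are plain strings, one way to get precision/recall/F1 is to compute a classification report locally with scikit-learn once the per-example predictions are collected. This is a sketch outside of Phoenix's own scoring, and the "predicted" values here are hypothetical:
# Sketch only: local metrics with scikit-learn, not Phoenix's built-in scoring.
import pandas as pd
from sklearn.metrics import classification_report

results = pd.DataFrame({
    "ground_truth": ["joy", "anger", "fear"],
    "predicted": ["joy", "sadness", "fear"],  # hypothetical task outputs
})

# Per-class precision / recall / F1 over the categorical labels
print(classification_report(results["ground_truth"], results["predicted"]))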
