Hey guys, I am running llm_classify() but I am not receiving any explanation for the "correct"/"incorrect" labels. Here is my implementation (image 1). I do not really know what is wrong; we were wondering if it could be related to the way we are extracting the data in the "tool_call" column (image 2). Can anyone give us some guidance?
💡 Hint: Mention RunLLM in the thread for followups.
In our case, tool_call="llm.function_call" doesn't work, so we wrote a custom function to extract that value manually, because it was inside the 'response' column. I don't know if that is affecting us.
Sure. I am going to split the context across several messages for a better explanation. Here is my questions dataframe:
questions
0 "I bought the JBL 300Z headphones a while back...
1 "Could you let me know what high-performance l...
2 "I recently purchased something from your elec...
3 "It seems like the Samsung 56C monitor I order...
4 "I was hoping you could help me figure out whe...
5 "I'm curious about whether using the promo cod...then
import phoenix as px
from langchain_openai import ChatOpenAI...
session = px.launch_app()...
from openinference.instrumentation.openai import OpenAIInstrumentor
from phoenix.otel import register
# configure the Phoenix tracer
tracer_provider = register()
OpenAIInstrumentor().instrument(tracer_provider=tracer_provider)defining functions:
## function definitions using pydantic decorator
@tool
def product_comparison(product_a_id: str, product_b_id: str) -> dict:
    """Compare features of two products."""
    # Keep the docstring verbatim: @tool presumably exposes it as the tool
    # description, which is later read via `.description` -- TODO confirm.
    # Stub logic: any pair of non-empty ids is reported as "Similar".
    both_ids_present = product_a_id != "" and product_b_id != ""
    if both_ids_present:
        return {"comparison": "Similar"}
    return {"error": "missing product id"}
@tool
def product_details(product_id: str) -> dict:
    """Get detailed features on one product."""
    # Keep the docstring verbatim: @tool presumably exposes it as the tool
    # description -- TODO confirm.
    # Stub logic: every non-empty id gets the same fixed product record.
    if product_id != "":
        return {"name": "Product Name", "price": "$12.50", "Availability": "In Stock"}
    return {"error": "missing product id"}
@tool
def apply_discount_code(order_id: int, discount_code: str) -> dict:
    """Apply a discount code to an order."""
    # Keep the docstring verbatim: @tool presumably exposes it as the tool
    # description -- TODO confirm.
    #
    # order_id is annotated as int, so comparing it only against "" could never
    # detect a missing order id for a well-typed call. None is now treated as
    # missing as well; the "" comparison stays for callers that pass raw text.
    order_missing = order_id is None or order_id == ""
    code_missing = discount_code is None or discount_code == ""
    if order_missing or code_missing:
        return {"error": "missing order id or discount code"}
    # Stub result: the flag is the string "True", not a boolean.
    return {"applied": "True"}
@tool
def product_search(
    query: str,
    category: str = None,
    min_price: float = 0.0,
    max_price: float = None,
    page: int = 1,
    page_size: int = 20,
) -> dict:
    """Search for products based on criteria."""
    # Keep the docstring verbatim: @tool presumably exposes it as the tool
    # description -- TODO confirm.
    # Stub logic: only `query` is inspected; the filter and paging arguments are
    # accepted but never read, and the result is always an empty first page.
    if query != "":
        empty_page = {"total": 0, "page": 1, "page_size": 20}
        return {"results": [], "pagination": empty_page}
    return {"error": "missing query"}
@tool
def customer_support(issue_type: str) -> dict:
    """Get contact information for customer support."""
    # Keep the docstring verbatim: @tool presumably exposes it as the tool
    # description -- TODO confirm.
    # Stub logic: the issue type is echoed back as the "contact" value.
    return {"error": "missing issue type"} if issue_type == "" else {"contact": issue_type}
@tool
def track_package(tracking_number: int) -> dict:
    """Track the status of a package."""
    # Keep the docstring verbatim: @tool presumably exposes it as the tool
    # description -- TODO confirm.
    #
    # tracking_number is annotated as int, so comparing it only against ""
    # could never detect a missing value for a well-typed call. None is now
    # treated as missing as well; the "" comparison stays for raw-text callers.
    if tracking_number is None or tracking_number == "":
        return {"error": "missing tracking number"}
    # Stub result: every tracking number is reported as delivered.
    return {"status": "Delivered"}
tools = [
product_comparison,
product_search,
customer_support,
track_package,
apply_discount_code,
product_details,
]defining assistant for tool calling:
from langgraph.prebuilt import create_react_agent
# Chat model used by the agent; the API key is fetched from token_manager at
# construction time.
model = ChatOpenAI(
    openai_api_key=token_manager.get_token(),
    openai_api_base="https:...",
    model="gpt-4o",
)
# Agent executor built from the model and the tool list defined earlier.
langgraph_agent_executor = create_react_agent(model=model, tools=tools)
# Define a function to process each query
def process_query(query):
    """Run one question through the agent and pair it with the final answer.

    The agent is invoked with a single human message; the ``content`` of the
    last entry in the returned ``"messages"`` list is reported as the output.

    Returns:
        A dict with keys ``"input"`` (the query as given) and ``"output"``.
    """
    agent_state = langgraph_agent_executor.invoke({"messages": [("human", query)]})
    final_message = agent_state["messages"][-1]
    return {"input": query, "output": final_message.content}
# Apply the function to each question in the DataFrame
questions_df["response"] = questions_df["questions"].apply(lambda query: process_query(query))
questions_dfgot this:
questions response
0 "I bought the JBL 300Z headphones a while back... {'input': '"I bought the JBL 300Z headphones a...
1 "Could you let me know what high-performance l... {'input': '"Could you let me know what high-pe...
2 "I recently purchased something from your elec... {'input': '"I recently purchased something fro...
3 "It seems like the Samsung 56C monitor I order... {'input': '"It seems like the Samsung 56C moni...
4 "I was hoping you could help me figure out whe... {'input': '"I was hoping you could help me fig...
5 "I'm curious about whether using the promo cod... {'input': '"I'm curious about whether using th...
6 "I ordered a Dyson 10J vacuum not too long ago... {'input': '"I ordered a Dyson 10J vacuum not t...
7 "It would be great if you could explain the be... {'input': '"It would be great if you could exp...
8 "Would you be able to provide details on smart... {'input': '"Would you be able to provide detai...evaluation part:
from phoenix.trace import SpanEvaluations
from phoenix.trace.dsl import SpanQuery
# Select only spans of kind LLM (the filter is a Python boolean expression passed
# as a string) and expose three span attributes under shorter column names.
# NOTE(review): in the output shown in this thread the tool_call column is empty
# for every span, while the tool calls appear nested inside the output messages,
# so "llm.function_call" does not seem to be populated here -- TODO confirm.
query = SpanQuery().where("span_kind == 'LLM'").select(
    question="llm.input_messages",
    response="llm.output_messages",
    tool_call="llm.function_call",
)
trace_df = px.Client().query_spans(query)
trace_df["tool_call"] = trace_df["tool_call"].fillna("No tool used")
trace_dfoutput:
question response tool_call
context.span_id
a4b3c014e46015e7 [{'message': {'role': 'user', 'content': '"I b... None No tool used
861067ee6b580315 [{'message': {'role': 'user', 'content': '"I b... [{'message': {'tool_calls': [{'tool_call': {'f... No tool used
e87963e99a6f9e5e [{'message': {'role': 'user', 'content': '"I b... [{'message': {'role': 'assistant', 'content': ... No tool used
4f8aa7732e060181 [{'message': {'role': 'user', 'content': '"Cou... [{'message': {'tool_calls': [{'tool_call': {'f... No tool usedit seems the llm.function_call doesn't exist, so we made a custom function:
# 3. Function to Extract Tool Call Information
def extract_tool_call(response):
    """Summarise the tool calls of the first output message as one string.

    Args:
        response: One cell of the ``response`` column -- expected to be a list
            whose first item looks like ``{"message": {"tool_calls": [...]}}``.
            Any other value (``None``, NaN, an empty list, a malformed entry)
            is tolerated and treated as "no tool call".

    Returns:
        ``"Function: <name>, Arguments: <arguments>"`` for every tool call of
        the first message, joined with ``"; "``. ``"No tool used"`` when the
        message carries no tool calls or the structure cannot be read.
    """
    no_tool = "No tool used"
    try:
        tool_calls = response[0]["message"].get("tool_calls")
        if not tool_calls:
            return no_tool
        # Each entry nests the call as {"tool_call": {"function": {...}}}.
        functions = [tc["tool_call"]["function"] for tc in tool_calls]
        return "; ".join(
            f"Function: {fn['name']}, Arguments: {fn['arguments']}" for fn in functions
        )
    except (TypeError, KeyError, IndexError, AttributeError):
        # AttributeError covers a "message" that is not a dict (e.g. None);
        # without it one malformed row would abort the whole DataFrame.apply.
        return no_tool
# 4. Apply the Function to Create the 'tool_call' Column
trace_df["tool_call"] = trace_df["response"].apply(extract_tool_call)
This is the output now; I am not sure if this is the correct format for the tool_call column:
question response tool_call
context.span_id
a4b3c014e46015e7 [{'message': {'role': 'user', 'content': '"I b... None No tool used
861067ee6b580315 [{'message': {'role': 'user', 'content': '"I b... [{'message': {'tool_calls': [{'tool_call': {'f... Function: product_details, Arguments: {"produc...
e87963e99a6f9e5e [{'message': {'role': 'user', 'content': '"I b... [{'message': {'role': 'assistant', 'content': ... No tool used
4f8aa7732e060181 [{'message': {'role': 'user', 'content': '"Cou... [{'message': {'tool_calls': [{'tool_call': {'f... Function: product_search, Arguments: {"query":...
809cb705e6af18c7 [{'message': {'role': 'user', 'content': '"Cou... [{'message': {'role': 'assistant', 'content': ... No tool usedour eval model:
eval_model = OpenAIModel(
model="gpt-4o-2024-08-06",
base_url="https...",
api_key=token_manager.get_token(),
)then:
# One "<name>: <description>" entry per tool, each preceded and followed by a
# newline, concatenated in the order of `tools`.
tool_definitions = "".join(
    f"\n{current_tool.name}: {current_tool.description}\n" for current_tool in tools
)
print(tool_definitions)finally:
# Allowed output labels for the classifier: the values of the rails map.
rails = list(TOOL_CALLING_PROMPT_RAILS_MAP.values())
# llm_classify: Phoenix method
response_classifications = llm_classify(
data=trace_df,
# Substitute the tool definitions into the template text before evaluation.
# NOTE(review): only the text of the first template part is passed on, as a plain
# string, so anything else carried by TOOL_CALLING_PROMPT_TEMPLATE (possibly an
# explanation-specific prompt) is not forwarded -- TODO confirm whether this is
# why the explanation column is None even though provide_explanation=True.
template = TOOL_CALLING_PROMPT_TEMPLATE.template[0].template.replace("{tool_definitions}", tool_definitions),
model=eval_model,
rails=rails,
provide_explanation=True
)
response_classifications["score"] = response_classifications.apply(
lambda x: 1 if x["label"] == "correct" else 0, axis=1
)
We don't get an explanation:
label explanation exceptions execution_status execution_seconds score
context.span_id
a4b3c014e46015e7 incorrect None [] COMPLETED 1.228536 0
861067ee6b580315 incorrect None [] COMPLETED 1.547667 0
e87963e99a6f9e5e incorrect None [] COMPLETED 1.469549 0
4f8aa7732e060181 correct None [] COMPLETED 1.733419 1
809cb705e6af18c7 incorrect None [] COMPLETED 1.296693 0
12ad48497d6a7d19 incorrect None [] COMPLETED 1.314777 0
076a1c592d0dc498 incorrect None [] COMPLETED 1.266717 0
940fc34151352d8c incorrect None [] COMPLETED 1.225860 0
Basically, we were just trying to replicate this notebook, but we made some changes because of deprecations: https://github.com/Arize-ai/phoenix/blob/main/tutorials/evals/evaluate_tool_calling.ipynb
Got it. Do you have the notebook you can share?
They don't allow me to share scripts.
It sounds like you were expecting there to be a column in the dataframe that wasn't included when you pulled trace_df?
are you referring to this:
.select(
# Extract and rename the following span attributes
#check how is it done with lang
question="llm.input_messages",
response="llm.output_messages",
tool_call="llm.function_call",
)
)
trace_df YES
But since I don't get it, I extracted that info from the response.
