Error Logging Evaluations in UI: KeyError in DataFrame Processing
Hi team, i am facing this error while logging evaluations on ui. The traces are being sent but the evaluations are not comming. Actually the get_qa_with_reference(px.Client()) returns empty df and get_retrieved_documents(px.Client()) returns none error :
File "/Users/priya/datasci/llm_as_a_service/llamaas/qa/chatbot_qa/job_builder.py", line 188, in submit
self.dashboard_data[names[index]] = df_result.mean(numeric_only=True)["score"]
File "/Users/priya/tracenv/lib/python3.9/site-packages/pandas/core/series.py", line 1111, in __getitem__
return self._get_value(key)
File "/Users/priya/tracenv/lib/python3.9/site-packages/pandas/core/series.py", line 1227, in _get_value
loc = self.index.get_loc(label)
File "/Users/priya/tracenv/lib/python3.9/site-packages/pandas/core/indexes/range.py", line 417, in get_loc
raise KeyError(key)
KeyError: 'score'code snippet :
def submit(self):
os.environ["PHOENIX_PROJECT_NAME"] = self.project_name
print(f"local server setup")
session = px.launch_app()
LangChainInstrumentor().instrument()
ai_dresult = {}
question_num = 0
error_questions = 0
total_questions = len(self.question_answer_pool)
failed_questions = [] # To keep track of questions that failed
for question, human_answer in self.question_answer_pool.items():
try:
normalized_question = normalize_spacing(question)
ai_answer = self.chat_app_run(normalized_question)
ai_dresult[normalized_question] = ai_answer
question_num += 1
except Exception as e:
error_questions += 1
failed_questions.append((question, str(e))) # Store the failed question and the error
logger.error(f"An error occurred while processing the question '{question}': {e}")
# Optionally, you can continue to the next iteration if you want to skip failed ones
continue
# After processing all questions, logging the failed ones for review
if failed_questions:
logger.error(f"Failed questions summary: {len(failed_questions)} errors encountered.")
for failed_question, error_message in failed_questions:
logger.error(f"Question: {failed_question} | Error: {error_message}")
logger.info(f"The evaluation based on dataset:{self.dataset_id}. {question_num} questions to run")
logger.info(f"The evaluation based on dataset:{self.dataset_id}.")
logger.info(f"Total questions to be evaluated: {total_questions}")
logger.info(f"Questions processed successfully: {question_num}")
logger.info(f"Questions resulted in errors: {error_questions}")
# log the traces
logger.info(f"Getting input, output and reference from traces ...")
input_output_df = get_qa_with_reference(px.Client())
input_output_df["correct_answer"] = input_output_df["input"].apply(
lambda x: self.question_answer_pool[x])
input_output_df["ai_answer"] = input_output_df["input"].apply(
lambda x: ai_dresult[x])
retrieved_documents_df = get_retrieved_documents(px.Client())
print(f"input_output_df: {input_output_df}")
print(f"retrieved_documents_df: {retrieved_documents_df}")
if self.evaluators_qa_with_reference:
evaluations_list = self.evaluators_qa_with_reference.values()
logger.info(f"Running evaluations: {evaluations_list}")
df_results = []
names = []
for name in self.evaluators_qa_with_reference.values():
df_results.append(f"result_of_{name}")
names.append(name)
df_results = run_evals(
dataframe=input_output_df,
evaluators=self.evaluators_qa_with_reference.keys(),
provide_explanation=True,
)
print(f"self evalutaionsndndh: {self.evaluators_qa_with_reference.keys()}")
logger.info(f"Log evaluation results to UI for: {evaluations_list}")
for index, df_result in enumerate(df_results):
px.Client().log_evaluations(
SpanEvaluations(eval_name=names[index], dataframe=df_result)
)
self.dashboard_data[names[index]] = df_result.mean(numeric_only=True)["score"]
if self.evaluators_retrieved_documents:
evaluations_retrived = self.evaluators_retrieved_documents.values()
logger.info(f"Running evaluations: {evaluations_retrived}")
relevance_eval_df = run_evals(
dataframe=retrieved_documents_df,
evaluators=self.evaluators_retrieved_documents.keys(),
provide_explanation=True,
)[0]
logger.info(f"Log evaluation results to UI: {evaluations_retrived}")
px.Client().log_evaluations(
DocumentEvaluations(eval_name="Relevance", dataframe=relevance_eval_df),
)
self.dashboard_data["Relevance"] = relevance_eval_df.mean(numeric_only=True)["score"]
if self.run_outputtone:
logger.info("Run evaluation: outputtone")
output_tone_df = run_output_tone_evaluation(input_output_df)
logger.debug(f"output_tone_df:{output_tone_df}")
logger.info(f"Log evaluation results to UI: outputtone")
px.Client().log_evaluations(SpanEvaluations(eval_name="output_tone", dataframe=output_tone_df))
# TODO: implement the function
self.dashboard_data["timestamp"] = datetime.now().timestamp()
if self.dataset_id > BENCHMARK_DATASET_ID_MIN and self.project_name != PROJECT_NAMES.get('RANDOM'):
logger.info("Log evaluations scores to Bigquery")
util.log_scores_to_bigQuery(self.dashboard_data)
logger.info("The traces and evaluation done")
time.sleep(self.job_save_seconds_for_local_run)