Error Logging Evaluations in UI: KeyError in DataFrame Processing

·May 08, 2024 02:46 AM

Hi team, i am facing this error while logging evaluations on ui. The traces are being sent but the evaluations are not comming. Actually the get_qa_with_reference(px.Client()) returns empty df and get_retrieved_documents(px.Client()) returns none error :

File "/Users/priya/datasci/llm_as_a_service/llamaas/qa/chatbot_qa/job_builder.py", line 188, in submit
    self.dashboard_data[names[index]] = df_result.mean(numeric_only=True)["score"]
  File "/Users/priya/tracenv/lib/python3.9/site-packages/pandas/core/series.py", line 1111, in __getitem__
    return self._get_value(key)
  File "/Users/priya/tracenv/lib/python3.9/site-packages/pandas/core/series.py", line 1227, in _get_value
    loc = self.index.get_loc(label)
  File "/Users/priya/tracenv/lib/python3.9/site-packages/pandas/core/indexes/range.py", line 417, in get_loc
    raise KeyError(key)
KeyError: 'score'

code snippet :

def submit(self):

        os.environ["PHOENIX_PROJECT_NAME"] = self.project_name
        print(f"local server setup")
        session = px.launch_app()
        LangChainInstrumentor().instrument()

        ai_dresult = {}
        question_num = 0
        error_questions = 0
        total_questions = len(self.question_answer_pool)
        failed_questions = []  # To keep track of questions that failed

        for question, human_answer in self.question_answer_pool.items():
            try:
                normalized_question = normalize_spacing(question)
                ai_answer = self.chat_app_run(normalized_question)
                ai_dresult[normalized_question] = ai_answer
                question_num += 1
            except Exception as e:
                error_questions += 1
                failed_questions.append((question, str(e)))  # Store the failed question and the error
                logger.error(f"An error occurred while processing the question '{question}': {e}")
                # Optionally, you can continue to the next iteration if you want to skip failed ones
                continue

        # After processing all questions, logging the failed ones for review
        if failed_questions:
            logger.error(f"Failed questions summary: {len(failed_questions)} errors encountered.")
            for failed_question, error_message in failed_questions:
                logger.error(f"Question: {failed_question} | Error: {error_message}")

        logger.info(f"The evaluation based on dataset:{self.dataset_id}. {question_num} questions to run")
        logger.info(f"The evaluation based on dataset:{self.dataset_id}.")
        logger.info(f"Total questions to be evaluated: {total_questions}")
        logger.info(f"Questions processed successfully: {question_num}")
        logger.info(f"Questions resulted in errors: {error_questions}")
        # log the traces
        logger.info(f"Getting input, output and reference from traces ...")
        input_output_df = get_qa_with_reference(px.Client())
        input_output_df["correct_answer"] = input_output_df["input"].apply(
            lambda x: self.question_answer_pool[x])

        input_output_df["ai_answer"] = input_output_df["input"].apply(
            lambda x: ai_dresult[x])
        retrieved_documents_df = get_retrieved_documents(px.Client())
        print(f"input_output_df: {input_output_df}")
        print(f"retrieved_documents_df: {retrieved_documents_df}")
        if self.evaluators_qa_with_reference:
            evaluations_list = self.evaluators_qa_with_reference.values()
            logger.info(f"Running evaluations: {evaluations_list}")
            df_results = []
            names = []
            for name in self.evaluators_qa_with_reference.values():
                df_results.append(f"result_of_{name}")
                names.append(name)

            df_results = run_evals(
                dataframe=input_output_df,
                evaluators=self.evaluators_qa_with_reference.keys(),
                provide_explanation=True,
            )
            print(f"self evalutaionsndndh: {self.evaluators_qa_with_reference.keys()}")
            logger.info(f"Log evaluation results to UI for: {evaluations_list}")
            for index, df_result in enumerate(df_results):
                px.Client().log_evaluations(
                    SpanEvaluations(eval_name=names[index], dataframe=df_result)
                )
                self.dashboard_data[names[index]] = df_result.mean(numeric_only=True)["score"]

        if self.evaluators_retrieved_documents:
            evaluations_retrived = self.evaluators_retrieved_documents.values()
            logger.info(f"Running evaluations: {evaluations_retrived}")
            relevance_eval_df = run_evals(
                dataframe=retrieved_documents_df,
                evaluators=self.evaluators_retrieved_documents.keys(),
                provide_explanation=True,
            )[0]
            logger.info(f"Log evaluation results to UI: {evaluations_retrived}")
            px.Client().log_evaluations(
                DocumentEvaluations(eval_name="Relevance", dataframe=relevance_eval_df),
            )
            self.dashboard_data["Relevance"] = relevance_eval_df.mean(numeric_only=True)["score"]

        if self.run_outputtone:
            logger.info("Run evaluation: outputtone")
            output_tone_df = run_output_tone_evaluation(input_output_df)
            logger.debug(f"output_tone_df:{output_tone_df}")
            logger.info(f"Log evaluation results to UI: outputtone")
            px.Client().log_evaluations(SpanEvaluations(eval_name="output_tone", dataframe=output_tone_df))

        #  TODO: implement the function
        self.dashboard_data["timestamp"] = datetime.now().timestamp()
        if self.dataset_id > BENCHMARK_DATASET_ID_MIN and self.project_name != PROJECT_NAMES.get('RANDOM'):
            logger.info("Log evaluations scores to Bigquery")
            util.log_scores_to_bigQuery(self.dashboard_data)
        logger.info("The traces and evaluation done")

        time.sleep(self.job_save_seconds_for_local_run)

17 comments