1DS19ET063 P.

Commented on Error Logging Evaluations in UI: KeyError in DataF...·Posted inPhoenix Support

ohh! now i wonder why is that happening. Btw Thanks for your help Roger Y. Xander S.

Commented on Error Logging Evaluations in UI: KeyError in DataF...·Posted inPhoenix Support

Thanks Roger Y. it is working now. but I wonder that earlier i was using get_qa_with_reference() and it was working fine, and suddenly from monday it started throwing invalid index error 🤔

Commented on Error Logging Evaluations in UI: KeyError in DataF...·Posted inPhoenix Support

1DS19ET063 P.

i have done few modifications :


    def submit(self):

        if self.server.server_identifier == "127.0.0.1":
            os.environ["PHOENIX_PROJECT_NAME"] = self.project_name
            print(f"local server setup")
            session = px.launch_app()
            LangChainInstrumentor().instrument()
        else:
            print("Setting up for remote server...")
            os.environ["PHOENIX_PROJECT_NAME"] = self.project_name
            os.environ["PHOENIX_COLLECTOR_ENDPOINT"]
            print(f"PHOENIX_COLLECTOR_ENDPOINT is now set to: {os.environ.get('PHOENIX_COLLECTOR_ENDPOINT')}")
            LangChainInstrumentor().instrument()
        ai_dresult = {}
        question_num = 0
        error_questions = 0
        total_questions = len(self.question_answer_pool)
        failed_questions = []  # To keep track of questions that failed

        for question, human_answer in self.question_answer_pool.items():
            try:
                normalized_question = normalize_spacing(question)
                ai_answer = self.chat_app_run(normalized_question)
                ai_dresult[normalized_question] = ai_answer
                question_num += 1
            except Exception as e:
                error_questions += 1
                failed_questions.append((question, str(e)))  # Store the failed question and the error
                logger.error(f"An error occurred while processing the question '{question}': {e}")
                # Optionally, you can continue to the next iteration if you want to skip failed ones
                continue

        # After processing all questions, logging the failed ones for review
        if failed_questions:
            logger.error(f"Failed questions summary: {len(failed_questions)} errors encountered.")
            for failed_question, error_message in failed_questions:
                logger.error(f"Question: {failed_question} | Error: {error_message}")

        logger.info(f"The evaluation based on dataset:{self.dataset_id}. {question_num} questions to run")
        logger.info(f"The evaluation based on dataset:{self.dataset_id}.")
        logger.info(f"Total questions to be evaluated: {total_questions}")
        logger.info(f"Questions processed successfully: {question_num}")
        logger.info(f"Questions resulted in errors: {error_questions}")
        # log the traces
        logger.info(f"Getting input, output and reference from traces ...")
        try:
            # Retrieve parent spans
            df_parent = px.Client().query_spans(
                SpanQuery()
                .select(
                    "span_id",
                    input="input.value",
                    output="output.value",
                )
                .with_index('trace_id')
                .where("parent_id is None"),
            )
            df_parent = df_parent.dropna()
            logger.info(f"Retrieved data of df_parent: {df_parent}")
            logger.info(f"DataFrame index name of df_parent: {df_parent.index.name}")
            logger.info(f"Parent DataFrame columns: {df_parent.columns}")

            # Retrieve child spans
            df_child = px.Client().query_spans(
                SpanQuery()
                .where("span_kind == 'RETRIEVER'")
                .with_index('trace_id')
                .concat(
                    "retrieval.documents",
                    reference="document.content",
                ),
            )
            logger.info(f"Child DataFrame initial data: {df_child}")

            df_child = pd.DataFrame({"reference": df_child.groupby("context.trace_id")
                                    ['reference'].apply(lambda x: '\n\n'.join(x))})
            logger.info(f"Child DataFrame columns: {df_child.columns}")

            df_final = pd.concat([df_parent, df_child], axis=1, join='inner').set_index("context.span_id")
            logger.info(f"Final DataFrame: {df_final}")

            # Merge parent and child dataframes if necessary
            input_output_df = pd.merge(df_parent, df_child, on='context.span_id', how='left')
            input_output_df.drop_duplicates(subset=['input'], inplace=True)  # Ensuring no duplicates by input column

            input_output_df["correct_answer"] = input_output_df["input"].apply(
                lambda x: self.question_answer_pool.get(x, "Unknown"))
            input_output_df["ai_answer"] = input_output_df["input"].apply(
                lambda x: ai_dresult.get(x, "No AI Response"))

        except Exception as e:
            logger.error(f"An error occurred while retrieving or processing data: {e}")
            return

        # retrieved_documents_df = get_retrieved_documents(px.Client())

        if self.evaluators_qa_with_reference:
            evaluations_list = self.evaluators_qa_with_reference.values()
            logger.info(f"Running evaluations: {evaluations_list}")
            df_results = []
            names = []
            for name in self.evaluators_qa_with_reference.values():
                df_results.append(f"result_of_{name}")
                names.append(name)

            df_results = run_evals(
                dataframe=input_output_df,
                evaluators=self.evaluators_qa_with_reference.keys(),
                provide_explanation=True,
            )
            logger.info(f"Log evaluation results to UI for: {evaluations_list}")
            if 'span_id' not in input_output_df.columns:
                logger.error("span_id column missing in DataFrame")
                return

            # Set the DataFrame index to span_id for evaluation logging
            input_output_df.set_index('span_id', inplace=True)

            # Logging the DataFrame with the correct index
            for index, df_result in enumerate(df_results):
                if not df_result.empty:
                    df_result.set_index('span_id', inplace=True)  # Ensure the correct index
                    px.Client().log_evaluations(
                        SpanEvaluations(eval_name=names[index], dataframe=df_result)
                    )
                    self.dashboard_data[names[index]] = df_result.mean(numeric_only=True)["score"]
                else:
                    logger.error("Evaluation DataFrame is empty")
            # for index, df_result in enumerate(df_results):
            #     px.Client().log_evaluations(
            #         SpanEvaluations(eval_name=names[index], dataframe=df_result)
            #     )
            #     self.dashboard_data[names[index]] = df_result.mean(numeric_only=True)["score"]
        retrieved_documents_df = get_retrieved_documents(px.Client())
        if 'span_id' in retrieved_documents_df.columns:
            retrieved_documents_df.set_index('span_id', inplace=True)
            retrieved_documents_eval = retrieved_documents_df.copy()  # Assuming this is the correct DataFrame

            # Calculating relevance score
            retrieved_documents_eval["score"] = (
                retrieved_documents_eval['label'][~retrieved_documents_eval['label'].isna()] == "relevant"
            ).astype(int)

            # Log retrieval evaluations
            px.Client().log_evaluations(DocumentEvaluations(eval_name="Relevance", dataframe=retrieved_documents_eval))
            self.dashboard_data["Relevance"] = relevance_eval_df.mean(numeric_only=True)["score"]
        else:
            logger.error("span_id column missing in retrieved documents DataFrame")

        if self.evaluators_retrieved_documents:
            evaluations_retrived = self.evaluators_retrieved_documents.values()
            logger.info(f"Running evaluations: {evaluations_retrived}")
            relevance_eval_df = run_evals(
                dataframe=retrieved_documents_df,
                evaluators=self.evaluators_retrieved_documents.keys(),
                provide_explanation=True,
            )[0]
            logger.info(f"Log evaluation results to UI: {evaluations_retrived}")
            px.Client().log_evaluations(
                DocumentEvaluations(eval_name="Relevance", dataframe=relevance_eval_df),
            )
            self.dashboard_data["Relevance"] = relevance_eval_df.mean(numeric_only=True)["score"]

        if self.run_outputtone:
            logger.info("Run evaluation: outputtone")
            output_tone_df = run_output_tone_evaluation(input_output_df)
            logger.debug(f"output_tone_df:{output_tone_df}")
            logger.info(f"Log evaluation results to UI: outputtone")
            px.Client().log_evaluations(SpanEvaluations(eval_name="output_tone", dataframe=output_tone_df))

        #  TODO: implement the function
        self.dashboard_data["timestamp"] = datetime.now().timestamp()
        if self.dataset_id > BENCHMARK_DATASET_ID_MIN and self.project_name != PROJECT_NAMES.get('RANDOM'):
            logger.info("Log evaluations scores to Bigquery")
            util.log_scores_to_bigQuery(self.dashboard_data)
        logger.info("The traces and evaluation done")

        time.sleep(self.job_save_seconds_for_local_run)

logs :

Retrieved data of df_parent:
context.trace_id                                    ...                                                   
a4ccddfad55c01c696403ee0b75736c4  8b9e86c033a458a0  ...  {"documents": ["page_content=\"Title: SMUI/SMS...
c9ee0cbd25b5a8ea4f65cbaebff21a08  445a8964f909737a  ...  {"generations": [[{"text": "\"LLM\" can refer ...
2c35eed21f209d12a714ff1edcb8bcb9  eddd09b2ac1ca77d  ...  {"confidence_score": 0.0, "result": "I don't k...
347bc66c697a8d6bd61ec625cd1106ec  98000343032d8284  ...  {"documents": ["page_content='Title: 2018\\n\\...
88e320c1730ecafa80a4f6489669a977  0d0514a0ca01a2b3  ...  {"generations": [[{"text": "As of my last upda...

2024-05-14T23:26:55.478861+0000 INFO [job_builder] - DataFrame index name of df_parent: context.trace_id
2024-05-14T23:26:55.479139+0000 INFO [job_builder] - Parent DataFrame columns: Index(['context.span_id', 'input', 'output'], dtype='object')
2024-05-14T23:26:55.518683+0000 INFO [job_builder] - Child DataFrame initial data:                                                                           reference
context.trace_id                                                                  
6e3a51ba94f4315fada1733c7231a1a2  Title: 2018\n\n\n\n11\n\n\n\n29 Meeting notes\...
a4ccddfad55c01c696403ee0b75736c4  Title: SMUI/SMSM Specific Guides\n\nWeblog lag...
347bc66c697a8d6bd61ec625cd1106ec  Title: 2018\n\n\n\n11\n\n\n\n29 Meeting notes\...
2024-05-14T23:26:55.521006+0000 INFO [job_builder] - Child DataFrame columns: Index(['reference'], dtype='object')
2024-05-14T23:26:55.528978+0000 INFO [job_builder] - Final DataFrame:                                                               input  ...                                          reference
context.span_id             
71c8d750b06873c9                                       What is LLM?  ...  Title: SMUI/SMSM Specific Guides\n\nWeblog lag...
47e6f936e212787c                               Who is USA president  ...  Title: 2018\n\n\n\n11\n\n\n\n29 Meeting notes\...
8b9e86c033a458a0                                       What is LLM?  ...  Title: SMUI/SMSM Specific Guides\n\nWeblog lag...
98000343032d8284                               Who is USA president  ...  Title: 2018\n\n\n\n11\n\n\n\n29 Meeting notes\...
[30 rows x 3 columns]
2024-05-14T23:26:55.529319+0000 ERROR [job_builder] - An error occurred while retrieving or processing data: 'context.span_id'

Commented on Error Logging Evaluations in UI: KeyError in DataF...·Posted inPhoenix Support

1DS19ET063 P.

Thanks for sharing this Roger Y. but i think i am missing out something, because even after making those changes i am getting the following error, below i have attached the error and the code.


    def submit(self):

        if self.server.server_identifier == "127.0.0.1":
            os.environ["PHOENIX_PROJECT_NAME"] = self.project_name
            print(f"local server setup")
            session = px.launch_app()
            LangChainInstrumentor().instrument()
        else:
            print("Setting up for remote server...")
            os.environ["PHOENIX_PROJECT_NAME"] = self.project_name
            os.environ["PHOENIX_COLLECTOR_ENDPOINT"]
            print(f"PHOENIX_COLLECTOR_ENDPOINT is now set to: {os.environ.get('PHOENIX_COLLECTOR_ENDPOINT')}")
            LangChainInstrumentor().instrument()
        ai_dresult = {}
        question_num = 0
        error_questions = 0
        total_questions = len(self.question_answer_pool)
        failed_questions = []  # To keep track of questions that failed

        for question, human_answer in self.question_answer_pool.items():
            try:
                normalized_question = normalize_spacing(question)
                ai_answer = self.chat_app_run(normalized_question)
                ai_dresult[normalized_question] = ai_answer
                question_num += 1
            except Exception as e:
                error_questions += 1
                failed_questions.append((question, str(e)))  # Store the failed question and the error
                logger.error(f"An error occurred while processing the question '{question}': {e}")
                # Optionally, you can continue to the next iteration if you want to skip failed ones
                continue

        # After processing all questions, logging the failed ones for review
        if failed_questions:
            logger.error(f"Failed questions summary: {len(failed_questions)} errors encountered.")
            for failed_question, error_message in failed_questions:
                logger.error(f"Question: {failed_question} | Error: {error_message}")

        logger.info(f"The evaluation based on dataset:{self.dataset_id}. {question_num} questions to run")
        logger.info(f"The evaluation based on dataset:{self.dataset_id}.")
        logger.info(f"Total questions to be evaluated: {total_questions}")
        logger.info(f"Questions processed successfully: {question_num}")
        logger.info(f"Questions resulted in errors: {error_questions}")
        # log the traces
        logger.info(f"Getting input, output and reference from traces ...")
        try:
            # Retrieve parent spans
            df_parent = px.Client().query_spans(
                SpanQuery()
                .select(
                    span_id="context.span_id",
                    input="input.value",
                    output="output.value",
                )
                .with_index('span_id')
                .where("parent_id is None"),
            )
            df_parent = df_parent.dropna()
            logger.info(f"Retrieved DataFrame columns: {df_parent.columns.tolist()}")
            if 'context.span_id' not in df_parent.columns:
                logger.error("'context.span_id' column missing in parent DataFrame")
                return

            # Retrieve child spans
            df_child = px.Client().query_spans(
                SpanQuery()
                .where("span_kind == 'RETRIEVER'")
                .with_index('span_id')
                .select(
                    parent_id="parent_id",  # Explicitly retrieve 'parent_id'
                    span_id="parent_id"
                )
                .concat(
                    "retrieval.documents",
                    reference="document.content",
                ),
            )
            logger.info(f"Child DataFrame initial data: {df_child}")

            if 'context.span_id' not in df_child.columns:
                logger.error("'context.span_id' column missing in child DataFrame")
                return
            df_child = pd.DataFrame({"reference": df_child.groupby("context.span_id")
                                    ['reference'].apply(lambda x: '\n\n'.join(x))})
            df_final = pd.concat([df_parent, df_child], axis=1, join='inner').set_index("context.span_id")
            logger.info(f"Final DataFrame: {df_final}")
            logger.info(f"Child DataFrame Columns: {df_child.columns.tolist()}")
        # Check if parent_id is in df_child before merge
            if 'parent_id' not in df_child.columns:
                logger.error("parent_id column missing in child DataFrame")
                return

            # Merge parent and child dataframes if necessary
            input_output_df = pd.merge(df_parent, df_child, how='left', left_index=True, right_on='parent_id')
            input_output_df.drop_duplicates(subset=['input'], inplace=True)  # Ensuring no duplicates by input column

            input_output_df["correct_answer"] = input_output_df["input"].apply(
                lambda x: self.question_answer_pool.get(x, "Unknown"))
            input_output_df["ai_answer"] = input_output_df["input"].apply(
                lambda x: ai_dresult.get(x, "No AI Response"))

        except Exception as e:
            logger.error(f"An error occurred while retrieving or processing data: {e}")
            return

        # retrieved_documents_df = get_retrieved_documents(px.Client())

        if self.evaluators_qa_with_reference:
            evaluations_list = self.evaluators_qa_with_reference.values()
            logger.info(f"Running evaluations: {evaluations_list}")
            df_results = []
            names = []
            for name in self.evaluators_qa_with_reference.values():
                df_results.append(f"result_of_{name}")
                names.append(name)

            df_results = run_evals(
                dataframe=input_output_df,
                evaluators=self.evaluators_qa_with_reference.keys(),
                provide_explanation=True,
            )
            logger.info(f"Log evaluation results to UI for: {evaluations_list}")
            if 'context.span_id' not in input_output_df.columns:
                logger.error("context.span_id column missing in DataFrame")
                return

            # Set the DataFrame index to context.span_id for evaluation logging
            input_output_df.set_index('context.span_id', inplace=True)

            # Logging the DataFrame with the correct index
            for index, df_result in enumerate(df_results):
                if not df_result.empty:
                    df_result.set_index('context.span_id', inplace=True)  # Ensure the correct index
                    px.Client().log_evaluations(
                        SpanEvaluations(eval_name=names[index], dataframe=df_result)
                    )
                    self.dashboard_data[names[index]] = df_result.mean(numeric_only=True)["score"]
                else:
                    logger.error("Evaluation DataFrame is empty")
            # for index, df_result in enumerate(df_results):
            #     px.Client().log_evaluations(
            #         SpanEvaluations(eval_name=names[index], dataframe=df_result)
            #     )
            #     self.dashboard_data[names[index]] = df_result.mean(numeric_only=True)["score"]
        retrieved_documents_df = get_retrieved_documents(px.Client())
        if 'context.span_id' in retrieved_documents_df.columns:
            retrieved_documents_df.set_index('context.span_id', inplace=True)
            retrieved_documents_eval = retrieved_documents_df.copy()  # Assuming this is the correct DataFrame

            # Calculating relevance score
            retrieved_documents_eval["score"] = (
                retrieved_documents_eval['label'][~retrieved_documents_eval['label'].isna()] == "relevant"
            ).astype(int)

            # Log retrieval evaluations
            px.Client().log_evaluations(DocumentEvaluations(eval_name="Relevance", dataframe=retrieved_documents_eval))
            self.dashboard_data["Relevance"] = relevance_eval_df.mean(numeric_only=True)["score"]
        else:
            logger.error("context.span_id column missing in retrieved documents DataFrame")

        if self.evaluators_retrieved_documents:
            evaluations_retrived = self.evaluators_retrieved_documents.values()
            logger.info(f"Running evaluations: {evaluations_retrived}")
            relevance_eval_df = run_evals(
                dataframe=retrieved_documents_df,
                evaluators=self.evaluators_retrieved_documents.keys(),
                provide_explanation=True,
            )[0]
            logger.info(f"Log evaluation results to UI: {evaluations_retrived}")
            px.Client().log_evaluations(
                DocumentEvaluations(eval_name="Relevance", dataframe=relevance_eval_df),
            )
            self.dashboard_data["Relevance"] = relevance_eval_df.mean(numeric_only=True)["score"]

        if self.run_outputtone:
            logger.info("Run evaluation: outputtone")
            output_tone_df = run_output_tone_evaluation(input_output_df)
            logger.debug(f"output_tone_df:{output_tone_df}")
            logger.info(f"Log evaluation results to UI: outputtone")
            px.Client().log_evaluations(SpanEvaluations(eval_name="output_tone", dataframe=output_tone_df))

        #  TODO: implement the function
        self.dashboard_data["timestamp"] = datetime.now().timestamp()
        if self.dataset_id > BENCHMARK_DATASET_ID_MIN and self.project_name != PROJECT_NAMES.get('RANDOM'):
            logger.info("Log evaluations scores to Bigquery")
            util.log_scores_to_bigQuery(self.dashboard_data)
        logger.info("The traces and evaluation done")

        time.sleep(self.job_save_seconds_for_local_run)

error :


2024-05-14T21:48:35.789032+0000 INFO [job_builder] - The evaluation based on dataset:100001. 2 questions to run
2024-05-14T21:48:35.789712+0000 INFO [job_builder] - The evaluation based on dataset:100001.
2024-05-14T21:48:35.789990+0000 INFO [job_builder] - Total questions to be evaluated: 2
2024-05-14T21:48:35.790171+0000 INFO [job_builder] - Questions processed successfully: 2
2024-05-14T21:48:35.790293+0000 INFO [job_builder] - Questions resulted in errors: 0
2024-05-14T21:48:35.790399+0000 INFO [job_builder] - Getting input, output and reference from traces ...
2024-05-14T21:48:35.834075+0000 INFO [job_builder] - Retrieved DataFrame columns: ['input', 'output']
2024-05-14T21:48:35.884919+0000 ERROR [job_builder] - 'context.span_id' column missing in parent DataFrame

Commented on Error Logging Evaluations in UI: KeyError in DataF...·Posted inPhoenix Support

1DS19ET063 P.

Thanks for the above. I did not made any changes, i just changed how i was loading my retriever and it worked. i am now getting another issue with get_qa_with_reference(px.Client()) error :

File "/app/llamaas/qa/chatbot_qa/job_builder.py", line 159, in submit
    input_output_df = get_qa_with_reference(px.Client())
  File "/usr/local/lib/python3.9/site-packages/phoenix/trace/dsl/helpers.py", line 87, in get_qa_with_reference
    return pd.concat(
  File "/usr/local/lib/python3.9/site-packages/pandas/core/reshape/concat.py", line 393, in concat
    return op.get_result()
  File "/usr/local/lib/python3.9/site-packages/pandas/core/reshape/concat.py", line 678, in get_result
    indexers[ax] = obj_labels.get_indexer(new_labels)
  File "/usr/local/lib/python3.9/site-packages/pandas/core/indexes/base.py", line 3882, in get_indexer
    raise InvalidIndexError(self._requires_unique_msg)
pandas.errors.InvalidIndexError: Reindexing only valid with uniquely valued Index objects

Commented on Error Logging Evaluations in UI: KeyError in DataF...·Posted inPhoenix Support

1DS19ET063 P.

actually this was working fine 2 weeks ago, but it's failing now with this key error.

Commented on Error Logging Evaluations in UI: KeyError in DataF...·Posted inPhoenix Support

1DS19ET063 P.

and this is how i set the tasks :

def set_tasks(
            self, eval_model=DEFAULT_EVAL_MODEL, correctness=False, hallucination=False, toxicity=False, groundtruth=False, relevance=False, outputtone=False, *args):
        self.eval_model = eval_model
        if correctness:
            correctness_evaluator = util.get_correctness_evaluator(eval_model)
            self.evaluators_qa_with_reference[correctness_evaluator] = "Correctness"
        if hallucination:
            hallucination_evaluator = util.get_hallucination_evaluator(eval_model)
            self.evaluators_qa_with_reference[hallucination_evaluator] = "Hallucination"
        if toxicity:
            toxicity_evaluator = util.get_toxicity_evaluator(eval_model)
            self.evaluators_qa_with_reference[toxicity_evaluator] = "Toxicity"
        if groundtruth:
            human_vs_AI_evaluator = util.get_human_vs_AI_evaluator(eval_model)
            self.evaluators_qa_with_reference[human_vs_AI_evaluator] = "Groundtruth"
        if relevance:
            relevance_evaluator = util.get_relevance_evaluator(eval_model)
            self.evaluators_retrieved_documents[relevance_evaluator] = "Relevance"
        if outputtone:
            self.run_outputtone = True
        return self

Commented on Error Logging Evaluations in UI: KeyError in DataF...·Posted inPhoenix Support

1DS19ET063 P.

Thanks Xander S. I have written the evaluators like this :

def get_correctness_evaluator(model=DEFAULT_EVAL_MODEL, template=default_prompts.QA_PROMPT_TEMPLATE):
    return LLMEvaluator(model, template)


def get_hallucination_evaluator(model=DEFAULT_EVAL_MODEL, template=default_prompts.HALLUCINATION_PROMPT_TEMPLATE):
    return LLMEvaluator(model=DEFAULT_EVAL_MODEL, template=default_prompts.HALLUCINATION_PROMPT_TEMPLATE)


def get_toxicity_evaluator(model=DEFAULT_EVAL_MODEL, template=default_prompts.TOXICITY_PROMPT_TEMPLATE):
    return LLMEvaluator(model=DEFAULT_EVAL_MODEL, template=default_prompts.TOXICITY_PROMPT_TEMPLATE)


def get_relevance_evaluator(model=DEFAULT_EVAL_MODEL, template=default_prompts.RAG_RELEVANCY_PROMPT_TEMPLATE):
    return LLMEvaluator(model=DEFAULT_EVAL_MODEL, template=default_prompts.RAG_RELEVANCY_PROMPT_TEMPLATE)


def get_human_vs_AI_evaluator(model=DEFAULT_EVAL_MODEL, template=default_prompts.HUMAN_VS_AI_PROMPT_TEMPLATE):
    return LLMEvaluator(model=DEFAULT_EVAL_MODEL, template=default_prompts.HUMAN_VS_AI_PROMPT_TEMPLATE)

Posted in Phoenix Support·

1DS19ET063 P.

Error Logging Evaluations in UI: KeyError in DataFrame Processing

Hi team, i am facing this error while logging evaluations on ui. The traces are being sent but the evaluations are not comming. Actually the get_qa_with_reference(px.Client()) returns empty df and get_retrieved_documents(px.Client()) returns none error :

File "/Users/priya/datasci/llm_as_a_service/llamaas/qa/chatbot_qa/job_builder.py", line 188, in submit
    self.dashboard_data[names[index]] = df_result.mean(numeric_only=True)["score"]
  File "/Users/priya/tracenv/lib/python3.9/site-packages/pandas/core/series.py", line 1111, in __getitem__
    return self._get_value(key)
  File "/Users/priya/tracenv/lib/python3.9/site-packages/pandas/core/series.py", line 1227, in _get_value
    loc = self.index.get_loc(label)
  File "/Users/priya/tracenv/lib/python3.9/site-packages/pandas/core/indexes/range.py", line 417, in get_loc
    raise KeyError(key)
KeyError: 'score'

code snippet :

def submit(self):

        os.environ["PHOENIX_PROJECT_NAME"] = self.project_name
        print(f"local server setup")
        session = px.launch_app()
        LangChainInstrumentor().instrument()

        ai_dresult = {}
        question_num = 0
        error_questions = 0
        total_questions = len(self.question_answer_pool)
        failed_questions = []  # To keep track of questions that failed

        for question, human_answer in self.question_answer_pool.items():
            try:
                normalized_question = normalize_spacing(question)
                ai_answer = self.chat_app_run(normalized_question)
                ai_dresult[normalized_question] = ai_answer
                question_num += 1
            except Exception as e:
                error_questions += 1
                failed_questions.append((question, str(e)))  # Store the failed question and the error
                logger.error(f"An error occurred while processing the question '{question}': {e}")
                # Optionally, you can continue to the next iteration if you want to skip failed ones
                continue

        # After processing all questions, logging the failed ones for review
        if failed_questions:
            logger.error(f"Failed questions summary: {len(failed_questions)} errors encountered.")
            for failed_question, error_message in failed_questions:
                logger.error(f"Question: {failed_question} | Error: {error_message}")

        logger.info(f"The evaluation based on dataset:{self.dataset_id}. {question_num} questions to run")
        logger.info(f"The evaluation based on dataset:{self.dataset_id}.")
        logger.info(f"Total questions to be evaluated: {total_questions}")
        logger.info(f"Questions processed successfully: {question_num}")
        logger.info(f"Questions resulted in errors: {error_questions}")
        # log the traces
        logger.info(f"Getting input, output and reference from traces ...")
        input_output_df = get_qa_with_reference(px.Client())
        input_output_df["correct_answer"] = input_output_df["input"].apply(
            lambda x: self.question_answer_pool[x])

        input_output_df["ai_answer"] = input_output_df["input"].apply(
            lambda x: ai_dresult[x])
        retrieved_documents_df = get_retrieved_documents(px.Client())
        print(f"input_output_df: {input_output_df}")
        print(f"retrieved_documents_df: {retrieved_documents_df}")
        if self.evaluators_qa_with_reference:
            evaluations_list = self.evaluators_qa_with_reference.values()
            logger.info(f"Running evaluations: {evaluations_list}")
            df_results = []
            names = []
            for name in self.evaluators_qa_with_reference.values():
                df_results.append(f"result_of_{name}")
                names.append(name)

            df_results = run_evals(
                dataframe=input_output_df,
                evaluators=self.evaluators_qa_with_reference.keys(),
                provide_explanation=True,
            )
            print(f"self evalutaionsndndh: {self.evaluators_qa_with_reference.keys()}")
            logger.info(f"Log evaluation results to UI for: {evaluations_list}")
            for index, df_result in enumerate(df_results):
                px.Client().log_evaluations(
                    SpanEvaluations(eval_name=names[index], dataframe=df_result)
                )
                self.dashboard_data[names[index]] = df_result.mean(numeric_only=True)["score"]

        if self.evaluators_retrieved_documents:
            evaluations_retrived = self.evaluators_retrieved_documents.values()
            logger.info(f"Running evaluations: {evaluations_retrived}")
            relevance_eval_df = run_evals(
                dataframe=retrieved_documents_df,
                evaluators=self.evaluators_retrieved_documents.keys(),
                provide_explanation=True,
            )[0]
            logger.info(f"Log evaluation results to UI: {evaluations_retrived}")
            px.Client().log_evaluations(
                DocumentEvaluations(eval_name="Relevance", dataframe=relevance_eval_df),
            )
            self.dashboard_data["Relevance"] = relevance_eval_df.mean(numeric_only=True)["score"]

        if self.run_outputtone:
            logger.info("Run evaluation: outputtone")
            output_tone_df = run_output_tone_evaluation(input_output_df)
            logger.debug(f"output_tone_df:{output_tone_df}")
            logger.info(f"Log evaluation results to UI: outputtone")
            px.Client().log_evaluations(SpanEvaluations(eval_name="output_tone", dataframe=output_tone_df))

        #  TODO: implement the function
        self.dashboard_data["timestamp"] = datetime.now().timestamp()
        if self.dataset_id > BENCHMARK_DATASET_ID_MIN and self.project_name != PROJECT_NAMES.get('RANDOM'):
            logger.info("Log evaluations scores to Bigquery")
            util.log_scores_to_bigQuery(self.dashboard_data)
        logger.info("The traces and evaluation done")

        time.sleep(self.job_save_seconds_for_local_run)

17Comments