Thanks Roger Y. it is working now. but I wonder that earlier i was using get_qa_with_reference() and it was working fine, and suddenly from monday it started throwing invalid index error 馃
i have done few modifications :
def submit(self):
if self.server.server_identifier == "127.0.0.1":
os.environ["PHOENIX_PROJECT_NAME"] = self.project_name
print(f"local server setup")
session = px.launch_app()
LangChainInstrumentor().instrument()
else:
print("Setting up for remote server...")
os.environ["PHOENIX_PROJECT_NAME"] = self.project_name
os.environ["PHOENIX_COLLECTOR_ENDPOINT"]
print(f"PHOENIX_COLLECTOR_ENDPOINT is now set to: {os.environ.get('PHOENIX_COLLECTOR_ENDPOINT')}")
LangChainInstrumentor().instrument()
ai_dresult = {}
question_num = 0
error_questions = 0
total_questions = len(self.question_answer_pool)
failed_questions = [] # To keep track of questions that failed
for question, human_answer in self.question_answer_pool.items():
try:
normalized_question = normalize_spacing(question)
ai_answer = self.chat_app_run(normalized_question)
ai_dresult[normalized_question] = ai_answer
question_num += 1
except Exception as e:
error_questions += 1
failed_questions.append((question, str(e))) # Store the failed question and the error
logger.error(f"An error occurred while processing the question '{question}': {e}")
# Optionally, you can continue to the next iteration if you want to skip failed ones
continue
# After processing all questions, logging the failed ones for review
if failed_questions:
logger.error(f"Failed questions summary: {len(failed_questions)} errors encountered.")
for failed_question, error_message in failed_questions:
logger.error(f"Question: {failed_question} | Error: {error_message}")
logger.info(f"The evaluation based on dataset:{self.dataset_id}. {question_num} questions to run")
logger.info(f"The evaluation based on dataset:{self.dataset_id}.")
logger.info(f"Total questions to be evaluated: {total_questions}")
logger.info(f"Questions processed successfully: {question_num}")
logger.info(f"Questions resulted in errors: {error_questions}")
# log the traces
logger.info(f"Getting input, output and reference from traces ...")
try:
# Retrieve parent spans
df_parent = px.Client().query_spans(
SpanQuery()
.select(
"span_id",
input="input.value",
output="output.value",
)
.with_index('trace_id')
.where("parent_id is None"),
)
df_parent = df_parent.dropna()
logger.info(f"Retrieved data of df_parent: {df_parent}")
logger.info(f"DataFrame index name of df_parent: {df_parent.index.name}")
logger.info(f"Parent DataFrame columns: {df_parent.columns}")
# Retrieve child spans
df_child = px.Client().query_spans(
SpanQuery()
.where("span_kind == 'RETRIEVER'")
.with_index('trace_id')
.concat(
"retrieval.documents",
reference="document.content",
),
)
logger.info(f"Child DataFrame initial data: {df_child}")
df_child = pd.DataFrame({"reference": df_child.groupby("context.trace_id")
['reference'].apply(lambda x: '\n\n'.join(x))})
logger.info(f"Child DataFrame columns: {df_child.columns}")
df_final = pd.concat([df_parent, df_child], axis=1, join='inner').set_index("context.span_id")
logger.info(f"Final DataFrame: {df_final}")
# Merge parent and child dataframes if necessary
input_output_df = pd.merge(df_parent, df_child, on='context.span_id', how='left')
input_output_df.drop_duplicates(subset=['input'], inplace=True) # Ensuring no duplicates by input column
input_output_df["correct_answer"] = input_output_df["input"].apply(
lambda x: self.question_answer_pool.get(x, "Unknown"))
input_output_df["ai_answer"] = input_output_df["input"].apply(
lambda x: ai_dresult.get(x, "No AI Response"))
except Exception as e:
logger.error(f"An error occurred while retrieving or processing data: {e}")
return
# retrieved_documents_df = get_retrieved_documents(px.Client())
if self.evaluators_qa_with_reference:
evaluations_list = self.evaluators_qa_with_reference.values()
logger.info(f"Running evaluations: {evaluations_list}")
df_results = []
names = []
for name in self.evaluators_qa_with_reference.values():
df_results.append(f"result_of_{name}")
names.append(name)
df_results = run_evals(
dataframe=input_output_df,
evaluators=self.evaluators_qa_with_reference.keys(),
provide_explanation=True,
)
logger.info(f"Log evaluation results to UI for: {evaluations_list}")
if 'span_id' not in input_output_df.columns:
logger.error("span_id column missing in DataFrame")
return
# Set the DataFrame index to span_id for evaluation logging
input_output_df.set_index('span_id', inplace=True)
# Logging the DataFrame with the correct index
for index, df_result in enumerate(df_results):
if not df_result.empty:
df_result.set_index('span_id', inplace=True) # Ensure the correct index
px.Client().log_evaluations(
SpanEvaluations(eval_name=names[index], dataframe=df_result)
)
self.dashboard_data[names[index]] = df_result.mean(numeric_only=True)["score"]
else:
logger.error("Evaluation DataFrame is empty")
# for index, df_result in enumerate(df_results):
# px.Client().log_evaluations(
# SpanEvaluations(eval_name=names[index], dataframe=df_result)
# )
# self.dashboard_data[names[index]] = df_result.mean(numeric_only=True)["score"]
retrieved_documents_df = get_retrieved_documents(px.Client())
if 'span_id' in retrieved_documents_df.columns:
retrieved_documents_df.set_index('span_id', inplace=True)
retrieved_documents_eval = retrieved_documents_df.copy() # Assuming this is the correct DataFrame
# Calculating relevance score
retrieved_documents_eval["score"] = (
retrieved_documents_eval['label'][~retrieved_documents_eval['label'].isna()] == "relevant"
).astype(int)
# Log retrieval evaluations
px.Client().log_evaluations(DocumentEvaluations(eval_name="Relevance", dataframe=retrieved_documents_eval))
self.dashboard_data["Relevance"] = relevance_eval_df.mean(numeric_only=True)["score"]
else:
logger.error("span_id column missing in retrieved documents DataFrame")
if self.evaluators_retrieved_documents:
evaluations_retrived = self.evaluators_retrieved_documents.values()
logger.info(f"Running evaluations: {evaluations_retrived}")
relevance_eval_df = run_evals(
dataframe=retrieved_documents_df,
evaluators=self.evaluators_retrieved_documents.keys(),
provide_explanation=True,
)[0]
logger.info(f"Log evaluation results to UI: {evaluations_retrived}")
px.Client().log_evaluations(
DocumentEvaluations(eval_name="Relevance", dataframe=relevance_eval_df),
)
self.dashboard_data["Relevance"] = relevance_eval_df.mean(numeric_only=True)["score"]
if self.run_outputtone:
logger.info("Run evaluation: outputtone")
output_tone_df = run_output_tone_evaluation(input_output_df)
logger.debug(f"output_tone_df:{output_tone_df}")
logger.info(f"Log evaluation results to UI: outputtone")
px.Client().log_evaluations(SpanEvaluations(eval_name="output_tone", dataframe=output_tone_df))
# TODO: implement the function
self.dashboard_data["timestamp"] = datetime.now().timestamp()
if self.dataset_id > BENCHMARK_DATASET_ID_MIN and self.project_name != PROJECT_NAMES.get('RANDOM'):
logger.info("Log evaluations scores to Bigquery")
util.log_scores_to_bigQuery(self.dashboard_data)
logger.info("The traces and evaluation done")
time.sleep(self.job_save_seconds_for_local_run)logs :
Retrieved data of df_parent:
context.trace_id ...
a4ccddfad55c01c696403ee0b75736c4 8b9e86c033a458a0 ... {"documents": ["page_content=\"Title: SMUI/SMS...
c9ee0cbd25b5a8ea4f65cbaebff21a08 445a8964f909737a ... {"generations": [[{"text": "\"LLM\" can refer ...
2c35eed21f209d12a714ff1edcb8bcb9 eddd09b2ac1ca77d ... {"confidence_score": 0.0, "result": "I don't k...
347bc66c697a8d6bd61ec625cd1106ec 98000343032d8284 ... {"documents": ["page_content='Title: 2018\\n\\...
88e320c1730ecafa80a4f6489669a977 0d0514a0ca01a2b3 ... {"generations": [[{"text": "As of my last upda...
2024-05-14T23:26:55.478861+0000 INFO [job_builder] - DataFrame index name of df_parent: context.trace_id
2024-05-14T23:26:55.479139+0000 INFO [job_builder] - Parent DataFrame columns: Index(['context.span_id', 'input', 'output'], dtype='object')
2024-05-14T23:26:55.518683+0000 INFO [job_builder] - Child DataFrame initial data: reference
context.trace_id
6e3a51ba94f4315fada1733c7231a1a2 Title: 2018\n\n\n\n11\n\n\n\n29 Meeting notes\...
a4ccddfad55c01c696403ee0b75736c4 Title: SMUI/SMSM Specific Guides\n\nWeblog lag...
347bc66c697a8d6bd61ec625cd1106ec Title: 2018\n\n\n\n11\n\n\n\n29 Meeting notes\...
2024-05-14T23:26:55.521006+0000 INFO [job_builder] - Child DataFrame columns: Index(['reference'], dtype='object')
2024-05-14T23:26:55.528978+0000 INFO [job_builder] - Final DataFrame: input ... reference
context.span_id
71c8d750b06873c9 What is LLM? ... Title: SMUI/SMSM Specific Guides\n\nWeblog lag...
47e6f936e212787c Who is USA president ... Title: 2018\n\n\n\n11\n\n\n\n29 Meeting notes\...
8b9e86c033a458a0 What is LLM? ... Title: SMUI/SMSM Specific Guides\n\nWeblog lag...
98000343032d8284 Who is USA president ... Title: 2018\n\n\n\n11\n\n\n\n29 Meeting notes\...
[30 rows x 3 columns]
2024-05-14T23:26:55.529319+0000 ERROR [job_builder] - An error occurred while retrieving or processing data: 'context.span_id'Thanks for sharing this Roger Y. but i think i am missing out something, because even after making those changes i am getting the following error, below i have attached the error and the code.
def submit(self):
if self.server.server_identifier == "127.0.0.1":
os.environ["PHOENIX_PROJECT_NAME"] = self.project_name
print(f"local server setup")
session = px.launch_app()
LangChainInstrumentor().instrument()
else:
print("Setting up for remote server...")
os.environ["PHOENIX_PROJECT_NAME"] = self.project_name
os.environ["PHOENIX_COLLECTOR_ENDPOINT"]
print(f"PHOENIX_COLLECTOR_ENDPOINT is now set to: {os.environ.get('PHOENIX_COLLECTOR_ENDPOINT')}")
LangChainInstrumentor().instrument()
ai_dresult = {}
question_num = 0
error_questions = 0
total_questions = len(self.question_answer_pool)
failed_questions = [] # To keep track of questions that failed
for question, human_answer in self.question_answer_pool.items():
try:
normalized_question = normalize_spacing(question)
ai_answer = self.chat_app_run(normalized_question)
ai_dresult[normalized_question] = ai_answer
question_num += 1
except Exception as e:
error_questions += 1
failed_questions.append((question, str(e))) # Store the failed question and the error
logger.error(f"An error occurred while processing the question '{question}': {e}")
# Optionally, you can continue to the next iteration if you want to skip failed ones
continue
# After processing all questions, logging the failed ones for review
if failed_questions:
logger.error(f"Failed questions summary: {len(failed_questions)} errors encountered.")
for failed_question, error_message in failed_questions:
logger.error(f"Question: {failed_question} | Error: {error_message}")
logger.info(f"The evaluation based on dataset:{self.dataset_id}. {question_num} questions to run")
logger.info(f"The evaluation based on dataset:{self.dataset_id}.")
logger.info(f"Total questions to be evaluated: {total_questions}")
logger.info(f"Questions processed successfully: {question_num}")
logger.info(f"Questions resulted in errors: {error_questions}")
# log the traces
logger.info(f"Getting input, output and reference from traces ...")
try:
# Retrieve parent spans
df_parent = px.Client().query_spans(
SpanQuery()
.select(
span_id="context.span_id",
input="input.value",
output="output.value",
)
.with_index('span_id')
.where("parent_id is None"),
)
df_parent = df_parent.dropna()
logger.info(f"Retrieved DataFrame columns: {df_parent.columns.tolist()}")
if 'context.span_id' not in df_parent.columns:
logger.error("'context.span_id' column missing in parent DataFrame")
return
# Retrieve child spans
df_child = px.Client().query_spans(
SpanQuery()
.where("span_kind == 'RETRIEVER'")
.with_index('span_id')
.select(
parent_id="parent_id", # Explicitly retrieve 'parent_id'
span_id="parent_id"
)
.concat(
"retrieval.documents",
reference="document.content",
),
)
logger.info(f"Child DataFrame initial data: {df_child}")
if 'context.span_id' not in df_child.columns:
logger.error("'context.span_id' column missing in child DataFrame")
return
df_child = pd.DataFrame({"reference": df_child.groupby("context.span_id")
['reference'].apply(lambda x: '\n\n'.join(x))})
df_final = pd.concat([df_parent, df_child], axis=1, join='inner').set_index("context.span_id")
logger.info(f"Final DataFrame: {df_final}")
logger.info(f"Child DataFrame Columns: {df_child.columns.tolist()}")
# Check if parent_id is in df_child before merge
if 'parent_id' not in df_child.columns:
logger.error("parent_id column missing in child DataFrame")
return
# Merge parent and child dataframes if necessary
input_output_df = pd.merge(df_parent, df_child, how='left', left_index=True, right_on='parent_id')
input_output_df.drop_duplicates(subset=['input'], inplace=True) # Ensuring no duplicates by input column
input_output_df["correct_answer"] = input_output_df["input"].apply(
lambda x: self.question_answer_pool.get(x, "Unknown"))
input_output_df["ai_answer"] = input_output_df["input"].apply(
lambda x: ai_dresult.get(x, "No AI Response"))
except Exception as e:
logger.error(f"An error occurred while retrieving or processing data: {e}")
return
# retrieved_documents_df = get_retrieved_documents(px.Client())
if self.evaluators_qa_with_reference:
evaluations_list = self.evaluators_qa_with_reference.values()
logger.info(f"Running evaluations: {evaluations_list}")
df_results = []
names = []
for name in self.evaluators_qa_with_reference.values():
df_results.append(f"result_of_{name}")
names.append(name)
df_results = run_evals(
dataframe=input_output_df,
evaluators=self.evaluators_qa_with_reference.keys(),
provide_explanation=True,
)
logger.info(f"Log evaluation results to UI for: {evaluations_list}")
if 'context.span_id' not in input_output_df.columns:
logger.error("context.span_id column missing in DataFrame")
return
# Set the DataFrame index to context.span_id for evaluation logging
input_output_df.set_index('context.span_id', inplace=True)
# Logging the DataFrame with the correct index
for index, df_result in enumerate(df_results):
if not df_result.empty:
df_result.set_index('context.span_id', inplace=True) # Ensure the correct index
px.Client().log_evaluations(
SpanEvaluations(eval_name=names[index], dataframe=df_result)
)
self.dashboard_data[names[index]] = df_result.mean(numeric_only=True)["score"]
else:
logger.error("Evaluation DataFrame is empty")
# for index, df_result in enumerate(df_results):
# px.Client().log_evaluations(
# SpanEvaluations(eval_name=names[index], dataframe=df_result)
# )
# self.dashboard_data[names[index]] = df_result.mean(numeric_only=True)["score"]
retrieved_documents_df = get_retrieved_documents(px.Client())
if 'context.span_id' in retrieved_documents_df.columns:
retrieved_documents_df.set_index('context.span_id', inplace=True)
retrieved_documents_eval = retrieved_documents_df.copy() # Assuming this is the correct DataFrame
# Calculating relevance score
retrieved_documents_eval["score"] = (
retrieved_documents_eval['label'][~retrieved_documents_eval['label'].isna()] == "relevant"
).astype(int)
# Log retrieval evaluations
px.Client().log_evaluations(DocumentEvaluations(eval_name="Relevance", dataframe=retrieved_documents_eval))
self.dashboard_data["Relevance"] = relevance_eval_df.mean(numeric_only=True)["score"]
else:
logger.error("context.span_id column missing in retrieved documents DataFrame")
if self.evaluators_retrieved_documents:
evaluations_retrived = self.evaluators_retrieved_documents.values()
logger.info(f"Running evaluations: {evaluations_retrived}")
relevance_eval_df = run_evals(
dataframe=retrieved_documents_df,
evaluators=self.evaluators_retrieved_documents.keys(),
provide_explanation=True,
)[0]
logger.info(f"Log evaluation results to UI: {evaluations_retrived}")
px.Client().log_evaluations(
DocumentEvaluations(eval_name="Relevance", dataframe=relevance_eval_df),
)
self.dashboard_data["Relevance"] = relevance_eval_df.mean(numeric_only=True)["score"]
if self.run_outputtone:
logger.info("Run evaluation: outputtone")
output_tone_df = run_output_tone_evaluation(input_output_df)
logger.debug(f"output_tone_df:{output_tone_df}")
logger.info(f"Log evaluation results to UI: outputtone")
px.Client().log_evaluations(SpanEvaluations(eval_name="output_tone", dataframe=output_tone_df))
# TODO: implement the function
self.dashboard_data["timestamp"] = datetime.now().timestamp()
if self.dataset_id > BENCHMARK_DATASET_ID_MIN and self.project_name != PROJECT_NAMES.get('RANDOM'):
logger.info("Log evaluations scores to Bigquery")
util.log_scores_to_bigQuery(self.dashboard_data)
logger.info("The traces and evaluation done")
time.sleep(self.job_save_seconds_for_local_run)error :
2024-05-14T21:48:35.789032+0000 INFO [job_builder] - The evaluation based on dataset:100001. 2 questions to run
2024-05-14T21:48:35.789712+0000 INFO [job_builder] - The evaluation based on dataset:100001.
2024-05-14T21:48:35.789990+0000 INFO [job_builder] - Total questions to be evaluated: 2
2024-05-14T21:48:35.790171+0000 INFO [job_builder] - Questions processed successfully: 2
2024-05-14T21:48:35.790293+0000 INFO [job_builder] - Questions resulted in errors: 0
2024-05-14T21:48:35.790399+0000 INFO [job_builder] - Getting input, output and reference from traces ...
2024-05-14T21:48:35.834075+0000 INFO [job_builder] - Retrieved DataFrame columns: ['input', 'output']
2024-05-14T21:48:35.884919+0000 ERROR [job_builder] - 'context.span_id' column missing in parent DataFrameThanks for the above. I did not made any changes, i just changed how i was loading my retriever and it worked. i am now getting another issue with get_qa_with_reference(px.Client()) error :
File "/app/llamaas/qa/chatbot_qa/job_builder.py", line 159, in submit
input_output_df = get_qa_with_reference(px.Client())
File "/usr/local/lib/python3.9/site-packages/phoenix/trace/dsl/helpers.py", line 87, in get_qa_with_reference
return pd.concat(
File "/usr/local/lib/python3.9/site-packages/pandas/core/reshape/concat.py", line 393, in concat
return op.get_result()
File "/usr/local/lib/python3.9/site-packages/pandas/core/reshape/concat.py", line 678, in get_result
indexers[ax] = obj_labels.get_indexer(new_labels)
File "/usr/local/lib/python3.9/site-packages/pandas/core/indexes/base.py", line 3882, in get_indexer
raise InvalidIndexError(self._requires_unique_msg)
pandas.errors.InvalidIndexError: Reindexing only valid with uniquely valued Index objectsactually this was working fine 2 weeks ago, but it's failing now with this key error.
and this is how i set the tasks :
def set_tasks(
self, eval_model=DEFAULT_EVAL_MODEL, correctness=False, hallucination=False, toxicity=False, groundtruth=False, relevance=False, outputtone=False, *args):
self.eval_model = eval_model
if correctness:
correctness_evaluator = util.get_correctness_evaluator(eval_model)
self.evaluators_qa_with_reference[correctness_evaluator] = "Correctness"
if hallucination:
hallucination_evaluator = util.get_hallucination_evaluator(eval_model)
self.evaluators_qa_with_reference[hallucination_evaluator] = "Hallucination"
if toxicity:
toxicity_evaluator = util.get_toxicity_evaluator(eval_model)
self.evaluators_qa_with_reference[toxicity_evaluator] = "Toxicity"
if groundtruth:
human_vs_AI_evaluator = util.get_human_vs_AI_evaluator(eval_model)
self.evaluators_qa_with_reference[human_vs_AI_evaluator] = "Groundtruth"
if relevance:
relevance_evaluator = util.get_relevance_evaluator(eval_model)
self.evaluators_retrieved_documents[relevance_evaluator] = "Relevance"
if outputtone:
self.run_outputtone = True
return selfThanks Xander S. I have written the evaluators like this :
def get_correctness_evaluator(model=DEFAULT_EVAL_MODEL, template=default_prompts.QA_PROMPT_TEMPLATE):
return LLMEvaluator(model, template)
def get_hallucination_evaluator(model=DEFAULT_EVAL_MODEL, template=default_prompts.HALLUCINATION_PROMPT_TEMPLATE):
return LLMEvaluator(model=DEFAULT_EVAL_MODEL, template=default_prompts.HALLUCINATION_PROMPT_TEMPLATE)
def get_toxicity_evaluator(model=DEFAULT_EVAL_MODEL, template=default_prompts.TOXICITY_PROMPT_TEMPLATE):
return LLMEvaluator(model=DEFAULT_EVAL_MODEL, template=default_prompts.TOXICITY_PROMPT_TEMPLATE)
def get_relevance_evaluator(model=DEFAULT_EVAL_MODEL, template=default_prompts.RAG_RELEVANCY_PROMPT_TEMPLATE):
return LLMEvaluator(model=DEFAULT_EVAL_MODEL, template=default_prompts.RAG_RELEVANCY_PROMPT_TEMPLATE)
def get_human_vs_AI_evaluator(model=DEFAULT_EVAL_MODEL, template=default_prompts.HUMAN_VS_AI_PROMPT_TEMPLATE):
return LLMEvaluator(model=DEFAULT_EVAL_MODEL, template=default_prompts.HUMAN_VS_AI_PROMPT_TEMPLATE)Hi team, i am facing this error while logging evaluations on ui. The traces are being sent but the evaluations are not comming. Actually the get_qa_with_reference(px.Client()) returns empty df and get_retrieved_documents(px.Client()) returns none error :
File "/Users/priya/datasci/llm_as_a_service/llamaas/qa/chatbot_qa/job_builder.py", line 188, in submit
self.dashboard_data[names[index]] = df_result.mean(numeric_only=True)["score"]
File "/Users/priya/tracenv/lib/python3.9/site-packages/pandas/core/series.py", line 1111, in __getitem__
return self._get_value(key)
File "/Users/priya/tracenv/lib/python3.9/site-packages/pandas/core/series.py", line 1227, in _get_value
loc = self.index.get_loc(label)
File "/Users/priya/tracenv/lib/python3.9/site-packages/pandas/core/indexes/range.py", line 417, in get_loc
raise KeyError(key)
KeyError: 'score'code snippet :
def submit(self):
os.environ["PHOENIX_PROJECT_NAME"] = self.project_name
print(f"local server setup")
session = px.launch_app()
LangChainInstrumentor().instrument()
ai_dresult = {}
question_num = 0
error_questions = 0
total_questions = len(self.question_answer_pool)
failed_questions = [] # To keep track of questions that failed
for question, human_answer in self.question_answer_pool.items():
try:
normalized_question = normalize_spacing(question)
ai_answer = self.chat_app_run(normalized_question)
ai_dresult[normalized_question] = ai_answer
question_num += 1
except Exception as e:
error_questions += 1
failed_questions.append((question, str(e))) # Store the failed question and the error
logger.error(f"An error occurred while processing the question '{question}': {e}")
# Optionally, you can continue to the next iteration if you want to skip failed ones
continue
# After processing all questions, logging the failed ones for review
if failed_questions:
logger.error(f"Failed questions summary: {len(failed_questions)} errors encountered.")
for failed_question, error_message in failed_questions:
logger.error(f"Question: {failed_question} | Error: {error_message}")
logger.info(f"The evaluation based on dataset:{self.dataset_id}. {question_num} questions to run")
logger.info(f"The evaluation based on dataset:{self.dataset_id}.")
logger.info(f"Total questions to be evaluated: {total_questions}")
logger.info(f"Questions processed successfully: {question_num}")
logger.info(f"Questions resulted in errors: {error_questions}")
# log the traces
logger.info(f"Getting input, output and reference from traces ...")
input_output_df = get_qa_with_reference(px.Client())
input_output_df["correct_answer"] = input_output_df["input"].apply(
lambda x: self.question_answer_pool[x])
input_output_df["ai_answer"] = input_output_df["input"].apply(
lambda x: ai_dresult[x])
retrieved_documents_df = get_retrieved_documents(px.Client())
print(f"input_output_df: {input_output_df}")
print(f"retrieved_documents_df: {retrieved_documents_df}")
if self.evaluators_qa_with_reference:
evaluations_list = self.evaluators_qa_with_reference.values()
logger.info(f"Running evaluations: {evaluations_list}")
df_results = []
names = []
for name in self.evaluators_qa_with_reference.values():
df_results.append(f"result_of_{name}")
names.append(name)
df_results = run_evals(
dataframe=input_output_df,
evaluators=self.evaluators_qa_with_reference.keys(),
provide_explanation=True,
)
print(f"self evalutaionsndndh: {self.evaluators_qa_with_reference.keys()}")
logger.info(f"Log evaluation results to UI for: {evaluations_list}")
for index, df_result in enumerate(df_results):
px.Client().log_evaluations(
SpanEvaluations(eval_name=names[index], dataframe=df_result)
)
self.dashboard_data[names[index]] = df_result.mean(numeric_only=True)["score"]
if self.evaluators_retrieved_documents:
evaluations_retrived = self.evaluators_retrieved_documents.values()
logger.info(f"Running evaluations: {evaluations_retrived}")
relevance_eval_df = run_evals(
dataframe=retrieved_documents_df,
evaluators=self.evaluators_retrieved_documents.keys(),
provide_explanation=True,
)[0]
logger.info(f"Log evaluation results to UI: {evaluations_retrived}")
px.Client().log_evaluations(
DocumentEvaluations(eval_name="Relevance", dataframe=relevance_eval_df),
)
self.dashboard_data["Relevance"] = relevance_eval_df.mean(numeric_only=True)["score"]
if self.run_outputtone:
logger.info("Run evaluation: outputtone")
output_tone_df = run_output_tone_evaluation(input_output_df)
logger.debug(f"output_tone_df:{output_tone_df}")
logger.info(f"Log evaluation results to UI: outputtone")
px.Client().log_evaluations(SpanEvaluations(eval_name="output_tone", dataframe=output_tone_df))
# TODO: implement the function
self.dashboard_data["timestamp"] = datetime.now().timestamp()
if self.dataset_id > BENCHMARK_DATASET_ID_MIN and self.project_name != PROJECT_NAMES.get('RANDOM'):
logger.info("Log evaluations scores to Bigquery")
util.log_scores_to_bigQuery(self.dashboard_data)
logger.info("The traces and evaluation done")
time.sleep(self.job_save_seconds_for_local_run)