Generate Pandas DataFrame for Project Metrics Evaluation
How do I build a pandas DataFrame that lists each project name together with its metrics at a given k value? For example, I want a DataFrame like this:

| Retrieval Mode | QA Correctness@3 | NDCG@3 | Precision@3 | Hit Rate@3 |  |
| ------------------------------- | ---------------- | ------ | ----------- | ---------- | ----- |
| **Standard: Embedding** | 0.96 | 0.72 | 0.57 | 0.77 | 0.97 |
| **Standard: Hybrid** | 0.92 | 0.71 | 0.57 | 0.79 | 0.93 |
| **Standard: Hybrid + Rerank** | 0.92 | 0.77 | 0.63 | 0.84 | 0.82 |
| **Contextual: Embedding** | 0.95 | 0.77 | 0.60 | 0.81 | 0.90 |
| **Contextual: Hybrid** | 0.94 | 0.72 | 0.60 | 0.79 | 0.88 |
| **Contextual: Hybrid + Rerank** | 0.90 | 0.79 | 0.65 | 0.86 | 0.77 |

Here is my current evaluation setup:
import phoenix as px
from phoenix.evals import (
    HallucinationEvaluator,
    OpenAIModel,
    QAEvaluator,
    RelevanceEvaluator,
    run_evals,
)
from phoenix.session.evaluation import get_qa_with_reference, get_retrieved_documents
from phoenix.trace import DocumentEvaluations, SpanEvaluations
from tqdm import tqdm
# Define the evaluation model
eval_model = OpenAIModel(
    azure_endpoint=AZURE_OPENAI_ENDPOINT,
    azure_deployment="gpt-4o-mini",  # I'm using gpt-4o-mini for evaluation
    model=AZURE_OPENAI_CHAT_COMPLETION_DEPLOYED_MODEL_NAME,
    api_key=AZURE_OPENAI_API_KEY,
    api_version="2024-06-01",
)
# Define evaluators
hallucination_evaluator = HallucinationEvaluator(eval_model)
qa_correctness_evaluator = QAEvaluator(eval_model)
relevance_evaluator = RelevanceEvaluator(eval_model)
# List of project names corresponding to each query engine
projects = [
    "Standard: Embedding",
    "Standard: Hybrid",
    "Standard: Hybrid + Rerank",
    "Contextual: Embedding",
    "Contextual: Hybrid",
    "Contextual: Hybrid + Rerank",
]
# Loop through each project and perform evaluations
for project in projects:
    # Create queries and retrieved documents DataFrames for the project
    queries_df = get_qa_with_reference(px.Client(), project_name=project)
    retrieved_documents_df = get_retrieved_documents(px.Client(), project_name=project)

    # Run evaluations
    hallucination_eval_df, qa_correctness_eval_df = run_evals(
        dataframe=queries_df,
        evaluators=[hallucination_evaluator, qa_correctness_evaluator],
        provide_explanation=True,
    )
    relevance_eval_df = run_evals(
        dataframe=retrieved_documents_df,
        evaluators=[relevance_evaluator],
        provide_explanation=True,
    )[0]

    # Log evaluations
    px.Client().log_evaluations(
        SpanEvaluations(eval_name=f"Hallucination_{project}", dataframe=hallucination_eval_df),
        SpanEvaluations(eval_name=f"QA Correctness_{project}", dataframe=qa_correctness_eval_df),
        DocumentEvaluations(eval_name=f"Relevance_{project}", dataframe=relevance_eval_df),
    )
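To make the question concrete, here is a minimal sketch of the kind of aggregation I'm after, written in plain pandas/numpy. It assumes the per-project eval DataFrames from the loop above have been kept in dicts (the names `qa_correctness_evals` and `relevance_evals` are hypothetical), that the DataFrames returned by `run_evals` carry a numeric `score` column (1 = correct/relevant, 0 = not), and that the relevance results are indexed by query span with documents in retrieval order; the column and index names may need adjusting to the actual output.

```python
import numpy as np
import pandas as pd

K = 3  # cutoff for the @k metrics

def ranking_metrics_at_k(relevance_df: pd.DataFrame, k: int) -> dict:
    """NDCG@k, Precision@k, and Hit Rate@k from per-document relevance scores.

    Assumes `relevance_df` is indexed by query span (level 0), lists documents
    in retrieval order, and has a numeric `score` column (1 = relevant).
    """
    ndcgs, precisions, hit_rates = [], [], []
    for _, group in relevance_df.groupby(level=0, sort=False):
        rels = group["score"].fillna(0).to_numpy()[:k]
        # DCG / ideal DCG with log2 position discounting
        discounts = 1.0 / np.log2(np.arange(2, len(rels) + 2))
        dcg = float((rels * discounts).sum())
        idcg = float((np.sort(rels)[::-1] * discounts).sum())
        ndcgs.append(dcg / idcg if idcg > 0 else 0.0)
        precisions.append(rels.sum() / k)
        hit_rates.append(float(rels.max() > 0) if len(rels) else 0.0)
    return {
        f"NDCG@{k}": np.mean(ndcgs),
        f"Precision@{k}": np.mean(precisions),
        f"Hit Rate@{k}": np.mean(hit_rates),
    }

# qa_correctness_evals / relevance_evals are hypothetical dicts of the
# per-project eval DataFrames collected inside the loop above, e.g.
# qa_correctness_evals[project] = qa_correctness_eval_df
rows = []
for project in projects:
    rows.append({
        "Retrieval Mode": project,
        f"QA Correctness@{K}": qa_correctness_evals[project]["score"].mean(),
        **ranking_metrics_at_k(relevance_evals[project], K),
    })

summary_df = pd.DataFrame(rows).set_index("Retrieval Mode").round(2)
print(summary_df.to_markdown())  # to_markdown needs `tabulate`; print(summary_df) also works
```

A hallucination column could presumably be added the same way by averaging the `score` column of each project's hallucination eval DataFrame, but I'd like to know if there is a more direct way to pull these per-project metrics out of Phoenix.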