Logging Hallucination and QA Correctness Evaluations to the UI
Hi team, I want to log my hallucination and QA correctness evaluations to the UI, but they are not showing up. Why not? Here's my code:
# Launch the Phoenix app and open its UI view.
px.launch_app().view()

# Export traces to the locally launched collector over the loopback address.
# 0.0.0.0 is a wildcard address meant for *binding* a server; it is not a valid
# connect destination on every platform (e.g. Windows), so use 127.0.0.1 here.
tracer_provider = register(endpoint="http://127.0.0.1:6006/v1/traces")
LlamaIndexInstrumentor().instrument(skip_dep_check=True, tracer_provider=tracer_provider)

# Fetch the spans dataframe and preview the columns of interest.
# NOTE(review): this only contains spans produced by queries that have already
# run with instrumentation enabled -- confirm the LlamaIndex queries happen
# before this point.
spans_df = px.Client().get_spans_dataframe()
spans_df[
    [
        "name",
        "span_kind",
        "attributes.input.value",
        "attributes.retrieval.documents",
    ]
].head()
from phoenix.session.evaluation import get_qa_with_reference, get_retrieved_documents
# Build the client once and reuse it for both exports instead of constructing
# a second, identical client.
phoenix_client = px.Client()

# Dataframe of retrieved documents, and dataframe of questions/answers with
# their reference text, both derived from the traced spans.
retrieved_documents_df = get_retrieved_documents(phoenix_client)
queries_df = get_qa_with_reference(phoenix_client)
import nest_asyncio
# Allow nested use of the asyncio event loop.
# NOTE(review): presumably required because this runs inside a notebook whose
# loop is already running -- confirm.
nest_asyncio.apply()

# A single LLM judges both evaluation criteria.
eval_model = OpenAIModel(model="gpt-4-0125-preview")
hallucination_evaluator = HallucinationEvaluator(eval_model)
qa_correctness_evaluator = QAEvaluator(eval_model)

# run_evals yields one result dataframe per evaluator, in the order given.
rag_evaluators = [hallucination_evaluator, qa_correctness_evaluator]
eval_results = run_evals(
    dataframe=queries_df,
    evaluators=rag_evaluators,
    provide_explanation=True,
)
hallucination_eval_df, qa_correctness_eval_df = eval_results
# Convert each label into a binary score (1 for the desired label, else 0).
# Rows whose label is missing are dropped before the comparison, so after
# index alignment on assignment they receive NaN rather than a misleading 0.
hallucination_labels = hallucination_eval_df["label"].dropna()
hallucination_eval_df["score"] = hallucination_labels.eq("factual").astype(int)

qa_correctness_labels = qa_correctness_eval_df["label"].dropna()
qa_correctness_eval_df["score"] = qa_correctness_labels.eq("correct").astype(int)
# Attach both evaluation result sets to their spans so they can be shown in
# the UI under the given evaluation names.
evaluations_client = px.Client()
span_evaluations = (
    SpanEvaluations(eval_name="Hallucination", dataframe=hallucination_eval_df),
    SpanEvaluations(eval_name="QA Correctness", dataframe=qa_correctness_eval_df),
)
evaluations_client.log_evaluations(*span_evaluations)
