From f692def738a349d36e913ee7cd5272e56f5b8830 Mon Sep 17 00:00:00 2001 From: jyong <718720800@qq.com> Date: Tue, 17 Mar 2026 15:26:39 +0800 Subject: [PATCH] evaluation runtime --- .../evaluation/entities/evaluation_entity.py | 50 ++++++++++ .../frameworks/deepeval/deepeval_evaluator.py | 93 +++++++++++-------- .../frameworks/ragas/ragas_evaluator.py | 58 +++++++----- .../runners/agent_evaluation_runner.py | 65 ++++++------- .../runners/retrieval_evaluation_runner.py | 5 +- .../runners/snippet_evaluation_runner.py | 65 ++++++------- .../runners/workflow_evaluation_runner.py | 65 ++++++------- api/models/evaluation.py | 1 + api/tasks/evaluation_task.py | 38 +++++--- 9 files changed, 258 insertions(+), 182 deletions(-) diff --git a/api/core/evaluation/entities/evaluation_entity.py b/api/core/evaluation/entities/evaluation_entity.py index 8d065888a3..13cfa70ae4 100644 --- a/api/core/evaluation/entities/evaluation_entity.py +++ b/api/core/evaluation/entities/evaluation_entity.py @@ -15,6 +15,56 @@ class EvaluationCategory(StrEnum): RETRIEVAL_TEST = "retrieval_test" +class EvaluationMetricName(StrEnum): + """Canonical metric names shared across all evaluation frameworks. + + Each framework maps these names to its own internal implementation. + A framework that does not support a given metric should log a warning + and skip it rather than raising an error. + """ + + # LLM / general text-quality metrics + FAITHFULNESS = "faithfulness" + ANSWER_RELEVANCY = "answer_relevancy" + ANSWER_CORRECTNESS = "answer_correctness" + SEMANTIC_SIMILARITY = "semantic_similarity" + + # Retrieval-quality metrics + CONTEXT_PRECISION = "context_precision" + CONTEXT_RECALL = "context_recall" + CONTEXT_RELEVANCE = "context_relevance" + + # Agent-quality metrics + TOOL_CORRECTNESS = "tool_correctness" + TASK_COMPLETION = "task_completion" + + +# Per-category canonical metric lists used by get_supported_metrics(). 
+LLM_METRIC_NAMES: list[EvaluationMetricName] = [ + EvaluationMetricName.FAITHFULNESS, + EvaluationMetricName.ANSWER_RELEVANCY, + EvaluationMetricName.ANSWER_CORRECTNESS, + EvaluationMetricName.SEMANTIC_SIMILARITY, +] + +RETRIEVAL_METRIC_NAMES: list[EvaluationMetricName] = [ + EvaluationMetricName.CONTEXT_PRECISION, + EvaluationMetricName.CONTEXT_RECALL, + EvaluationMetricName.CONTEXT_RELEVANCE, +] + +AGENT_METRIC_NAMES: list[EvaluationMetricName] = [ + EvaluationMetricName.TOOL_CORRECTNESS, + EvaluationMetricName.TASK_COMPLETION, +] + +WORKFLOW_METRIC_NAMES: list[EvaluationMetricName] = [ + EvaluationMetricName.FAITHFULNESS, + EvaluationMetricName.ANSWER_RELEVANCY, + EvaluationMetricName.ANSWER_CORRECTNESS, +] + + class EvaluationMetric(BaseModel): name: str value: Any diff --git a/api/core/evaluation/frameworks/deepeval/deepeval_evaluator.py b/api/core/evaluation/frameworks/deepeval/deepeval_evaluator.py index 3ea6739bdc..3893f81061 100644 --- a/api/core/evaluation/frameworks/deepeval/deepeval_evaluator.py +++ b/api/core/evaluation/frameworks/deepeval/deepeval_evaluator.py @@ -4,30 +4,39 @@ from typing import Any from core.evaluation.base_evaluation_instance import BaseEvaluationInstance from core.evaluation.entities.config_entity import DeepEvalConfig from core.evaluation.entities.evaluation_entity import ( + AGENT_METRIC_NAMES, + LLM_METRIC_NAMES, + RETRIEVAL_METRIC_NAMES, + WORKFLOW_METRIC_NAMES, EvaluationCategory, EvaluationItemInput, EvaluationItemResult, EvaluationMetric, + EvaluationMetricName, ) from core.evaluation.frameworks.ragas.ragas_model_wrapper import DifyModelWrapper logger = logging.getLogger(__name__) -# Metric name mappings per category -# +# Maps canonical EvaluationMetricName to the corresponding deepeval metric class name. # deepeval metric field requirements (LLMTestCase fields): -# - faithfulness: input, actual_output, retrieval_context -# - answer_relevancy: input, actual_output -# - contextual_precision: input, actual_output, expected_output, retrieval_context -# - contextual_recall: input, actual_output, expected_output, retrieval_context -# - contextual_relevancy: input, actual_output, retrieval_context -# - hallucination: input, actual_output, context -# - tool_correctness: input, actual_output, expected_tools -# - task_completion: input, actual_output -LLM_METRICS = ["faithfulness", "answer_relevancy"] -RETRIEVAL_METRICS = ["contextual_precision", "contextual_recall", "contextual_relevancy"] -AGENT_METRICS = ["tool_correctness", "task_completion"] -WORKFLOW_METRICS = ["faithfulness", "answer_relevancy"] +# - faithfulness: input, actual_output, retrieval_context +# - answer_relevancy: input, actual_output +# - context_precision: input, actual_output, expected_output, retrieval_context +# - context_recall: input, actual_output, expected_output, retrieval_context +# - context_relevance: input, actual_output, retrieval_context +# - tool_correctness: input, actual_output, expected_tools +# - task_completion: input, actual_output +# Metrics not listed here are unsupported by deepeval and will be skipped. 
+_DEEPEVAL_METRIC_MAP: dict[EvaluationMetricName, str] = { + EvaluationMetricName.FAITHFULNESS: "FaithfulnessMetric", + EvaluationMetricName.ANSWER_RELEVANCY: "AnswerRelevancyMetric", + EvaluationMetricName.CONTEXT_PRECISION: "ContextualPrecisionMetric", + EvaluationMetricName.CONTEXT_RECALL: "ContextualRecallMetric", + EvaluationMetricName.CONTEXT_RELEVANCE: "ContextualRelevancyMetric", + EvaluationMetricName.TOOL_CORRECTNESS: "ToolCorrectnessMetric", + EvaluationMetricName.TASK_COMPLETION: "TaskCompletionMetric", +} class DeepEvalEvaluator(BaseEvaluationInstance): @@ -39,15 +48,16 @@ class DeepEvalEvaluator(BaseEvaluationInstance): def get_supported_metrics(self, category: EvaluationCategory) -> list[str]: match category: case EvaluationCategory.LLM: - return LLM_METRICS + candidates = LLM_METRIC_NAMES case EvaluationCategory.RETRIEVAL: - return RETRIEVAL_METRICS + candidates = RETRIEVAL_METRIC_NAMES case EvaluationCategory.AGENT: - return AGENT_METRICS - case EvaluationCategory.WORKFLOW: - return WORKFLOW_METRICS + candidates = AGENT_METRIC_NAMES + case EvaluationCategory.WORKFLOW | EvaluationCategory.SNIPPET: + candidates = WORKFLOW_METRIC_NAMES case _: return [] + return [m for m in candidates if m in _DEEPEVAL_METRIC_MAP] def evaluate_llm( self, @@ -121,8 +131,8 @@ class DeepEvalEvaluator(BaseEvaluationInstance): - Retrieval: input=query, actual_output=output, expected_output, retrieval_context=context - Agent: input=query, actual_output=output """ - deepeval_metrics = _build_deepeval_metrics(requested_metrics) - if not deepeval_metrics: + metric_pairs = _build_deepeval_metrics(requested_metrics) + if not metric_pairs: logger.warning("No valid DeepEval metrics found for: %s", requested_metrics) return [EvaluationItemResult(index=item.index) for item in items] @@ -130,15 +140,15 @@ class DeepEvalEvaluator(BaseEvaluationInstance): for item in items: test_case = self._build_test_case(item, category) metrics: list[EvaluationMetric] = [] - for metric in deepeval_metrics: + for canonical_name, metric in metric_pairs: try: metric.measure(test_case) if metric.score is not None: - metrics.append(EvaluationMetric(name=metric.__class__.__name__, value=float(metric.score))) + metrics.append(EvaluationMetric(name=canonical_name, value=float(metric.score))) except Exception: logger.exception( "Failed to compute metric %s for item %d", - metric.__class__.__name__, + canonical_name, item.index, ) results.append(EvaluationItemResult(index=item.index, metrics=metrics)) @@ -248,8 +258,12 @@ def _format_input(inputs: dict[str, Any], category: EvaluationCategory) -> str: return str(next(iter(inputs.values()), "")) if inputs else "" -def _build_deepeval_metrics(requested_metrics: list[str]) -> list[Any]: - """Build DeepEval metric instances from metric names.""" +def _build_deepeval_metrics(requested_metrics: list[str]) -> list[tuple[str, Any]]: + """Build DeepEval metric instances from canonical metric names. + + Returns a list of (canonical_name, metric_instance) pairs so that callers + can record the canonical name rather than the framework-internal class name. 
+ """ try: from deepeval.metrics import ( AnswerRelevancyMetric, @@ -261,24 +275,25 @@ def _build_deepeval_metrics(requested_metrics: list[str]) -> list[Any]: ToolCorrectnessMetric, ) - metric_map: dict[str, Any] = { - "faithfulness": FaithfulnessMetric, - "answer_relevancy": AnswerRelevancyMetric, - "contextual_precision": ContextualPrecisionMetric, - "contextual_recall": ContextualRecallMetric, - "contextual_relevancy": ContextualRelevancyMetric, - "tool_correctness": ToolCorrectnessMetric, - "task_completion": TaskCompletionMetric, + # Maps canonical name → deepeval metric class + deepeval_class_map: dict[str, Any] = { + EvaluationMetricName.FAITHFULNESS: FaithfulnessMetric, + EvaluationMetricName.ANSWER_RELEVANCY: AnswerRelevancyMetric, + EvaluationMetricName.CONTEXT_PRECISION: ContextualPrecisionMetric, + EvaluationMetricName.CONTEXT_RECALL: ContextualRecallMetric, + EvaluationMetricName.CONTEXT_RELEVANCE: ContextualRelevancyMetric, + EvaluationMetricName.TOOL_CORRECTNESS: ToolCorrectnessMetric, + EvaluationMetricName.TASK_COMPLETION: TaskCompletionMetric, } - metrics = [] + pairs: list[tuple[str, Any]] = [] for name in requested_metrics: - metric_class = metric_map.get(name) + metric_class = deepeval_class_map.get(name) if metric_class: - metrics.append(metric_class(threshold=0.5)) + pairs.append((name, metric_class(threshold=0.5))) else: - logger.warning("Unknown DeepEval metric: %s", name) - return metrics + logger.warning("Metric '%s' is not supported by DeepEval, skipping", name) + return pairs except ImportError: logger.warning("DeepEval metrics not available") return [] diff --git a/api/core/evaluation/frameworks/ragas/ragas_evaluator.py b/api/core/evaluation/frameworks/ragas/ragas_evaluator.py index 842dfb29eb..4a67bb2d12 100644 --- a/api/core/evaluation/frameworks/ragas/ragas_evaluator.py +++ b/api/core/evaluation/frameworks/ragas/ragas_evaluator.py @@ -4,20 +4,32 @@ from typing import Any from core.evaluation.base_evaluation_instance import BaseEvaluationInstance from core.evaluation.entities.config_entity import RagasConfig from core.evaluation.entities.evaluation_entity import ( + AGENT_METRIC_NAMES, + LLM_METRIC_NAMES, + RETRIEVAL_METRIC_NAMES, + WORKFLOW_METRIC_NAMES, EvaluationCategory, EvaluationItemInput, EvaluationItemResult, EvaluationMetric, + EvaluationMetricName, ) from core.evaluation.frameworks.ragas.ragas_model_wrapper import DifyModelWrapper logger = logging.getLogger(__name__) -# Metric name mappings per category -LLM_METRICS = ["faithfulness", "answer_relevancy", "answer_correctness", "semantic_similarity"] -RETRIEVAL_METRICS = ["context_precision", "context_recall", "context_relevance"] -AGENT_METRICS = ["tool_call_accuracy", "answer_correctness"] -WORKFLOW_METRICS = ["faithfulness", "answer_correctness"] +# Maps canonical EvaluationMetricName to the corresponding ragas metric class. +# Metrics not listed here are unsupported by ragas and will be skipped. 
+_RAGAS_METRIC_MAP: dict[EvaluationMetricName, str] = { + EvaluationMetricName.FAITHFULNESS: "Faithfulness", + EvaluationMetricName.ANSWER_RELEVANCY: "AnswerRelevancy", + EvaluationMetricName.ANSWER_CORRECTNESS: "AnswerCorrectness", + EvaluationMetricName.SEMANTIC_SIMILARITY: "SemanticSimilarity", + EvaluationMetricName.CONTEXT_PRECISION: "ContextPrecision", + EvaluationMetricName.CONTEXT_RECALL: "ContextRecall", + EvaluationMetricName.CONTEXT_RELEVANCE: "ContextRelevance", + EvaluationMetricName.TOOL_CORRECTNESS: "ToolCallAccuracy", +} class RagasEvaluator(BaseEvaluationInstance): @@ -29,15 +41,16 @@ class RagasEvaluator(BaseEvaluationInstance): def get_supported_metrics(self, category: EvaluationCategory) -> list[str]: match category: case EvaluationCategory.LLM: - return LLM_METRICS + candidates = LLM_METRIC_NAMES case EvaluationCategory.RETRIEVAL: - return RETRIEVAL_METRICS + candidates = RETRIEVAL_METRIC_NAMES case EvaluationCategory.AGENT: - return AGENT_METRICS - case EvaluationCategory.WORKFLOW: - return WORKFLOW_METRICS + candidates = AGENT_METRIC_NAMES + case EvaluationCategory.WORKFLOW | EvaluationCategory.SNIPPET: + candidates = WORKFLOW_METRIC_NAMES case _: return [] + return [m for m in candidates if m in _RAGAS_METRIC_MAP] def evaluate_llm( self, @@ -250,7 +263,7 @@ class RagasEvaluator(BaseEvaluationInstance): @staticmethod def _build_ragas_metrics(requested_metrics: list[str]) -> list[Any]: - """Build RAGAS metric instances from metric names.""" + """Build RAGAS metric instances from canonical metric names.""" try: from ragas.metrics.collections import ( AnswerCorrectness, @@ -263,24 +276,25 @@ class RagasEvaluator(BaseEvaluationInstance): ToolCallAccuracy, ) - metric_map: dict[str, Any] = { - "faithfulness": Faithfulness, - "answer_relevancy": AnswerRelevancy, - "answer_correctness": AnswerCorrectness, - "semantic_similarity": SemanticSimilarity, - "context_precision": ContextPrecision, - "context_recall": ContextRecall, - "context_relevance": ContextRelevance, - "tool_call_accuracy": ToolCallAccuracy, + # Maps canonical name → ragas metric class + ragas_class_map: dict[str, Any] = { + EvaluationMetricName.FAITHFULNESS: Faithfulness, + EvaluationMetricName.ANSWER_RELEVANCY: AnswerRelevancy, + EvaluationMetricName.ANSWER_CORRECTNESS: AnswerCorrectness, + EvaluationMetricName.SEMANTIC_SIMILARITY: SemanticSimilarity, + EvaluationMetricName.CONTEXT_PRECISION: ContextPrecision, + EvaluationMetricName.CONTEXT_RECALL: ContextRecall, + EvaluationMetricName.CONTEXT_RELEVANCE: ContextRelevance, + EvaluationMetricName.TOOL_CORRECTNESS: ToolCallAccuracy, } metrics = [] for name in requested_metrics: - metric_class = metric_map.get(name) + metric_class = ragas_class_map.get(name) if metric_class: metrics.append(metric_class()) else: - logger.warning("Unknown RAGAS metric: %s", name) + logger.warning("Metric '%s' is not supported by RAGAS, skipping", name) return metrics except ImportError: logger.warning("RAGAS metrics not available") diff --git a/api/core/evaluation/runners/agent_evaluation_runner.py b/api/core/evaluation/runners/agent_evaluation_runner.py index c050f061e7..fb10be2c88 100644 --- a/api/core/evaluation/runners/agent_evaluation_runner.py +++ b/api/core/evaluation/runners/agent_evaluation_runner.py @@ -78,44 +78,29 @@ class AgentEvaluationRunner(BaseEvaluationRunner): tenant_id: str, ) -> list[EvaluationItemResult]: """Compute agent evaluation metrics.""" - result_by_index = {r.index: r for r in results} - merged_items = [] - for item in items: - result = 
result_by_index.get(item.index) - context = [] - if result and result.actual_output: - context.append(result.actual_output) - merged_items.append( - EvaluationItemInput( - index=item.index, - inputs=item.inputs, - expected_output=item.expected_output, - context=context + (item.context or []), - ) - ) - - evaluated = self.evaluation_instance.evaluate_agent( - merged_items, default_metrics, model_provider, model_name, tenant_id + if not node_run_result_list: + return [] + if not default_metric: + raise ValueError("Default metric is required for agent evaluation") + merged_items = self._merge_results_into_items(node_run_result_list) + return self.evaluation_instance.evaluate_agent( + merged_items, default_metric.metric, model_provider, model_name, tenant_id ) - # Merge metrics back preserving metadata - eval_by_index = {r.index: r for r in evaluated} - final_results = [] - for result in results: - if result.index in eval_by_index: - eval_result = eval_by_index[result.index] - final_results.append( - EvaluationItemResult( - index=result.index, - actual_output=result.actual_output, - metrics=eval_result.metrics, - metadata=result.metadata, - error=result.error, - ) + @staticmethod + def _merge_results_into_items(items: list[NodeRunResult]) -> list[EvaluationItemInput]: + """Create EvaluationItemInput list from NodeRunResult for agent evaluation.""" + merged = [] + for i, item in enumerate(items): + output = _extract_agent_output(item.outputs) + merged.append( + EvaluationItemInput( + index=i, + inputs=dict(item.inputs), + output=output, ) - else: - final_results.append(result) - return final_results + ) + return merged @staticmethod def _extract_query(inputs: dict[str, Any]) -> str: @@ -157,3 +142,13 @@ class AgentEvaluationRunner(BaseEvaluationRunner): logger.exception("Error consuming agent stream") return "".join(answer_parts), tool_calls + + +def _extract_agent_output(outputs: Mapping[str, Any]) -> str: + """Extract the primary output text from agent NodeRunResult.outputs.""" + if "answer" in outputs: + return str(outputs["answer"]) + if "text" in outputs: + return str(outputs["text"]) + values = list(outputs.values()) + return str(values[0]) if values else "" diff --git a/api/core/evaluation/runners/retrieval_evaluation_runner.py b/api/core/evaluation/runners/retrieval_evaluation_runner.py index 49fa01b026..7b6c12bf3a 100644 --- a/api/core/evaluation/runners/retrieval_evaluation_runner.py +++ b/api/core/evaluation/runners/retrieval_evaluation_runner.py @@ -63,7 +63,8 @@ class RetrievalEvaluationRunner(BaseEvaluationRunner): @staticmethod def _extract_query(inputs: dict[str, Any]) -> str: - for key in "query": + for key in ("query", "question", "input", "text"): if key in inputs: return str(inputs[key]) - return "" + values = list(inputs.values()) + return str(values[0]) if values else "" diff --git a/api/core/evaluation/runners/snippet_evaluation_runner.py b/api/core/evaluation/runners/snippet_evaluation_runner.py index 09aea22fd7..bfab5e18d8 100644 --- a/api/core/evaluation/runners/snippet_evaluation_runner.py +++ b/api/core/evaluation/runners/snippet_evaluation_runner.py @@ -109,44 +109,29 @@ class SnippetEvaluationRunner(BaseEvaluationRunner): Snippets are essentially workflows, so we reuse evaluate_workflow from the evaluation instance. 
""" - result_by_index = {r.index: r for r in results} - merged_items = [] - for item in items: - result = result_by_index.get(item.index) - context = [] - if result and result.actual_output: - context.append(result.actual_output) - merged_items.append( - EvaluationItemInput( - index=item.index, - inputs=item.inputs, - expected_output=item.expected_output, - context=context + (item.context or []), - ) - ) - - evaluated = self.evaluation_instance.evaluate_workflow( - merged_items, default_metrics, model_provider, model_name, tenant_id + if not node_run_result_list: + return [] + if not default_metric: + raise ValueError("Default metric is required for snippet evaluation") + merged_items = self._merge_results_into_items(node_run_result_list) + return self.evaluation_instance.evaluate_workflow( + merged_items, default_metric.metric, model_provider, model_name, tenant_id ) - # Merge metrics back preserving metadata from Phase 1 - eval_by_index = {r.index: r for r in evaluated} - final_results = [] - for result in results: - if result.index in eval_by_index: - eval_result = eval_by_index[result.index] - final_results.append( - EvaluationItemResult( - index=result.index, - actual_output=result.actual_output, - metrics=eval_result.metrics, - metadata=result.metadata, - error=result.error, - ) + @staticmethod + def _merge_results_into_items(items: list[NodeRunResult]) -> list[EvaluationItemInput]: + """Create EvaluationItemInput list from NodeRunResult for snippet evaluation.""" + merged = [] + for i, item in enumerate(items): + output = _extract_snippet_output(item.outputs) + merged.append( + EvaluationItemInput( + index=i, + inputs=dict(item.inputs), + output=output, ) - else: - final_results.append(result) - return final_results + ) + return merged @staticmethod def _extract_output(response: Mapping[str, Any]) -> str: @@ -235,3 +220,13 @@ class SnippetEvaluationRunner(BaseEvaluationRunner): "error": node.error, "elapsed_time": node.elapsed_time, } + + +def _extract_snippet_output(outputs: Mapping[str, Any]) -> str: + """Extract the primary output text from snippet NodeRunResult.outputs.""" + if "answer" in outputs: + return str(outputs["answer"]) + if "text" in outputs: + return str(outputs["text"]) + values = list(outputs.values()) + return str(values[0]) if values else "" diff --git a/api/core/evaluation/runners/workflow_evaluation_runner.py b/api/core/evaluation/runners/workflow_evaluation_runner.py index 76e778ebdf..f38ed35df3 100644 --- a/api/core/evaluation/runners/workflow_evaluation_runner.py +++ b/api/core/evaluation/runners/workflow_evaluation_runner.py @@ -34,44 +34,29 @@ class WorkflowEvaluationRunner(BaseEvaluationRunner): tenant_id: str, ) -> list[EvaluationItemResult]: """Compute workflow evaluation metrics (end-to-end).""" - result_by_index = {r.index: r for r in results} - merged_items = [] - for item in items: - result = result_by_index.get(item.index) - context = [] - if result and result.actual_output: - context.append(result.actual_output) - merged_items.append( - EvaluationItemInput( - index=item.index, - inputs=item.inputs, - expected_output=item.expected_output, - context=context + (item.context or []), - ) - ) - - evaluated = self.evaluation_instance.evaluate_workflow( - merged_items, default_metrics, model_provider, model_name, tenant_id + if not node_run_result_list: + return [] + if not default_metric: + raise ValueError("Default metric is required for workflow evaluation") + merged_items = self._merge_results_into_items(node_run_result_list) + return 
self.evaluation_instance.evaluate_workflow( + merged_items, default_metric.metric, model_provider, model_name, tenant_id ) - # Merge metrics back preserving metadata - eval_by_index = {r.index: r for r in evaluated} - final_results = [] - for result in results: - if result.index in eval_by_index: - eval_result = eval_by_index[result.index] - final_results.append( - EvaluationItemResult( - index=result.index, - actual_output=result.actual_output, - metrics=eval_result.metrics, - metadata=result.metadata, - error=result.error, - ) + @staticmethod + def _merge_results_into_items(items: list[NodeRunResult]) -> list[EvaluationItemInput]: + """Create EvaluationItemInput list from NodeRunResult for workflow evaluation.""" + merged = [] + for i, item in enumerate(items): + output = _extract_workflow_output(item.outputs) + merged.append( + EvaluationItemInput( + index=i, + inputs=dict(item.inputs), + output=output, ) - else: - final_results.append(result) - return final_results + ) + return merged @staticmethod def _extract_output(response: Mapping[str, Any]) -> str: @@ -91,3 +76,13 @@ class WorkflowEvaluationRunner(BaseEvaluationRunner): if isinstance(data, Mapping): return data.get("node_executions", []) return [] + + +def _extract_workflow_output(outputs: Mapping[str, Any]) -> str: + """Extract the primary output text from workflow NodeRunResult.outputs.""" + if "answer" in outputs: + return str(outputs["answer"]) + if "text" in outputs: + return str(outputs["text"]) + values = list(outputs.values()) + return str(values[0]) if values else "" diff --git a/api/models/evaluation.py b/api/models/evaluation.py index 9737242b76..3fdd9a9459 100644 --- a/api/models/evaluation.py +++ b/api/models/evaluation.py @@ -105,6 +105,7 @@ class EvaluationRun(Base): error: Mapped[str | None] = mapped_column(Text, nullable=True) celery_task_id: Mapped[str | None] = mapped_column(String(255), nullable=True) + metrics_summary: Mapped[str | None] = mapped_column(LongText, nullable=True) created_by: Mapped[str] = mapped_column(StringUUID, nullable=False) started_at: Mapped[datetime | None] = mapped_column(DateTime, nullable=True) diff --git a/api/tasks/evaluation_task.py b/api/tasks/evaluation_task.py index 3f4af7487d..664d46918e 100644 --- a/api/tasks/evaluation_task.py +++ b/api/tasks/evaluation_task.py @@ -12,6 +12,7 @@ from configs import dify_config from core.evaluation.base_evaluation_instance import BaseEvaluationInstance from core.evaluation.entities.evaluation_entity import ( EvaluationCategory, + EvaluationDatasetInput, EvaluationItemResult, EvaluationRunData, ) @@ -88,23 +89,23 @@ def _execute_evaluation(session: Any, run_data: EvaluationRunData) -> None: ) results: list[EvaluationItemResult] = _execute_evaluation_runner( - session, - run_data, - evaluation_instance, - node_run_result_mapping_list, + session=session, + run_data=run_data, + evaluation_instance=evaluation_instance, + node_run_result_mapping_list=node_run_result_mapping_list, ) # Compute summary metrics metrics_summary = _compute_metrics_summary(results, run_data.judgment_config) # Generate result XLSX - result_xlsx = _generate_result_xlsx(run_data.items, results) + result_xlsx = _generate_result_xlsx(run_data.input_list, results) # Store result file result_file_id = _store_result_file(run_data.tenant_id, run_data.evaluation_run_id, result_xlsx, session) # Update run to completed - evaluation_run = session.query(EvaluationRun).filter_by(id=run_data.evaluation_run_id).first() + evaluation_run: EvaluationRun = 
session.query(EvaluationRun).filter_by(id=run_data.evaluation_run_id).first() if evaluation_run: evaluation_run.status = EvaluationRunStatus.COMPLETED evaluation_run.completed_at = naive_utc_now() @@ -232,10 +233,10 @@ def _compute_metrics_summary( def _generate_result_xlsx( - items: list[Any], + input_list: list[EvaluationDatasetInput], results: list[EvaluationItemResult], ) -> bytes: - """Generate result XLSX with input data, actual output, and metric scores.""" + """Generate result XLSX with input data, actual output, metric scores, and judgment.""" wb = Workbook() ws = wb.active if ws is None: @@ -261,14 +262,18 @@ def _generate_result_xlsx( # Collect all input keys input_keys: list[str] = [] - for item in items: + for item in input_list: for key in item.inputs: if key not in input_keys: input_keys.append(key) + # Include judgment column only when at least one result has judgment conditions evaluated + has_judgment = any(bool(r.judgment.condition_results) for r in results) + # Build headers + judgment_headers = ["judgment"] if has_judgment else [] headers = ( - ["index"] + input_keys + ["expected_output", "actual_output"] + all_metric_names + ["overall_score", "error"] + ["index"] + input_keys + ["expected_output", "actual_output"] + all_metric_names + judgment_headers + ["error"] ) # Write header row @@ -288,7 +293,7 @@ def _generate_result_xlsx( result_by_index = {r.index: r for r in results} # Write data rows - for row_idx, item in enumerate(items, start=2): + for row_idx, item in enumerate(input_list, start=2): result = result_by_index.get(item.index) col = 1 @@ -317,9 +322,14 @@ def _generate_result_xlsx( ws.cell(row=row_idx, column=col, value=score if score is not None else "").border = thin_border col += 1 - # Overall score - ws.cell(row=row_idx, column=col, value=result.overall_score if result else "").border = thin_border - col += 1 + # Judgment result + if has_judgment: + if result and result.judgment.condition_results: + judgment_value = "Pass" if result.judgment.passed else "Fail" + else: + judgment_value = "" + ws.cell(row=row_idx, column=col, value=judgment_value).border = thin_border + col += 1 # Error ws.cell(row=row_idx, column=col, value=result.error if result else "").border = thin_border
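
Notes and illustrative sketches (editorial, not part of the patch):

The patch centralizes metric naming: canonical names live in the shared EvaluationMetricName StrEnum, each framework keeps its own map from canonical names to internal metric classes, and get_supported_metrics() returns only the canonical names the framework can actually map. The snippet below is a self-contained sketch of that filtering pattern, not the Dify module itself; MetricName, CATEGORY_METRIC_NAMES, and FRAMEWORK_METRIC_MAP are stand-in names, and enum.StrEnum needs Python 3.11+.

    from enum import StrEnum  # Python 3.11+


    class MetricName(StrEnum):
        FAITHFULNESS = "faithfulness"
        ANSWER_RELEVANCY = "answer_relevancy"
        CONTEXT_PRECISION = "context_precision"
        TOOL_CORRECTNESS = "tool_correctness"


    # Canonical metrics requested for a category (stand-in for e.g. RETRIEVAL_METRIC_NAMES).
    CATEGORY_METRIC_NAMES = [MetricName.CONTEXT_PRECISION, MetricName.FAITHFULNESS]

    # What this particular framework implements (stand-in for _DEEPEVAL_METRIC_MAP / _RAGAS_METRIC_MAP).
    FRAMEWORK_METRIC_MAP = {
        MetricName.FAITHFULNESS: "FaithfulnessMetric",
        MetricName.CONTEXT_PRECISION: "ContextualPrecisionMetric",
    }


    def get_supported_metrics(candidates: list[MetricName]) -> list[str]:
        # Keep only the canonical names this framework can map; everything else is skipped.
        return [m for m in candidates if m in FRAMEWORK_METRIC_MAP]


    print([str(m) for m in get_supported_metrics(CATEGORY_METRIC_NAMES)])
    # -> ['context_precision', 'faithfulness']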
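
_build_deepeval_metrics now returns (canonical_name, metric_instance) pairs so callers report scores under the shared canonical name instead of the framework-internal class name, and unsupported names are skipped with a warning rather than raising. A minimal sketch of that pairing, assuming a metric object with a measure()/score interface; FakeFaithfulnessMetric and CLASS_MAP are made-up stand-ins, not deepeval classes.

    import logging
    from typing import Any

    logger = logging.getLogger(__name__)


    class FakeFaithfulnessMetric:
        """Stand-in for a framework metric class exposing measure() and score."""

        def __init__(self, threshold: float = 0.5) -> None:
            self.threshold = threshold
            self.score: float | None = None

        def measure(self, test_case: dict[str, Any]) -> None:
            self.score = 0.9  # a real metric would call an LLM or embedding model here


    CLASS_MAP: dict[str, Any] = {"faithfulness": FakeFaithfulnessMetric}


    def build_metrics(requested: list[str]) -> list[tuple[str, Any]]:
        pairs: list[tuple[str, Any]] = []
        for name in requested:
            metric_class = CLASS_MAP.get(name)
            if metric_class:
                # Keep the canonical name next to the instance so downstream results
                # are recorded under the shared name, not the class name.
                pairs.append((name, metric_class(threshold=0.5)))
            else:
                logger.warning("Metric '%s' is not supported by this framework, skipping", name)
        return pairs


    for canonical_name, metric in build_metrics(["faithfulness", "semantic_similarity"]):
        metric.measure({"input": "q", "actual_output": "a"})
        print(canonical_name, metric.score)  # -> faithfulness 0.9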
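
The new _extract_agent_output, _extract_snippet_output, and _extract_workflow_output helpers all use the same fallback order: prefer an "answer" key, then "text", then the first value, otherwise an empty string. A runnable sketch of that behaviour with example data:

    from collections.abc import Mapping
    from typing import Any


    def extract_output(outputs: Mapping[str, Any]) -> str:
        """Prefer 'answer', then 'text', then the first value; empty string otherwise."""
        if "answer" in outputs:
            return str(outputs["answer"])
        if "text" in outputs:
            return str(outputs["text"])
        values = list(outputs.values())
        return str(values[0]) if values else ""


    print(extract_output({"answer": "42", "text": "ignored"}))  # -> 42
    print(extract_output({"text": "hello"}))                    # -> hello
    print(extract_output({"result": {"k": 1}}))                 # -> {'k': 1}
    print(extract_output({}))                                    # -> (empty string)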
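
In _generate_result_xlsx, the "overall_score" column is replaced by a "judgment" column that is emitted only when at least one result carries evaluated judgment conditions. The sketch below shows that conditional header construction; the Judgment and ItemResult dataclasses are simplified stand-ins for the real result entities, and their exact fields beyond passed and condition_results are assumptions.

    from dataclasses import dataclass, field


    @dataclass
    class Judgment:
        passed: bool = False
        condition_results: list[str] = field(default_factory=list)


    @dataclass
    class ItemResult:
        index: int
        judgment: Judgment = field(default_factory=Judgment)


    def build_headers(input_keys: list[str], metric_names: list[str], results: list[ItemResult]) -> list[str]:
        # The judgment column appears only when some result actually has evaluated conditions.
        has_judgment = any(bool(r.judgment.condition_results) for r in results)
        judgment_headers = ["judgment"] if has_judgment else []
        return ["index"] + input_keys + ["expected_output", "actual_output"] + metric_names + judgment_headers + ["error"]


    results = [ItemResult(0, Judgment(passed=True, condition_results=["score >= 0.8"])), ItemResult(1)]
    print(build_headers(["query"], ["faithfulness"], results))
    # -> ['index', 'query', 'expected_output', 'actual_output', 'faithfulness', 'judgment', 'error']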