From f692def738a349d36e913ee7cd5272e56f5b8830 Mon Sep 17 00:00:00 2001 From: jyong <718720800@qq.com> Date: Tue, 17 Mar 2026 15:26:39 +0800 Subject: [PATCH] evaluation runtime --- .../evaluation/entities/evaluation_entity.py | 50 ++++++++++ .../frameworks/deepeval/deepeval_evaluator.py | 93 +++++++++++-------- .../frameworks/ragas/ragas_evaluator.py | 58 +++++++----- .../runners/agent_evaluation_runner.py | 65 ++++++------- .../runners/retrieval_evaluation_runner.py | 5 +- .../runners/snippet_evaluation_runner.py | 65 ++++++------- .../runners/workflow_evaluation_runner.py | 65 ++++++------- api/models/evaluation.py | 1 + api/tasks/evaluation_task.py | 38 +++++--- 9 files changed, 258 insertions(+), 182 deletions(-) diff --git a/api/core/evaluation/entities/evaluation_entity.py b/api/core/evaluation/entities/evaluation_entity.py index 8d065888a3..13cfa70ae4 100644 --- a/api/core/evaluation/entities/evaluation_entity.py +++ b/api/core/evaluation/entities/evaluation_entity.py @@ -15,6 +15,56 @@ class EvaluationCategory(StrEnum): RETRIEVAL_TEST = "retrieval_test" +class EvaluationMetricName(StrEnum): + """Canonical metric names shared across all evaluation frameworks. + + Each framework maps these names to its own internal implementation. + A framework that does not support a given metric should log a warning + and skip it rather than raising an error. + """ + + # LLM / general text-quality metrics + FAITHFULNESS = "faithfulness" + ANSWER_RELEVANCY = "answer_relevancy" + ANSWER_CORRECTNESS = "answer_correctness" + SEMANTIC_SIMILARITY = "semantic_similarity" + + # Retrieval-quality metrics + CONTEXT_PRECISION = "context_precision" + CONTEXT_RECALL = "context_recall" + CONTEXT_RELEVANCE = "context_relevance" + + # Agent-quality metrics + TOOL_CORRECTNESS = "tool_correctness" + TASK_COMPLETION = "task_completion" + + +# Per-category canonical metric lists used by get_supported_metrics(). 
+LLM_METRIC_NAMES: list[EvaluationMetricName] = [ + EvaluationMetricName.FAITHFULNESS, + EvaluationMetricName.ANSWER_RELEVANCY, + EvaluationMetricName.ANSWER_CORRECTNESS, + EvaluationMetricName.SEMANTIC_SIMILARITY, +] + +RETRIEVAL_METRIC_NAMES: list[EvaluationMetricName] = [ + EvaluationMetricName.CONTEXT_PRECISION, + EvaluationMetricName.CONTEXT_RECALL, + EvaluationMetricName.CONTEXT_RELEVANCE, +] + +AGENT_METRIC_NAMES: list[EvaluationMetricName] = [ + EvaluationMetricName.TOOL_CORRECTNESS, + EvaluationMetricName.TASK_COMPLETION, +] + +WORKFLOW_METRIC_NAMES: list[EvaluationMetricName] = [ + EvaluationMetricName.FAITHFULNESS, + EvaluationMetricName.ANSWER_RELEVANCY, + EvaluationMetricName.ANSWER_CORRECTNESS, +] + + class EvaluationMetric(BaseModel): name: str value: Any diff --git a/api/core/evaluation/frameworks/deepeval/deepeval_evaluator.py b/api/core/evaluation/frameworks/deepeval/deepeval_evaluator.py index 3ea6739bdc..3893f81061 100644 --- a/api/core/evaluation/frameworks/deepeval/deepeval_evaluator.py +++ b/api/core/evaluation/frameworks/deepeval/deepeval_evaluator.py @@ -4,30 +4,39 @@ from typing import Any from core.evaluation.base_evaluation_instance import BaseEvaluationInstance from core.evaluation.entities.config_entity import DeepEvalConfig from core.evaluation.entities.evaluation_entity import ( + AGENT_METRIC_NAMES, + LLM_METRIC_NAMES, + RETRIEVAL_METRIC_NAMES, + WORKFLOW_METRIC_NAMES, EvaluationCategory, EvaluationItemInput, EvaluationItemResult, EvaluationMetric, + EvaluationMetricName, ) from core.evaluation.frameworks.ragas.ragas_model_wrapper import DifyModelWrapper logger = logging.getLogger(__name__) -# Metric name mappings per category -# +# Maps canonical EvaluationMetricName to the corresponding deepeval metric class name. # deepeval metric field requirements (LLMTestCase fields): -# - faithfulness: input, actual_output, retrieval_context -# - answer_relevancy: input, actual_output -# - contextual_precision: input, actual_output, expected_output, retrieval_context -# - contextual_recall: input, actual_output, expected_output, retrieval_context -# - contextual_relevancy: input, actual_output, retrieval_context -# - hallucination: input, actual_output, context -# - tool_correctness: input, actual_output, expected_tools -# - task_completion: input, actual_output -LLM_METRICS = ["faithfulness", "answer_relevancy"] -RETRIEVAL_METRICS = ["contextual_precision", "contextual_recall", "contextual_relevancy"] -AGENT_METRICS = ["tool_correctness", "task_completion"] -WORKFLOW_METRICS = ["faithfulness", "answer_relevancy"] +# - faithfulness: input, actual_output, retrieval_context +# - answer_relevancy: input, actual_output +# - context_precision: input, actual_output, expected_output, retrieval_context +# - context_recall: input, actual_output, expected_output, retrieval_context +# - context_relevance: input, actual_output, retrieval_context +# - tool_correctness: input, actual_output, expected_tools +# - task_completion: input, actual_output +# Metrics not listed here are unsupported by deepeval and will be skipped. 
+_DEEPEVAL_METRIC_MAP: dict[EvaluationMetricName, str] = { + EvaluationMetricName.FAITHFULNESS: "FaithfulnessMetric", + EvaluationMetricName.ANSWER_RELEVANCY: "AnswerRelevancyMetric", + EvaluationMetricName.CONTEXT_PRECISION: "ContextualPrecisionMetric", + EvaluationMetricName.CONTEXT_RECALL: "ContextualRecallMetric", + EvaluationMetricName.CONTEXT_RELEVANCE: "ContextualRelevancyMetric", + EvaluationMetricName.TOOL_CORRECTNESS: "ToolCorrectnessMetric", + EvaluationMetricName.TASK_COMPLETION: "TaskCompletionMetric", +} class DeepEvalEvaluator(BaseEvaluationInstance): @@ -39,15 +48,16 @@ class DeepEvalEvaluator(BaseEvaluationInstance): def get_supported_metrics(self, category: EvaluationCategory) -> list[str]: match category: case EvaluationCategory.LLM: - return LLM_METRICS + candidates = LLM_METRIC_NAMES case EvaluationCategory.RETRIEVAL: - return RETRIEVAL_METRICS + candidates = RETRIEVAL_METRIC_NAMES case EvaluationCategory.AGENT: - return AGENT_METRICS - case EvaluationCategory.WORKFLOW: - return WORKFLOW_METRICS + candidates = AGENT_METRIC_NAMES + case EvaluationCategory.WORKFLOW | EvaluationCategory.SNIPPET: + candidates = WORKFLOW_METRIC_NAMES case _: return [] + return [m for m in candidates if m in _DEEPEVAL_METRIC_MAP] def evaluate_llm( self, @@ -121,8 +131,8 @@ class DeepEvalEvaluator(BaseEvaluationInstance): - Retrieval: input=query, actual_output=output, expected_output, retrieval_context=context - Agent: input=query, actual_output=output """ - deepeval_metrics = _build_deepeval_metrics(requested_metrics) - if not deepeval_metrics: + metric_pairs = _build_deepeval_metrics(requested_metrics) + if not metric_pairs: logger.warning("No valid DeepEval metrics found for: %s", requested_metrics) return [EvaluationItemResult(index=item.index) for item in items] @@ -130,15 +140,15 @@ class DeepEvalEvaluator(BaseEvaluationInstance): for item in items: test_case = self._build_test_case(item, category) metrics: list[EvaluationMetric] = [] - for metric in deepeval_metrics: + for canonical_name, metric in metric_pairs: try: metric.measure(test_case) if metric.score is not None: - metrics.append(EvaluationMetric(name=metric.__class__.__name__, value=float(metric.score))) + metrics.append(EvaluationMetric(name=canonical_name, value=float(metric.score))) except Exception: logger.exception( "Failed to compute metric %s for item %d", - metric.__class__.__name__, + canonical_name, item.index, ) results.append(EvaluationItemResult(index=item.index, metrics=metrics)) @@ -248,8 +258,12 @@ def _format_input(inputs: dict[str, Any], category: EvaluationCategory) -> str: return str(next(iter(inputs.values()), "")) if inputs else "" -def _build_deepeval_metrics(requested_metrics: list[str]) -> list[Any]: - """Build DeepEval metric instances from metric names.""" +def _build_deepeval_metrics(requested_metrics: list[str]) -> list[tuple[str, Any]]: + """Build DeepEval metric instances from canonical metric names. + + Returns a list of (canonical_name, metric_instance) pairs so that callers + can record the canonical name rather than the framework-internal class name. 
+ """ try: from deepeval.metrics import ( AnswerRelevancyMetric, @@ -261,24 +275,25 @@ def _build_deepeval_metrics(requested_metrics: list[str]) -> list[Any]: ToolCorrectnessMetric, ) - metric_map: dict[str, Any] = { - "faithfulness": FaithfulnessMetric, - "answer_relevancy": AnswerRelevancyMetric, - "contextual_precision": ContextualPrecisionMetric, - "contextual_recall": ContextualRecallMetric, - "contextual_relevancy": ContextualRelevancyMetric, - "tool_correctness": ToolCorrectnessMetric, - "task_completion": TaskCompletionMetric, + # Maps canonical name → deepeval metric class + deepeval_class_map: dict[str, Any] = { + EvaluationMetricName.FAITHFULNESS: FaithfulnessMetric, + EvaluationMetricName.ANSWER_RELEVANCY: AnswerRelevancyMetric, + EvaluationMetricName.CONTEXT_PRECISION: ContextualPrecisionMetric, + EvaluationMetricName.CONTEXT_RECALL: ContextualRecallMetric, + EvaluationMetricName.CONTEXT_RELEVANCE: ContextualRelevancyMetric, + EvaluationMetricName.TOOL_CORRECTNESS: ToolCorrectnessMetric, + EvaluationMetricName.TASK_COMPLETION: TaskCompletionMetric, } - metrics = [] + pairs: list[tuple[str, Any]] = [] for name in requested_metrics: - metric_class = metric_map.get(name) + metric_class = deepeval_class_map.get(name) if metric_class: - metrics.append(metric_class(threshold=0.5)) + pairs.append((name, metric_class(threshold=0.5))) else: - logger.warning("Unknown DeepEval metric: %s", name) - return metrics + logger.warning("Metric '%s' is not supported by DeepEval, skipping", name) + return pairs except ImportError: logger.warning("DeepEval metrics not available") return [] diff --git a/api/core/evaluation/frameworks/ragas/ragas_evaluator.py b/api/core/evaluation/frameworks/ragas/ragas_evaluator.py index 842dfb29eb..4a67bb2d12 100644 --- a/api/core/evaluation/frameworks/ragas/ragas_evaluator.py +++ b/api/core/evaluation/frameworks/ragas/ragas_evaluator.py @@ -4,20 +4,32 @@ from typing import Any from core.evaluation.base_evaluation_instance import BaseEvaluationInstance from core.evaluation.entities.config_entity import RagasConfig from core.evaluation.entities.evaluation_entity import ( + AGENT_METRIC_NAMES, + LLM_METRIC_NAMES, + RETRIEVAL_METRIC_NAMES, + WORKFLOW_METRIC_NAMES, EvaluationCategory, EvaluationItemInput, EvaluationItemResult, EvaluationMetric, + EvaluationMetricName, ) from core.evaluation.frameworks.ragas.ragas_model_wrapper import DifyModelWrapper logger = logging.getLogger(__name__) -# Metric name mappings per category -LLM_METRICS = ["faithfulness", "answer_relevancy", "answer_correctness", "semantic_similarity"] -RETRIEVAL_METRICS = ["context_precision", "context_recall", "context_relevance"] -AGENT_METRICS = ["tool_call_accuracy", "answer_correctness"] -WORKFLOW_METRICS = ["faithfulness", "answer_correctness"] +# Maps canonical EvaluationMetricName to the corresponding ragas metric class. +# Metrics not listed here are unsupported by ragas and will be skipped. 
+_RAGAS_METRIC_MAP: dict[EvaluationMetricName, str] = { + EvaluationMetricName.FAITHFULNESS: "Faithfulness", + EvaluationMetricName.ANSWER_RELEVANCY: "AnswerRelevancy", + EvaluationMetricName.ANSWER_CORRECTNESS: "AnswerCorrectness", + EvaluationMetricName.SEMANTIC_SIMILARITY: "SemanticSimilarity", + EvaluationMetricName.CONTEXT_PRECISION: "ContextPrecision", + EvaluationMetricName.CONTEXT_RECALL: "ContextRecall", + EvaluationMetricName.CONTEXT_RELEVANCE: "ContextRelevance", + EvaluationMetricName.TOOL_CORRECTNESS: "ToolCallAccuracy", +} class RagasEvaluator(BaseEvaluationInstance): @@ -29,15 +41,16 @@ class RagasEvaluator(BaseEvaluationInstance): def get_supported_metrics(self, category: EvaluationCategory) -> list[str]: match category: case EvaluationCategory.LLM: - return LLM_METRICS + candidates = LLM_METRIC_NAMES case EvaluationCategory.RETRIEVAL: - return RETRIEVAL_METRICS + candidates = RETRIEVAL_METRIC_NAMES case EvaluationCategory.AGENT: - return AGENT_METRICS - case EvaluationCategory.WORKFLOW: - return WORKFLOW_METRICS + candidates = AGENT_METRIC_NAMES + case EvaluationCategory.WORKFLOW | EvaluationCategory.SNIPPET: + candidates = WORKFLOW_METRIC_NAMES case _: return [] + return [m for m in candidates if m in _RAGAS_METRIC_MAP] def evaluate_llm( self, @@ -250,7 +263,7 @@ class RagasEvaluator(BaseEvaluationInstance): @staticmethod def _build_ragas_metrics(requested_metrics: list[str]) -> list[Any]: - """Build RAGAS metric instances from metric names.""" + """Build RAGAS metric instances from canonical metric names.""" try: from ragas.metrics.collections import ( AnswerCorrectness, @@ -263,24 +276,25 @@ class RagasEvaluator(BaseEvaluationInstance): ToolCallAccuracy, ) - metric_map: dict[str, Any] = { - "faithfulness": Faithfulness, - "answer_relevancy": AnswerRelevancy, - "answer_correctness": AnswerCorrectness, - "semantic_similarity": SemanticSimilarity, - "context_precision": ContextPrecision, - "context_recall": ContextRecall, - "context_relevance": ContextRelevance, - "tool_call_accuracy": ToolCallAccuracy, + # Maps canonical name → ragas metric class + ragas_class_map: dict[str, Any] = { + EvaluationMetricName.FAITHFULNESS: Faithfulness, + EvaluationMetricName.ANSWER_RELEVANCY: AnswerRelevancy, + EvaluationMetricName.ANSWER_CORRECTNESS: AnswerCorrectness, + EvaluationMetricName.SEMANTIC_SIMILARITY: SemanticSimilarity, + EvaluationMetricName.CONTEXT_PRECISION: ContextPrecision, + EvaluationMetricName.CONTEXT_RECALL: ContextRecall, + EvaluationMetricName.CONTEXT_RELEVANCE: ContextRelevance, + EvaluationMetricName.TOOL_CORRECTNESS: ToolCallAccuracy, } metrics = [] for name in requested_metrics: - metric_class = metric_map.get(name) + metric_class = ragas_class_map.get(name) if metric_class: metrics.append(metric_class()) else: - logger.warning("Unknown RAGAS metric: %s", name) + logger.warning("Metric '%s' is not supported by RAGAS, skipping", name) return metrics except ImportError: logger.warning("RAGAS metrics not available") diff --git a/api/core/evaluation/runners/agent_evaluation_runner.py b/api/core/evaluation/runners/agent_evaluation_runner.py index c050f061e7..fb10be2c88 100644 --- a/api/core/evaluation/runners/agent_evaluation_runner.py +++ b/api/core/evaluation/runners/agent_evaluation_runner.py @@ -78,44 +78,29 @@ class AgentEvaluationRunner(BaseEvaluationRunner): tenant_id: str, ) -> list[EvaluationItemResult]: """Compute agent evaluation metrics.""" - result_by_index = {r.index: r for r in results} - merged_items = [] - for item in items: - result = 
result_by_index.get(item.index) - context = [] - if result and result.actual_output: - context.append(result.actual_output) - merged_items.append( - EvaluationItemInput( - index=item.index, - inputs=item.inputs, - expected_output=item.expected_output, - context=context + (item.context or []), - ) - ) - - evaluated = self.evaluation_instance.evaluate_agent( - merged_items, default_metrics, model_provider, model_name, tenant_id + if not node_run_result_list: + return [] + if not default_metric: + raise ValueError("Default metric is required for agent evaluation") + merged_items = self._merge_results_into_items(node_run_result_list) + return self.evaluation_instance.evaluate_agent( + merged_items, default_metric.metric, model_provider, model_name, tenant_id ) - # Merge metrics back preserving metadata - eval_by_index = {r.index: r for r in evaluated} - final_results = [] - for result in results: - if result.index in eval_by_index: - eval_result = eval_by_index[result.index] - final_results.append( - EvaluationItemResult( - index=result.index, - actual_output=result.actual_output, - metrics=eval_result.metrics, - metadata=result.metadata, - error=result.error, - ) + @staticmethod + def _merge_results_into_items(items: list[NodeRunResult]) -> list[EvaluationItemInput]: + """Create EvaluationItemInput list from NodeRunResult for agent evaluation.""" + merged = [] + for i, item in enumerate(items): + output = _extract_agent_output(item.outputs) + merged.append( + EvaluationItemInput( + index=i, + inputs=dict(item.inputs), + output=output, ) - else: - final_results.append(result) - return final_results + ) + return merged @staticmethod def _extract_query(inputs: dict[str, Any]) -> str: @@ -157,3 +142,13 @@ class AgentEvaluationRunner(BaseEvaluationRunner): logger.exception("Error consuming agent stream") return "".join(answer_parts), tool_calls + + +def _extract_agent_output(outputs: Mapping[str, Any]) -> str: + """Extract the primary output text from agent NodeRunResult.outputs.""" + if "answer" in outputs: + return str(outputs["answer"]) + if "text" in outputs: + return str(outputs["text"]) + values = list(outputs.values()) + return str(values[0]) if values else "" diff --git a/api/core/evaluation/runners/retrieval_evaluation_runner.py b/api/core/evaluation/runners/retrieval_evaluation_runner.py index 49fa01b026..7b6c12bf3a 100644 --- a/api/core/evaluation/runners/retrieval_evaluation_runner.py +++ b/api/core/evaluation/runners/retrieval_evaluation_runner.py @@ -63,7 +63,8 @@ class RetrievalEvaluationRunner(BaseEvaluationRunner): @staticmethod def _extract_query(inputs: dict[str, Any]) -> str: - for key in "query": + for key in ("query", "question", "input", "text"): if key in inputs: return str(inputs[key]) - return "" + values = list(inputs.values()) + return str(values[0]) if values else "" diff --git a/api/core/evaluation/runners/snippet_evaluation_runner.py b/api/core/evaluation/runners/snippet_evaluation_runner.py index 09aea22fd7..bfab5e18d8 100644 --- a/api/core/evaluation/runners/snippet_evaluation_runner.py +++ b/api/core/evaluation/runners/snippet_evaluation_runner.py @@ -109,44 +109,29 @@ class SnippetEvaluationRunner(BaseEvaluationRunner): Snippets are essentially workflows, so we reuse evaluate_workflow from the evaluation instance. 
""" - result_by_index = {r.index: r for r in results} - merged_items = [] - for item in items: - result = result_by_index.get(item.index) - context = [] - if result and result.actual_output: - context.append(result.actual_output) - merged_items.append( - EvaluationItemInput( - index=item.index, - inputs=item.inputs, - expected_output=item.expected_output, - context=context + (item.context or []), - ) - ) - - evaluated = self.evaluation_instance.evaluate_workflow( - merged_items, default_metrics, model_provider, model_name, tenant_id + if not node_run_result_list: + return [] + if not default_metric: + raise ValueError("Default metric is required for snippet evaluation") + merged_items = self._merge_results_into_items(node_run_result_list) + return self.evaluation_instance.evaluate_workflow( + merged_items, default_metric.metric, model_provider, model_name, tenant_id ) - # Merge metrics back preserving metadata from Phase 1 - eval_by_index = {r.index: r for r in evaluated} - final_results = [] - for result in results: - if result.index in eval_by_index: - eval_result = eval_by_index[result.index] - final_results.append( - EvaluationItemResult( - index=result.index, - actual_output=result.actual_output, - metrics=eval_result.metrics, - metadata=result.metadata, - error=result.error, - ) + @staticmethod + def _merge_results_into_items(items: list[NodeRunResult]) -> list[EvaluationItemInput]: + """Create EvaluationItemInput list from NodeRunResult for snippet evaluation.""" + merged = [] + for i, item in enumerate(items): + output = _extract_snippet_output(item.outputs) + merged.append( + EvaluationItemInput( + index=i, + inputs=dict(item.inputs), + output=output, ) - else: - final_results.append(result) - return final_results + ) + return merged @staticmethod def _extract_output(response: Mapping[str, Any]) -> str: @@ -235,3 +220,13 @@ class SnippetEvaluationRunner(BaseEvaluationRunner): "error": node.error, "elapsed_time": node.elapsed_time, } + + +def _extract_snippet_output(outputs: Mapping[str, Any]) -> str: + """Extract the primary output text from snippet NodeRunResult.outputs.""" + if "answer" in outputs: + return str(outputs["answer"]) + if "text" in outputs: + return str(outputs["text"]) + values = list(outputs.values()) + return str(values[0]) if values else "" diff --git a/api/core/evaluation/runners/workflow_evaluation_runner.py b/api/core/evaluation/runners/workflow_evaluation_runner.py index 76e778ebdf..f38ed35df3 100644 --- a/api/core/evaluation/runners/workflow_evaluation_runner.py +++ b/api/core/evaluation/runners/workflow_evaluation_runner.py @@ -34,44 +34,29 @@ class WorkflowEvaluationRunner(BaseEvaluationRunner): tenant_id: str, ) -> list[EvaluationItemResult]: """Compute workflow evaluation metrics (end-to-end).""" - result_by_index = {r.index: r for r in results} - merged_items = [] - for item in items: - result = result_by_index.get(item.index) - context = [] - if result and result.actual_output: - context.append(result.actual_output) - merged_items.append( - EvaluationItemInput( - index=item.index, - inputs=item.inputs, - expected_output=item.expected_output, - context=context + (item.context or []), - ) - ) - - evaluated = self.evaluation_instance.evaluate_workflow( - merged_items, default_metrics, model_provider, model_name, tenant_id + if not node_run_result_list: + return [] + if not default_metric: + raise ValueError("Default metric is required for workflow evaluation") + merged_items = self._merge_results_into_items(node_run_result_list) + return 
self.evaluation_instance.evaluate_workflow( + merged_items, default_metric.metric, model_provider, model_name, tenant_id ) - # Merge metrics back preserving metadata - eval_by_index = {r.index: r for r in evaluated} - final_results = [] - for result in results: - if result.index in eval_by_index: - eval_result = eval_by_index[result.index] - final_results.append( - EvaluationItemResult( - index=result.index, - actual_output=result.actual_output, - metrics=eval_result.metrics, - metadata=result.metadata, - error=result.error, - ) + @staticmethod + def _merge_results_into_items(items: list[NodeRunResult]) -> list[EvaluationItemInput]: + """Create EvaluationItemInput list from NodeRunResult for workflow evaluation.""" + merged = [] + for i, item in enumerate(items): + output = _extract_workflow_output(item.outputs) + merged.append( + EvaluationItemInput( + index=i, + inputs=dict(item.inputs), + output=output, ) - else: - final_results.append(result) - return final_results + ) + return merged @staticmethod def _extract_output(response: Mapping[str, Any]) -> str: @@ -91,3 +76,13 @@ class WorkflowEvaluationRunner(BaseEvaluationRunner): if isinstance(data, Mapping): return data.get("node_executions", []) return [] + + +def _extract_workflow_output(outputs: Mapping[str, Any]) -> str: + """Extract the primary output text from workflow NodeRunResult.outputs.""" + if "answer" in outputs: + return str(outputs["answer"]) + if "text" in outputs: + return str(outputs["text"]) + values = list(outputs.values()) + return str(values[0]) if values else "" diff --git a/api/models/evaluation.py b/api/models/evaluation.py index 9737242b76..3fdd9a9459 100644 --- a/api/models/evaluation.py +++ b/api/models/evaluation.py @@ -105,6 +105,7 @@ class EvaluationRun(Base): error: Mapped[str | None] = mapped_column(Text, nullable=True) celery_task_id: Mapped[str | None] = mapped_column(String(255), nullable=True) + metrics_summary: Mapped[str | None] = mapped_column(LongText, nullable=True) created_by: Mapped[str] = mapped_column(StringUUID, nullable=False) started_at: Mapped[datetime | None] = mapped_column(DateTime, nullable=True) diff --git a/api/tasks/evaluation_task.py b/api/tasks/evaluation_task.py index 3f4af7487d..664d46918e 100644 --- a/api/tasks/evaluation_task.py +++ b/api/tasks/evaluation_task.py @@ -12,6 +12,7 @@ from configs import dify_config from core.evaluation.base_evaluation_instance import BaseEvaluationInstance from core.evaluation.entities.evaluation_entity import ( EvaluationCategory, + EvaluationDatasetInput, EvaluationItemResult, EvaluationRunData, ) @@ -88,23 +89,23 @@ def _execute_evaluation(session: Any, run_data: EvaluationRunData) -> None: ) results: list[EvaluationItemResult] = _execute_evaluation_runner( - session, - run_data, - evaluation_instance, - node_run_result_mapping_list, + session=session, + run_data=run_data, + evaluation_instance=evaluation_instance, + node_run_result_mapping_list=node_run_result_mapping_list, ) # Compute summary metrics metrics_summary = _compute_metrics_summary(results, run_data.judgment_config) # Generate result XLSX - result_xlsx = _generate_result_xlsx(run_data.items, results) + result_xlsx = _generate_result_xlsx(run_data.input_list, results) # Store result file result_file_id = _store_result_file(run_data.tenant_id, run_data.evaluation_run_id, result_xlsx, session) # Update run to completed - evaluation_run = session.query(EvaluationRun).filter_by(id=run_data.evaluation_run_id).first() + evaluation_run: EvaluationRun = 
session.query(EvaluationRun).filter_by(id=run_data.evaluation_run_id).first() if evaluation_run: evaluation_run.status = EvaluationRunStatus.COMPLETED evaluation_run.completed_at = naive_utc_now() @@ -232,10 +233,10 @@ def _compute_metrics_summary( def _generate_result_xlsx( - items: list[Any], + input_list: list[EvaluationDatasetInput], results: list[EvaluationItemResult], ) -> bytes: - """Generate result XLSX with input data, actual output, and metric scores.""" + """Generate result XLSX with input data, actual output, metric scores, and judgment.""" wb = Workbook() ws = wb.active if ws is None: @@ -261,14 +262,18 @@ def _generate_result_xlsx( # Collect all input keys input_keys: list[str] = [] - for item in items: + for item in input_list: for key in item.inputs: if key not in input_keys: input_keys.append(key) + # Include judgment column only when at least one result has judgment conditions evaluated + has_judgment = any(bool(r.judgment.condition_results) for r in results) + # Build headers + judgment_headers = ["judgment"] if has_judgment else [] headers = ( - ["index"] + input_keys + ["expected_output", "actual_output"] + all_metric_names + ["overall_score", "error"] + ["index"] + input_keys + ["expected_output", "actual_output"] + all_metric_names + judgment_headers + ["error"] ) # Write header row @@ -288,7 +293,7 @@ def _generate_result_xlsx( result_by_index = {r.index: r for r in results} # Write data rows - for row_idx, item in enumerate(items, start=2): + for row_idx, item in enumerate(input_list, start=2): result = result_by_index.get(item.index) col = 1 @@ -317,9 +322,14 @@ def _generate_result_xlsx( ws.cell(row=row_idx, column=col, value=score if score is not None else "").border = thin_border col += 1 - # Overall score - ws.cell(row=row_idx, column=col, value=result.overall_score if result else "").border = thin_border - col += 1 + # Judgment result + if has_judgment: + if result and result.judgment.condition_results: + judgment_value = "Pass" if result.judgment.passed else "Fail" + else: + judgment_value = "" + ws.cell(row=row_idx, column=col, value=judgment_value).border = thin_border + col += 1 # Error ws.cell(row=row_idx, column=col, value=result.error if result else "").border = thin_border
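
Notes and illustrative sketches (editorial, not part of the patch):

The patch centralizes metric naming: canonical names live in the shared EvaluationMetricName StrEnum, each framework keeps its own map from canonical names to internal metric classes, and get_supported_metrics() returns only the canonical names the framework can actually map. The snippet below is a self-contained sketch of that filtering pattern, not the Dify module itself; MetricName, CATEGORY_METRIC_NAMES, and FRAMEWORK_METRIC_MAP are stand-in names, and enum.StrEnum needs Python 3.11+.

    from enum import StrEnum  # Python 3.11+


    class MetricName(StrEnum):
        FAITHFULNESS = "faithfulness"
        ANSWER_RELEVANCY = "answer_relevancy"
        CONTEXT_PRECISION = "context_precision"
        TOOL_CORRECTNESS = "tool_correctness"


    # Canonical metrics requested for a category (stand-in for e.g. RETRIEVAL_METRIC_NAMES).
    CATEGORY_METRIC_NAMES = [MetricName.CONTEXT_PRECISION, MetricName.FAITHFULNESS]

    # What this particular framework implements (stand-in for _DEEPEVAL_METRIC_MAP / _RAGAS_METRIC_MAP).
    FRAMEWORK_METRIC_MAP = {
        MetricName.FAITHFULNESS: "FaithfulnessMetric",
        MetricName.CONTEXT_PRECISION: "ContextualPrecisionMetric",
    }


    def get_supported_metrics(candidates: list[MetricName]) -> list[str]:
        # Keep only the canonical names this framework can map; everything else is skipped.
        return [m for m in candidates if m in FRAMEWORK_METRIC_MAP]


    print([str(m) for m in get_supported_metrics(CATEGORY_METRIC_NAMES)])
    # -> ['context_precision', 'faithfulness']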
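
_build_deepeval_metrics now returns (canonical_name, metric_instance) pairs so callers report scores under the shared canonical name instead of the framework-internal class name, and unsupported names are skipped with a warning rather than raising. A minimal sketch of that pairing, assuming a metric object with a measure()/score interface; FakeFaithfulnessMetric and CLASS_MAP are made-up stand-ins, not deepeval classes.

    import logging
    from typing import Any

    logger = logging.getLogger(__name__)


    class FakeFaithfulnessMetric:
        """Stand-in for a framework metric class exposing measure() and score."""

        def __init__(self, threshold: float = 0.5) -> None:
            self.threshold = threshold
            self.score: float | None = None

        def measure(self, test_case: dict[str, Any]) -> None:
            self.score = 0.9  # a real metric would call an LLM or embedding model here


    CLASS_MAP: dict[str, Any] = {"faithfulness": FakeFaithfulnessMetric}


    def build_metrics(requested: list[str]) -> list[tuple[str, Any]]:
        pairs: list[tuple[str, Any]] = []
        for name in requested:
            metric_class = CLASS_MAP.get(name)
            if metric_class:
                # Keep the canonical name next to the instance so downstream results
                # are recorded under the shared name, not the class name.
                pairs.append((name, metric_class(threshold=0.5)))
            else:
                logger.warning("Metric '%s' is not supported by this framework, skipping", name)
        return pairs


    for canonical_name, metric in build_metrics(["faithfulness", "semantic_similarity"]):
        metric.measure({"input": "q", "actual_output": "a"})
        print(canonical_name, metric.score)  # -> faithfulness 0.9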
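
The new _extract_agent_output, _extract_snippet_output, and _extract_workflow_output helpers all use the same fallback order: prefer an "answer" key, then "text", then the first value, otherwise an empty string. A runnable sketch of that behaviour with example data:

    from collections.abc import Mapping
    from typing import Any


    def extract_output(outputs: Mapping[str, Any]) -> str:
        """Prefer 'answer', then 'text', then the first value; empty string otherwise."""
        if "answer" in outputs:
            return str(outputs["answer"])
        if "text" in outputs:
            return str(outputs["text"])
        values = list(outputs.values())
        return str(values[0]) if values else ""


    print(extract_output({"answer": "42", "text": "ignored"}))  # -> 42
    print(extract_output({"text": "hello"}))                    # -> hello
    print(extract_output({"result": {"k": 1}}))                 # -> {'k': 1}
    print(extract_output({}))                                    # -> (empty string)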
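
In _generate_result_xlsx, the "overall_score" column is replaced by a "judgment" column that is emitted only when at least one result carries evaluated judgment conditions. The sketch below shows that conditional header construction; the Judgment and ItemResult dataclasses are simplified stand-ins for the real result entities, and their exact fields beyond passed and condition_results are assumptions.

    from dataclasses import dataclass, field


    @dataclass
    class Judgment:
        passed: bool = False
        condition_results: list[str] = field(default_factory=list)


    @dataclass
    class ItemResult:
        index: int
        judgment: Judgment = field(default_factory=Judgment)


    def build_headers(input_keys: list[str], metric_names: list[str], results: list[ItemResult]) -> list[str]:
        # The judgment column appears only when some result actually has evaluated conditions.
        has_judgment = any(bool(r.judgment.condition_results) for r in results)
        judgment_headers = ["judgment"] if has_judgment else []
        return ["index"] + input_keys + ["expected_output", "actual_output"] + metric_names + judgment_headers + ["error"]


    results = [ItemResult(0, Judgment(passed=True, condition_results=["score >= 0.8"])), ItemResult(1)]
    print(build_headers(["query"], ["faithfulness"], results))
    # -> ['index', 'query', 'expected_output', 'actual_output', 'faithfulness', 'judgment', 'error']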