feat: Parse variable expressions to build the input parameters for the evaluation workflow.

FFXN 2026-03-13 10:09:38 +08:00
parent 18198b88ff
commit c68194093e
2 changed files with 40 additions and 22 deletions

View File: core/evaluation/base_evaluation_instance.py

@@ -182,14 +182,14 @@ class BaseEvaluationInstance(ABC):
         # Check if the entire value is a single expression.
         full_match = VARIABLE_REGEX.fullmatch(value_source)
         if full_match:
-            workflow_inputs[field_name] = _resolve_variable_selector(
+            workflow_inputs[field_name] = resolve_variable_selector(
                 full_match.group(1), node_run_result_mapping,
             )
         elif VARIABLE_REGEX.search(value_source):
             # Mixed template: interpolate all expressions as strings.
             workflow_inputs[field_name] = VARIABLE_REGEX.sub(
                 lambda m: str(
-                    _resolve_variable_selector(m.group(1), node_run_result_mapping)
+                    resolve_variable_selector(m.group(1), node_run_result_mapping)
                 ),
                 value_source,
             )
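
The two branches above draw a type distinction: when the whole value is a single expression, the resolved value keeps its original type; in a mixed template, every match is stringified and interpolated. A minimal sketch of that behavior, assuming a `{{#node_id.output_key#}}` shape for VARIABLE_REGEX and a toy resolver (the real ones live in variable_template_parser and this module):

```python
import re

VARIABLE_REGEX = re.compile(r"\{\{(#[^{}#]+#)\}\}")  # assumed shape of the real pattern

def toy_resolve(selector: str, mapping: dict) -> object:
    # stand-in for resolve_variable_selector: "#calc.score#" -> mapping["calc"]["score"]
    node_id, key = selector.strip("#").split(".", 1)
    return mapping[node_id][key]

mapping = {"calc": {"score": 0.87}}

# Branch 1: the entire value is one expression -> original type preserved.
full = VARIABLE_REGEX.fullmatch("{{#calc.score#}}")
print(toy_resolve(full.group(1), mapping))  # 0.87 (a float)

# Branch 2: mixed template -> each expression is interpolated as a string.
print(VARIABLE_REGEX.sub(lambda m: str(toy_resolve(m.group(1), mapping)),
                         "score is {{#calc.score#}}"))  # "score is 0.87"
```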
@@ -203,12 +203,7 @@ class BaseEvaluationInstance(ABC):
 def _extract_workflow_metrics(
     response: Mapping[str, object],
 ) -> list[EvaluationMetric]:
-    """Extract evaluation metrics from workflow output variables.
-    Each output variable is treated as a metric. The variable name
-    becomes the metric name, and its value is stored as-is regardless
-    of type (numeric, string, dict, etc.).
-    """
+    """Extract evaluation metrics from workflow output variables."""
     metrics: list[EvaluationMetric] = []
     data = response.get("data")
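
Per the removed docstring, each workflow output variable becomes one metric: the variable name is the metric name and the value is stored as-is, whatever its type. Roughly, under an assumed `data.outputs` response shape (the hunk only shows `response.get("data")`):

```python
response = {"data": {"outputs": {"accuracy": 0.9, "verdict": "pass"}}}
# _extract_workflow_metrics(response) would then yield something like:
#   [EvaluationMetric(name="accuracy", value=0.9),
#    EvaluationMetric(name="verdict", value="pass")]
```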
@@ -231,15 +226,14 @@ class BaseEvaluationInstance(ABC):
     return metrics
 
-def _resolve_variable_selector(
+def resolve_variable_selector(
     selector_raw: str,
     node_run_result_mapping: dict[str, NodeRunResult],
 ) -> object:
-    """Resolve a ``#node_id.output_key#`` selector against node run results.
-    Returns the resolved value in its original type, or an empty string
-    if the node or any key along the path is not found.
-    """
-    # "#node_id.output_key#" → "node_id.output_key"
+    """
+    Resolve a ``#node_id.output_key#`` selector against node run results.
+    """
+    # "#node_id.output_key#" → "node_id.output_key"
     cleaned = selector_raw.strip("#")
     parts = cleaned.split(".")
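
Only the first two lines of the resolver body appear in the hunk. Combining them with the removed docstring (original type preserved; empty string when the node or any key along the path is missing), the rest plausibly walks the selector path as below. This is a sketch under those assumptions, not the actual implementation:

```python
from dataclasses import dataclass, field

@dataclass
class NodeRunResult:  # minimal stand-in for the real entity
    outputs: dict = field(default_factory=dict)

def resolve_variable_selector(
    selector_raw: str,
    node_run_result_mapping: dict[str, NodeRunResult],
) -> object:
    cleaned = selector_raw.strip("#")  # "#node_id.output_key#" -> "node_id.output_key"
    parts = cleaned.split(".")
    node_result = node_run_result_mapping.get(parts[0])
    if node_result is None:
        return ""  # node not found -> empty string, per the old docstring
    value: object = node_result.outputs or {}
    for key in parts[1:]:  # walk any remaining keys along the path
        if isinstance(value, dict) and key in value:
            value = value[key]
        else:
            return ""  # missing key along the path -> empty string
    return value  # resolved value keeps its original type

print(resolve_variable_selector(
    "#llm1.text#", {"llm1": NodeRunResult(outputs={"text": "ok"})}
))  # "ok"
```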

View File

@@ -67,7 +67,7 @@ class BaseEvaluationRunner(ABC):
         evaluation_run = self.session.query(EvaluationRun).filter_by(id=evaluation_run_id).first()
         if not evaluation_run:
             raise ValueError(f"EvaluationRun {evaluation_run_id} not found")
 
         if not default_metric and not customized_metrics:
             raise ValueError("Either default_metric or customized_metrics must be provided")
@ -144,7 +144,17 @@ class BaseEvaluationRunner(ABC):
node_run_result_mapping_list: list[dict[str, NodeRunResult]] | None = None,
) -> list[EvaluationItemResult]:
"""Apply judgment conditions to each result's metrics.
Left side (``metric_name``): looked up from evaluate-phase metrics only.
Right side: when ``value_source="variable"``, ``condition.value``
contains an expression (e.g. ``{{#node_id.output_key#}}``). The
expression is parsed and resolved against the corresponding
``node_run_result_mapping`` to obtain the actual comparison value.
"""
from core.evaluation.base_evaluation_instance import resolve_variable_selector
from core.evaluation.entities.judgment_entity import JudgmentValueSource
from core.workflow.nodes.base.variable_template_parser import REGEX as VARIABLE_REGEX
judged_results: list[EvaluationItemResult] = []
for idx, result in enumerate(results):
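
A condition of the kind this docstring describes might look like the following. The diff only confirms the `metric_name`, `value_source`, and `value` fields; the operator field name here is an assumption:

```python
condition = {
    "metric_name": "accuracy",      # left side: looked up in evaluate-phase metrics only
    "comparison_operator": ">=",    # assumed field name
    "value_source": "variable",     # right side comes from a node run result
    "value": "{{#llm1.score#}}",    # expression resolved against node_run_result_mapping
}
```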
@@ -155,14 +165,28 @@ class BaseEvaluationRunner(ABC):
             # Left side: only metrics
             metric_values: dict[str, object] = {m.name: m.value for m in result.metrics}
 
-            # Right side variable pool: metrics + intermediate node run results
-            variable_values: dict[str, object] = dict(metric_values)
-            if node_run_result_mapping_list and idx < len(node_run_result_mapping_list):
-                node_run_result_mapping = node_run_result_mapping_list[idx]
-                for node_id, node_result in node_run_result_mapping.items():
-                    if node_result.outputs:
-                        for output_key, output_value in node_result.outputs.items():
-                            variable_values[f"{node_id}.{output_key}"] = output_value
+            # Right side: pre-resolve variable expressions against node run results.
+            # Each condition.value expression (e.g. "{{#llm1.text#}}") is resolved
+            # and stored in variable_values keyed by the raw expression string, so
+            # that JudgmentProcessor._resolve_comparison_value can look it up.
+            variable_values: dict[str, object] = {}
+            node_run_result_mapping = (
+                node_run_result_mapping_list[idx]
+                if node_run_result_mapping_list and idx < len(node_run_result_mapping_list)
+                else {}
+            )
+            for condition in judgment_config.conditions:
+                if (
+                    condition.value_source == JudgmentValueSource.VARIABLE
+                    and isinstance(condition.value, str)
+                    and node_run_result_mapping
+                ):
+                    match = VARIABLE_REGEX.fullmatch(condition.value)
+                    if match:
+                        resolved = resolve_variable_selector(
+                            match.group(1), node_run_result_mapping
+                        )
+                        variable_values[condition.value] = resolved
             judgment_result = JudgmentProcessor.evaluate(
                 metric_values, judgment_config, variable_values=variable_values
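
Putting the right-hand-side handling together: for each variable-sourced condition the raw expression string itself becomes the key in variable_values, so the judgment processor can look the resolved value up verbatim. A self-contained sketch of that step, with the VARIABLE_REGEX pattern and NodeRunResult stand-in assumed as in the earlier sketches:

```python
import re
from dataclasses import dataclass, field

VARIABLE_REGEX = re.compile(r"\{\{(#[^{}#]+#)\}\}")  # assumed pattern

@dataclass
class NodeRunResult:  # minimal stand-in for the real entity
    outputs: dict = field(default_factory=dict)

def resolve_variable_selector(selector_raw: str, mapping: dict[str, NodeRunResult]) -> object:
    node_id, key = selector_raw.strip("#").split(".", 1)
    node_result = mapping.get(node_id)
    return node_result.outputs.get(key, "") if node_result else ""

node_run_result_mapping = {"llm1": NodeRunResult(outputs={"score": 0.92})}
expr = "{{#llm1.score#}}"  # condition.value with value_source == "variable"

variable_values: dict[str, object] = {}
match = VARIABLE_REGEX.fullmatch(expr)
if match:
    # keyed by the raw expression string, as the new comment in the hunk explains
    variable_values[expr] = resolve_variable_selector(match.group(1), node_run_result_mapping)

assert variable_values == {"{{#llm1.score#}}": 0.92}
```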