From c68194093e4948bc29b40d6e0df78cc4a34e0de0 Mon Sep 17 00:00:00 2001
From: FFXN
Date: Fri, 13 Mar 2026 10:09:38 +0800
Subject: [PATCH] feat: Parse the expression to get the input parameters for
 the evaluation workflow.

---
 .../evaluation/base_evaluation_instance.py | 20 ++++-----
 .../runners/base_evaluation_runner.py      | 42 +++++++++++++++----
 2 files changed, 40 insertions(+), 22 deletions(-)

diff --git a/api/core/evaluation/base_evaluation_instance.py b/api/core/evaluation/base_evaluation_instance.py
index f6c64eac96..c5afaa25a5 100644
--- a/api/core/evaluation/base_evaluation_instance.py
+++ b/api/core/evaluation/base_evaluation_instance.py
@@ -182,14 +182,14 @@ class BaseEvaluationInstance(ABC):
             # Check if the entire value is a single expression.
             full_match = VARIABLE_REGEX.fullmatch(value_source)
             if full_match:
-                workflow_inputs[field_name] = _resolve_variable_selector(
+                workflow_inputs[field_name] = resolve_variable_selector(
                     full_match.group(1), node_run_result_mapping,
                 )
             elif VARIABLE_REGEX.search(value_source):
                 # Mixed template: interpolate all expressions as strings.
                 workflow_inputs[field_name] = VARIABLE_REGEX.sub(
                     lambda m: str(
-                        _resolve_variable_selector(m.group(1), node_run_result_mapping)
+                        resolve_variable_selector(m.group(1), node_run_result_mapping)
                     ),
                     value_source,
                 )
@@ -203,12 +203,7 @@ class BaseEvaluationInstance(ABC):
     def _extract_workflow_metrics(
         response: Mapping[str, object],
     ) -> list[EvaluationMetric]:
-        """Extract evaluation metrics from workflow output variables.
-
-        Each output variable is treated as a metric. The variable name
-        becomes the metric name, and its value is stored as-is regardless
-        of type (numeric, string, dict, etc.).
-        """
+        """Extract evaluation metrics from workflow output variables."""
         metrics: list[EvaluationMetric] = []
 
         data = response.get("data")
@@ -231,15 +226,14 @@ class BaseEvaluationInstance(ABC):
         return metrics
 
 
-def _resolve_variable_selector(
+def resolve_variable_selector(
     selector_raw: str,
     node_run_result_mapping: dict[str, NodeRunResult],
 ) -> object:
-    """Resolve a ``#node_id.output_key#`` selector against node run results.
-    Returns the resolved value in its original type, or an empty string
-    if the node or any key along the path is not found.
-    """
-    # "#node_id.output_key#" → "node_id.output_key"
+    """
+    Resolve a ``#node_id.output_key#`` selector against node run results.
+    """
+    # "#node_id.output_key#" -> "node_id.output_key"
     cleaned = selector_raw.strip("#")
     parts = cleaned.split(".")
diff --git a/api/core/evaluation/runners/base_evaluation_runner.py b/api/core/evaluation/runners/base_evaluation_runner.py
index 0c8f3e4f5f..984f6dd7b9 100644
--- a/api/core/evaluation/runners/base_evaluation_runner.py
+++ b/api/core/evaluation/runners/base_evaluation_runner.py
@@ -67,7 +67,7 @@ class BaseEvaluationRunner(ABC):
         evaluation_run = self.session.query(EvaluationRun).filter_by(id=evaluation_run_id).first()
         if not evaluation_run:
             raise ValueError(f"EvaluationRun {evaluation_run_id} not found")
-        
+
         if not default_metric and not customized_metrics:
             raise ValueError("Either default_metric or customized_metrics must be provided")
 
@@ -144,7 +144,17 @@ class BaseEvaluationRunner(ABC):
         node_run_result_mapping_list: list[dict[str, NodeRunResult]] | None = None,
     ) -> list[EvaluationItemResult]:
         """Apply judgment conditions to each result's metrics.
+
+        Left side (``metric_name``): looked up from evaluate-phase metrics only.
+        Right side: when ``value_source="variable"``, ``condition.value``
+        contains an expression (e.g. ``{{#node_id.output_key#}}``). The
+        expression is parsed and resolved against the corresponding
+        ``node_run_result_mapping`` to obtain the actual comparison value.
         """
+        from core.evaluation.base_evaluation_instance import resolve_variable_selector
+        from core.evaluation.entities.judgment_entity import JudgmentValueSource
+        from core.workflow.nodes.base.variable_template_parser import REGEX as VARIABLE_REGEX
+
         judged_results: list[EvaluationItemResult] = []
 
         for idx, result in enumerate(results):
@@ -155,14 +165,28 @@ class BaseEvaluationRunner(ABC):
             # Left side: only metrics
             metric_values: dict[str, object] = {m.name: m.value for m in result.metrics}
 
-            # Right side variable pool: metrics + intermediate node run results
-            variable_values: dict[str, object] = dict(metric_values)
-            if node_run_result_mapping_list and idx < len(node_run_result_mapping_list):
-                node_run_result_mapping = node_run_result_mapping_list[idx]
-                for node_id, node_result in node_run_result_mapping.items():
-                    if node_result.outputs:
-                        for output_key, output_value in node_result.outputs.items():
-                            variable_values[f"{node_id}.{output_key}"] = output_value
+            # Right side: pre-resolve variable expressions against node run results.
+            # Each condition.value expression (e.g. "{{#llm1.text#}}") is resolved
+            # and stored in variable_values keyed by the raw expression string, so
+            # that JudgmentProcessor._resolve_comparison_value can look it up.
+            variable_values: dict[str, object] = {}
+            node_run_result_mapping = (
+                node_run_result_mapping_list[idx]
+                if node_run_result_mapping_list and idx < len(node_run_result_mapping_list)
+                else {}
+            )
+            for condition in judgment_config.conditions:
+                if (
+                    condition.value_source == JudgmentValueSource.VARIABLE
+                    and isinstance(condition.value, str)
+                    and node_run_result_mapping
+                ):
+                    match = VARIABLE_REGEX.fullmatch(condition.value)
+                    if match:
+                        resolved = resolve_variable_selector(
+                            match.group(1), node_run_result_mapping
+                        )
+                        variable_values[condition.value] = resolved
 
             judgment_result = JudgmentProcessor.evaluate(
                 metric_values, judgment_config, variable_values=variable_values
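
For illustration only (not part of the diff): a minimal, self-contained sketch of the resolution behaviour the renamed helper is relied on for, using simplified stand-ins (StubNodeRunResult and a loose VARIABLE_REGEX pattern) for the repository's NodeRunResult and variable_template_parser.REGEX. It mirrors the full-match versus mixed-template handling in the first hunk and the empty-string fallback described in the removed docstring.

# Illustrative sketch, not part of the patch. StubNodeRunResult and the
# regex below are simplified assumptions; the real implementations live in
# the modules touched by this patch.
import re
from dataclasses import dataclass, field

# Assumed shape of the "{{#node_id.output_key#}}" pattern; the real REGEX in
# core.workflow.nodes.base.variable_template_parser may be stricter.
VARIABLE_REGEX = re.compile(r"\{\{(#[a-zA-Z0-9_.]+#)\}\}")


@dataclass
class StubNodeRunResult:
    """Stand-in for NodeRunResult; only the outputs mapping is modelled."""

    outputs: dict[str, object] = field(default_factory=dict)


def resolve_variable_selector(
    selector_raw: str, node_run_result_mapping: dict[str, StubNodeRunResult]
) -> object:
    """Resolve "#node_id.output_key#" against node outputs, "" when missing."""
    # "#node_id.output_key#" -> ["node_id", "output_key"]
    parts = selector_raw.strip("#").split(".")
    node_result = node_run_result_mapping.get(parts[0])
    if node_result is None:
        return ""
    value: object = node_result.outputs
    for key in parts[1:]:
        if isinstance(value, dict) and key in value:
            value = value[key]
        else:
            return ""
    return value


mapping = {"llm1": StubNodeRunResult(outputs={"text": "PASS", "usage": {"tokens": 42}})}

# A full-match expression keeps the resolved value's original type.
full = VARIABLE_REGEX.fullmatch("{{#llm1.usage.tokens#}}")
assert full and resolve_variable_selector(full.group(1), mapping) == 42

# A mixed template interpolates every expression as a string.
rendered = VARIABLE_REGEX.sub(
    lambda m: str(resolve_variable_selector(m.group(1), mapping)),
    "verdict: {{#llm1.text#}}",
)
assert rendered == "verdict: PASS"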