From 99d3c645b81889528dec2b51554cc163765d2739 Mon Sep 17 00:00:00 2001
From: FFXN
Date: Thu, 5 Mar 2026 13:36:05 +0800
Subject: [PATCH] feat: Implement customized evaluation in
 BaseEvaluationInstance.

---
 .../runners/base_evaluation_runner.py | 52 ++-----------------
 1 file changed, 4 insertions(+), 48 deletions(-)

diff --git a/api/core/evaluation/runners/base_evaluation_runner.py b/api/core/evaluation/runners/base_evaluation_runner.py
index 934ef7eeb9..24fbea5ee6 100644
--- a/api/core/evaluation/runners/base_evaluation_runner.py
+++ b/api/core/evaluation/runners/base_evaluation_runner.py
@@ -29,20 +29,7 @@ logger = logging.getLogger(__name__)
 
 
 class BaseEvaluationRunner(ABC):
-    """Abstract base class for evaluation runners.
-
-    Runners are responsible for executing the target (App/Snippet/Retrieval)
-    to collect actual outputs, then computing evaluation metrics, optionally
-    applying judgment conditions, and persisting results.
-
-    Built-in capabilities (implemented in this base class):
-    - Customized workflow dispatch (``_evaluate_customized``)
-    - Judgment condition evaluation (``_apply_judgment``)
-
-    Subclass responsibilities:
-    - ``execute_target`` — target-specific execution logic
-    - ``evaluate_metrics`` — framework-specific metric computation (RAGAS etc.)
-    """
+    """Abstract base class for evaluation runners."""
 
     def __init__(self, evaluation_instance: BaseEvaluationInstance, session: Session):
         self.evaluation_instance = evaluation_instance
@@ -69,18 +56,7 @@ class BaseEvaluationRunner(ABC):
         model_name: str,
         tenant_id: str,
     ) -> list[EvaluationItemResult]:
-        """Compute evaluation metrics on the collected results.
-
-        Called only when the evaluation is NOT using a customized workflow
-        (i.e. ``metrics_config`` does not contain ``workflow_id``).
-
-        Implementations should:
-        1. Merge ``actual_output`` from ``results`` into the ``context``
-           field of each ``EvaluationItemInput``.
-        2. Call ``self.evaluation_instance.evaluate_xxx()`` with the
-           merged items.
-        3. Return updated results with metrics populated.
-        """
+        """Compute evaluation metrics on the collected results."""
         ...
 
     def run(
@@ -131,13 +107,10 @@
         if successful_items and successful_results:
             try:
                 if _is_customized_evaluation(metrics_config):
-                    # Customized workflow evaluation — target-type agnostic,
-                    # handled via BaseEvaluationInstance.evaluate_with_customized_workflow().
                     evaluated_results = self._evaluate_customized(
                         successful_items, successful_results, metrics_config, tenant_id,
                     )
                 else:
-                    # Framework-specific evaluation — delegate to subclass
                     evaluated_results = self.evaluate_metrics(
                         successful_items, successful_results, metrics_config,
                         model_provider, model_name, tenant_id,
@@ -176,10 +149,6 @@
 
         return results
 
-    # ------------------------------------------------------------------
-    # Customized workflow evaluation dispatch
-    # ------------------------------------------------------------------
-
     def _evaluate_customized(
         self,
         items: list[EvaluationItemInput],
         results: list[EvaluationItemResult],
         metrics_config: dict,
         tenant_id: str,
     ) -> list[EvaluationItemResult]:
-        """Delegate to the instance's customized workflow evaluator.
-
-        Unlike the framework path (which merges ``actual_output`` into
-        ``context``), here we pass ``results`` directly — the instance's
-        ``evaluate_with_customized_workflow()`` reads ``actual_output``
-        from each ``EvaluationItemResult``.
-        """
+        """Delegate to the instance's customized workflow evaluator."""
         evaluated = self.evaluation_instance.evaluate_with_customized_workflow(
             items, results, metrics_config, tenant_id,
         )
@@ -217,9 +180,6 @@
             final_results.append(result)
         return final_results
 
-    # ------------------------------------------------------------------
-    # Judgment (target-type agnostic)
-    # ------------------------------------------------------------------
 
     @staticmethod
     def _apply_judgment(
@@ -268,9 +228,5 @@
 
 
 def _is_customized_evaluation(metrics_config: dict[str, Any]) -> bool:
-    """Check if metrics_config indicates a customized workflow evaluation.
-
-    The convention is that ``metrics_config["workflow_id"]`` is present
-    when a user-defined workflow should be used for evaluation.
-    """
+    """Check if metrics_config indicates a customized workflow evaluation."""
     return bool(metrics_config.get("workflow_id"))