diff --git a/api/configs/feature/__init__.py b/api/configs/feature/__init__.py index 47f83d1ec0..78862c1384 100644 --- a/api/configs/feature/__init__.py +++ b/api/configs/feature/__init__.py @@ -1383,7 +1383,7 @@ class EvaluationConfig(BaseSettings): EVALUATION_MAX_DATASET_ROWS: PositiveInt = Field( description="Maximum number of rows allowed in an evaluation dataset", - default=1000, + default=500, ) EVALUATION_TASK_TIMEOUT: PositiveInt = Field( diff --git a/api/controllers/console/__init__.py b/api/controllers/console/__init__.py index 086d6f15a3..5f038ac73e 100644 --- a/api/controllers/console/__init__.py +++ b/api/controllers/console/__init__.py @@ -106,6 +106,9 @@ from .datasets.rag_pipeline import ( rag_pipeline_workflow, ) +# Import evaluation controllers +from .evaluation import evaluation + # Import explore controllers from .explore import ( banner, @@ -116,9 +119,6 @@ from .explore import ( trial, ) -# Import evaluation controllers -from .evaluation import evaluation - # Import snippet controllers from .snippets import snippet_workflow diff --git a/api/controllers/console/evaluation/evaluation.py b/api/controllers/console/evaluation/evaluation.py index 1b0445728d..362e613b40 100644 --- a/api/controllers/console/evaluation/evaluation.py +++ b/api/controllers/console/evaluation/evaluation.py @@ -153,9 +153,7 @@ def get_evaluation_target(view_func: Callable[P, R]): target: Union[App, CustomizedSnippet] | None = None if target_type == "app": - target = ( - db.session.query(App).where(App.id == target_id, App.tenant_id == current_tenant_id).first() - ) + target = db.session.query(App).where(App.id == target_id, App.tenant_id == current_tenant_id).first() elif target_type == "snippets": target = ( db.session.query(CustomizedSnippet) @@ -229,9 +227,7 @@ class EvaluationDetailApi(Resource): _, current_tenant_id = current_account_with_tenant() with Session(db.engine, expire_on_commit=False) as session: - config = EvaluationService.get_evaluation_config( - session, current_tenant_id, target_type, str(target.id) - ) + config = EvaluationService.get_evaluation_config(session, current_tenant_id, target_type, str(target.id)) if config is None: return { @@ -360,9 +356,7 @@ class EvaluationRunApi(Resource): # Load dataset file upload_file = ( - db.session.query(UploadFile) - .filter_by(id=run_request.file_id, tenant_id=current_tenant_id) - .first() + db.session.query(UploadFile).filter_by(id=run_request.file_id, tenant_id=current_tenant_id).first() ) if not upload_file: raise NotFound("Dataset file not found.") @@ -397,9 +391,7 @@ class EvaluationRunApi(Resource): return {"message": str(e.description)}, 400 -@console_ns.route( - "///evaluation/runs/" -) +@console_ns.route("///evaluation/runs/") class EvaluationRunDetailApi(Resource): @console_ns.doc("get_evaluation_run_detail") @console_ns.response(200, "Evaluation run detail retrieved") @@ -444,9 +436,7 @@ class EvaluationRunDetailApi(Resource): return {"message": str(e.description)}, 404 -@console_ns.route( - "///evaluation/runs//cancel" -) +@console_ns.route("///evaluation/runs//cancel") class EvaluationRunCancelApi(Resource): @console_ns.doc("cancel_evaluation_run") @console_ns.response(200, "Evaluation run cancelled") @@ -599,6 +589,7 @@ def _serialize_evaluation_run_item(item: EvaluationRunItem) -> dict[str, object] "expected_output": item.expected_output, "actual_output": item.actual_output, "metrics": item.metrics_list, + "judgment": item.judgment_dict, "metadata": item.metadata_dict, "error": item.error, "overall_score": 
item.overall_score, diff --git a/api/controllers/console/workspace/snippets.py b/api/controllers/console/workspace/snippets.py index 51b66d2e55..bb5fea043e 100644 --- a/api/controllers/console/workspace/snippets.py +++ b/api/controllers/console/workspace/snippets.py @@ -237,9 +237,7 @@ class CustomizedSnippetExportApi(Resource): with Session(db.engine) as session: export_service = SnippetDslService(session) - result = export_service.export_snippet_dsl( - snippet=snippet, include_secret=query.include_secret == "true" - ) + result = export_service.export_snippet_dsl(snippet=snippet, include_secret=query.include_secret == "true") return {"data": result}, 200 diff --git a/api/core/evaluation/base_evaluation_instance.py b/api/core/evaluation/base_evaluation_instance.py index c5afaa25a5..9aeef0d0b5 100644 --- a/api/core/evaluation/base_evaluation_instance.py +++ b/api/core/evaluation/base_evaluation_instance.py @@ -16,7 +16,7 @@ logger = logging.getLogger(__name__) class BaseEvaluationInstance(ABC): - """Abstract base class for evaluation framework adapters. """ + """Abstract base class for evaluation framework adapters.""" @abstractmethod def evaluate_llm( @@ -95,33 +95,26 @@ class BaseEvaluationInstance(ABC): workflow_id = customized_metrics.evaluation_workflow_id if not workflow_id: - raise ValueError( - "customized_metrics must contain 'evaluation_workflow_id' for customized evaluator" - ) + raise ValueError("customized_metrics must contain 'evaluation_workflow_id' for customized evaluator") # Load the evaluator workflow resources using a dedicated session with Session(db.engine, expire_on_commit=False) as session, session.begin(): - app = session.query(App).filter_by( - id=workflow_id, tenant_id=tenant_id - ).first() + app = session.query(App).filter_by(id=workflow_id, tenant_id=tenant_id).first() if not app: - raise ValueError( - f"Evaluation workflow app {workflow_id} not found in tenant {tenant_id}" - ) + raise ValueError(f"Evaluation workflow app {workflow_id} not found in tenant {tenant_id}") service_account = get_service_account_for_app(session, workflow_id) workflow_service = WorkflowService() published_workflow = workflow_service.get_published_workflow(app_model=app) if not published_workflow: - raise ValueError( - f"No published workflow found for evaluation app {workflow_id}" - ) + raise ValueError(f"No published workflow found for evaluation app {workflow_id}") eval_results: list[EvaluationItemResult] = [] for idx, node_run_result_mapping in enumerate(node_run_result_mapping_list): try: workflow_inputs = self._build_workflow_inputs( - customized_metrics.input_fields, node_run_result_mapping, + customized_metrics.input_fields, + node_run_result_mapping, ) generator = WorkflowAppGenerator() @@ -183,14 +176,13 @@ class BaseEvaluationInstance(ABC): full_match = VARIABLE_REGEX.fullmatch(value_source) if full_match: workflow_inputs[field_name] = resolve_variable_selector( - full_match.group(1), node_run_result_mapping, + full_match.group(1), + node_run_result_mapping, ) elif VARIABLE_REGEX.search(value_source): # Mixed template: interpolate all expressions as strings. 
workflow_inputs[field_name] = VARIABLE_REGEX.sub( - lambda m: str( - resolve_variable_selector(m.group(1), node_run_result_mapping) - ), + lambda m: str(resolve_variable_selector(m.group(1), node_run_result_mapping)), value_source, ) else: @@ -213,9 +205,7 @@ class BaseEvaluationInstance(ABC): outputs = data.get("outputs") if not isinstance(outputs, dict): - logger.warning( - "Unexpected workflow response format: 'outputs' is not a dict" - ) + logger.warning("Unexpected workflow response format: 'outputs' is not a dict") return metrics for key, raw_value in outputs.items(): @@ -239,7 +229,8 @@ def resolve_variable_selector( if len(parts) < 2: logger.warning( - "Selector '%s' must have at least node_id.output_key", selector_raw, + "Selector '%s' must have at least node_id.output_key", + selector_raw, ) return "" @@ -250,7 +241,8 @@ def resolve_variable_selector( if not node_result or not node_result.outputs: logger.warning( "Selector '%s': node '%s' not found or has no outputs", - selector_raw, node_id, + selector_raw, + node_id, ) return "" @@ -262,14 +254,17 @@ def resolve_variable_selector( if next_val is None: logger.warning( "Selector '%s': key '%s' not found in node '%s' outputs", - selector_raw, key, node_id, + selector_raw, + key, + node_id, ) return "" current = next_val else: logger.warning( "Selector '%s': cannot traverse into non-dict value at key '%s'", - selector_raw, key, + selector_raw, + key, ) return "" diff --git a/api/core/evaluation/entities/config_entity.py b/api/core/evaluation/entities/config_entity.py index c19f965dd6..ef9f22f185 100644 --- a/api/core/evaluation/entities/config_entity.py +++ b/api/core/evaluation/entities/config_entity.py @@ -11,14 +11,17 @@ class EvaluationFrameworkEnum(StrEnum): class BaseEvaluationConfig(BaseModel): """Base configuration for evaluation frameworks.""" + pass class RagasConfig(BaseEvaluationConfig): """RAGAS-specific configuration.""" + pass class DeepEvalConfig(BaseEvaluationConfig): """DeepEval-specific configuration.""" + pass diff --git a/api/core/evaluation/entities/evaluation_entity.py b/api/core/evaluation/entities/evaluation_entity.py index f61dea2f15..8d065888a3 100644 --- a/api/core/evaluation/entities/evaluation_entity.py +++ b/api/core/evaluation/entities/evaluation_entity.py @@ -40,9 +40,10 @@ class EvaluationItemResult(BaseModel): actual_output: str | None = None metrics: list[EvaluationMetric] = Field(default_factory=list) metadata: dict[str, Any] = Field(default_factory=dict) - judgment: JudgmentResult | None = None + judgment: JudgmentResult = Field(default_factory=JudgmentResult) error: str | None = None + class NodeInfo(BaseModel): node_id: str type: str @@ -67,6 +68,7 @@ class CustomizedMetrics(BaseModel): class EvaluationConfigData(BaseModel): """Structured data for saving evaluation configuration.""" + evaluation_model: str = "" evaluation_model_provider: str = "" default_metrics: list[DefaultMetric] = Field(default_factory=list) @@ -76,11 +78,13 @@ class EvaluationConfigData(BaseModel): class EvaluationRunRequest(EvaluationConfigData): """Request body for starting an evaluation run.""" + file_id: str class EvaluationRunData(BaseModel): """Serializable data for Celery task.""" + evaluation_run_id: str tenant_id: str target_type: str diff --git a/api/core/evaluation/entities/judgment_entity.py b/api/core/evaluation/entities/judgment_entity.py index 58e5fe91e3..ee9bd64a2d 100644 --- a/api/core/evaluation/entities/judgment_entity.py +++ b/api/core/evaluation/entities/judgment_entity.py @@ -1,11 +1,6 @@ 
"""Judgment condition entities for evaluation metric assessment. Key concepts: - - **value_source**: Where the comparison target comes from. - - "constant": a literal value supplied by the user (e.g. threshold "0.8"). - - "variable": a named field from the evaluation target's runtime data - (inputs, actual_output, expected_output). The ``value`` field holds the - variable key; the actual comparison value is resolved at evaluation time. - **condition_type**: Determines operator semantics and type coercion. - "string": string operators (contains, is, start with, …). - "number": numeric operators (>, <, =, ≠, ≥, ≤). @@ -18,34 +13,19 @@ Typical usage: JudgmentCondition( metric_name="faithfulness", comparison_operator=">", - value="0.8", + condition_value="0.8", condition_type="number", - ), - JudgmentCondition( - metric_name="output", - comparison_operator="contains", - value="expected_output", - value_source="variable", - condition_type="string", - ), + ) ], ) """ -from collections.abc import Sequence from enum import StrEnum from typing import Any, Literal from pydantic import BaseModel, Field -class JudgmentValueSource(StrEnum): - """Where the comparison target value comes from.""" - - CONSTANT = "constant" - VARIABLE = "variable" - - class JudgmentConditionType(StrEnum): """Category of the condition, controls operator semantics and type coercion.""" @@ -90,22 +70,15 @@ class JudgmentCondition(BaseModel): metric_name: The name of the evaluation metric to check (left side). Must match an EvaluationMetric.name in the results. comparison_operator: The comparison operator to apply. - value: The comparison target (right side). - - When value_source is "constant": the literal threshold/expected value. - - When value_source is "variable": the variable key name to look up - from the evaluation target's runtime data. - For unary operators (empty, null, etc.), this can be None. - value_source: Where the comparison value comes from. - "constant" (default) for user-supplied literals, - "variable" for references to evaluation target data. + condition_value: The comparison target (right side). For unary operators + such as ``empty`` or ``null`` this can be ``None``. condition_type: Controls type coercion and which operators are valid. "string" (default), "number", or "datetime". 
""" metric_name: str comparison_operator: JudgmentComparisonOperator - value: str | Sequence[str] | None = None - value_source: JudgmentValueSource = JudgmentValueSource.CONSTANT + condition_value: Any | None = None condition_type: JudgmentConditionType = JudgmentConditionType.STRING diff --git a/api/core/evaluation/frameworks/deepeval/deepeval_evaluator.py b/api/core/evaluation/frameworks/deepeval/deepeval_evaluator.py index 72d84ef9ec..3ea6739bdc 100644 --- a/api/core/evaluation/frameworks/deepeval/deepeval_evaluator.py +++ b/api/core/evaluation/frameworks/deepeval/deepeval_evaluator.py @@ -67,9 +67,7 @@ class DeepEvalEvaluator(BaseEvaluationInstance): model_name: str, tenant_id: str, ) -> list[EvaluationItemResult]: - return self._evaluate( - items, metric_name, model_provider, model_name, tenant_id, EvaluationCategory.RETRIEVAL - ) + return self._evaluate(items, metric_name, model_provider, model_name, tenant_id, EvaluationCategory.RETRIEVAL) def evaluate_agent( self, @@ -79,9 +77,7 @@ class DeepEvalEvaluator(BaseEvaluationInstance): model_name: str, tenant_id: str, ) -> list[EvaluationItemResult]: - return self._evaluate( - items, metric_name, model_provider, model_name, tenant_id, EvaluationCategory.AGENT - ) + return self._evaluate(items, metric_name, model_provider, model_name, tenant_id, EvaluationCategory.AGENT) def evaluate_workflow( self, @@ -91,9 +87,7 @@ class DeepEvalEvaluator(BaseEvaluationInstance): model_name: str, tenant_id: str, ) -> list[EvaluationItemResult]: - return self._evaluate( - items, metric_name, model_provider, model_name, tenant_id, EvaluationCategory.WORKFLOW - ) + return self._evaluate(items, metric_name, model_provider, model_name, tenant_id, EvaluationCategory.WORKFLOW) def _evaluate( self, @@ -106,11 +100,7 @@ class DeepEvalEvaluator(BaseEvaluationInstance): ) -> list[EvaluationItemResult]: """Core evaluation logic using DeepEval.""" model_wrapper = DifyModelWrapper(model_provider, model_name, tenant_id) - requested_metrics = ( - [metric_name] - if metric_name - else self.get_supported_metrics(category) - ) + requested_metrics = [metric_name] if metric_name else self.get_supported_metrics(category) try: return self._evaluate_with_deepeval(items, requested_metrics, category) @@ -144,9 +134,7 @@ class DeepEvalEvaluator(BaseEvaluationInstance): try: metric.measure(test_case) if metric.score is not None: - metrics.append( - EvaluationMetric(name=metric.__class__.__name__, value=float(metric.score)) - ) + metrics.append(EvaluationMetric(name=metric.__class__.__name__, value=float(metric.score))) except Exception: logger.exception( "Failed to compute metric %s for item %d", @@ -229,9 +217,7 @@ class DeepEvalEvaluator(BaseEvaluationInstance): parts.append(f"\nExpected Output: {item.expected_output}") if item.context: parts.append(f"\nContext: {'; '.join(item.context)}") - parts.append( - "\nRespond with ONLY a single floating point number between 0.0 and 1.0, nothing else." 
- ) + parts.append("\nRespond with ONLY a single floating point number between 0.0 and 1.0, nothing else.") return "\n".join(parts) @staticmethod diff --git a/api/core/evaluation/frameworks/ragas/ragas_evaluator.py b/api/core/evaluation/frameworks/ragas/ragas_evaluator.py index 88588f0d6e..842dfb29eb 100644 --- a/api/core/evaluation/frameworks/ragas/ragas_evaluator.py +++ b/api/core/evaluation/frameworks/ragas/ragas_evaluator.py @@ -57,9 +57,7 @@ class RagasEvaluator(BaseEvaluationInstance): model_name: str, tenant_id: str, ) -> list[EvaluationItemResult]: - return self._evaluate( - items, metric_name, model_provider, model_name, tenant_id, EvaluationCategory.RETRIEVAL - ) + return self._evaluate(items, metric_name, model_provider, model_name, tenant_id, EvaluationCategory.RETRIEVAL) def evaluate_agent( self, @@ -79,9 +77,7 @@ class RagasEvaluator(BaseEvaluationInstance): model_name: str, tenant_id: str, ) -> list[EvaluationItemResult]: - return self._evaluate( - items, metric_name, model_provider, model_name, tenant_id, EvaluationCategory.WORKFLOW - ) + return self._evaluate(items, metric_name, model_provider, model_name, tenant_id, EvaluationCategory.WORKFLOW) def _evaluate( self, @@ -94,11 +90,7 @@ class RagasEvaluator(BaseEvaluationInstance): ) -> list[EvaluationItemResult]: """Core evaluation logic using RAGAS.""" model_wrapper = DifyModelWrapper(model_provider, model_name, tenant_id) - requested_metrics = ( - [metric_name] - if metric_name - else self.get_supported_metrics(category) - ) + requested_metrics = [metric_name] if metric_name else self.get_supported_metrics(category) try: return self._evaluate_with_ragas(items, requested_metrics, model_wrapper, category) @@ -237,9 +229,7 @@ class RagasEvaluator(BaseEvaluationInstance): parts.append(f"\nExpected Output: {item.expected_output}") if item.context: parts.append(f"\nContext: {'; '.join(item.context)}") - parts.append( - "\nRespond with ONLY a single floating point number between 0.0 and 1.0, nothing else." - ) + parts.append("\nRespond with ONLY a single floating point number between 0.0 and 1.0, nothing else.") return "\n".join(parts) @staticmethod diff --git a/api/core/evaluation/judgment/processor.py b/api/core/evaluation/judgment/processor.py index d28c1e10f0..04b0ac9924 100644 --- a/api/core/evaluation/judgment/processor.py +++ b/api/core/evaluation/judgment/processor.py @@ -1,35 +1,20 @@ """Judgment condition processor for evaluation metrics. Evaluates pass/fail judgment conditions against evaluation metric values. -Reuses the core comparison engine from the workflow condition system -(core.workflow.utils.condition.processor._evaluate_condition) to ensure -consistent operator semantics across the platform. +Each condition uses: + - ``metric_name`` as the left-hand side lookup key from ``metric_values`` + - ``comparison_operator`` as the operator + - ``condition_value`` as the right-hand side comparison value -The processor is intentionally decoupled from evaluation frameworks -(RAGAS / Customized) and runners. It operates on plain ``dict`` mappings -and can be invoked from any context. 
- -Typical usage:: - - metrics = {"faithfulness": 0.85, "answer_relevancy": 0.6} - variables = {"expected_output": "Hello World", "created_at": "2025-01-01T00:00:00"} - config = JudgmentConfig( - logical_operator="and", - conditions=[ - JudgmentCondition(metric_name="faithfulness", comparison_operator=">", - value="0.8", condition_type="number"), - JudgmentCondition(metric_name="output", comparison_operator="contains", - value="expected_output", value_source="variable", - condition_type="string"), - ], - ) - result = JudgmentProcessor.evaluate(metrics, config, variable_values=variables) +The processor is intentionally decoupled from evaluation frameworks and +runners. It operates on plain ``dict`` mappings and can be invoked anywhere +that already has per-item metric results. """ import logging from collections.abc import Sequence from datetime import datetime -from typing import Any +from typing import Any, cast from core.evaluation.entities.judgment_entity import ( JudgmentCondition, @@ -37,9 +22,9 @@ from core.evaluation.entities.judgment_entity import ( JudgmentConditionType, JudgmentConfig, JudgmentResult, - JudgmentValueSource, ) -from core.workflow.utils.condition.processor import _evaluate_condition +from core.workflow.utils.condition.entities import SupportedComparisonOperator +from core.workflow.utils.condition.processor import _evaluate_condition # pyright: ignore[reportPrivateUsage] logger = logging.getLogger(__name__) @@ -48,12 +33,10 @@ _UNARY_OPERATORS = frozenset({"null", "not null", "empty", "not empty"}) class JudgmentProcessor: - @staticmethod def evaluate( metric_values: dict[str, Any], config: JudgmentConfig, - variable_values: dict[str, Any] | None = None, ) -> JudgmentResult: """Evaluate all judgment conditions against the given metric values. @@ -61,9 +44,6 @@ class JudgmentProcessor: metric_values: Mapping of metric name → metric value (e.g. ``{"faithfulness": 0.85, "status": "success"}``). config: The judgment configuration with logical_operator and conditions. - variable_values: Optional mapping of variable name → value, used when - a condition's ``value_source`` is ``"variable"``. Typically built - from the evaluation target's inputs / outputs. Returns: JudgmentResult with overall pass/fail and per-condition details. @@ -78,9 +58,7 @@ class JudgmentProcessor: condition_results: list[JudgmentConditionResult] = [] for condition in config.conditions: - result = JudgmentProcessor._evaluate_single_condition( - metric_values, condition, variable_values - ) + result = JudgmentProcessor._evaluate_single_condition(metric_values, condition) condition_results.append(result) if config.logical_operator == "and" and not result.passed: @@ -112,14 +90,12 @@ class JudgmentProcessor: def _evaluate_single_condition( metric_values: dict[str, Any], condition: JudgmentCondition, - variable_values: dict[str, Any] | None = None, ) -> JudgmentConditionResult: """Evaluate a single judgment condition. Steps: 1. Look up the metric value (left side) by ``metric_name``. - 2. Resolve the comparison value (right side) — either a constant - or a variable reference. + 2. Read ``condition_value`` as the comparison value (right side). 3. Dispatch to the correct type handler (string / number / datetime). 
""" metric_name = condition.metric_name @@ -130,41 +106,28 @@ class JudgmentProcessor: return JudgmentConditionResult( metric_name=metric_name, comparison_operator=condition.comparison_operator, - expected_value=condition.value, + expected_value=condition.condition_value, actual_value=None, passed=False, error=f"Metric '{metric_name}' not found in evaluation results", ) - # Resolve the comparison value (right side) - try: - resolved_value = JudgmentProcessor._resolve_comparison_value( - condition, variable_values - ) - except ValueError as e: - return JudgmentConditionResult( - metric_name=metric_name, - comparison_operator=condition.comparison_operator, - expected_value=condition.value, - actual_value=actual_value, - passed=False, - error=str(e), - ) + resolved_value = condition.condition_value # Dispatch to the appropriate type handler try: match condition.condition_type: case JudgmentConditionType.DATETIME: - passed = _evaluate_datetime_condition( - actual_value, condition.comparison_operator, resolved_value - ) + passed = _evaluate_datetime_condition(actual_value, condition.comparison_operator, resolved_value) case JudgmentConditionType.NUMBER: - passed = _evaluate_number_condition( - actual_value, condition.comparison_operator, resolved_value - ) + passed = _evaluate_number_condition(actual_value, condition.comparison_operator, resolved_value) case _: # STRING (default) — delegate to workflow engine + if condition.comparison_operator in {"before", "after"}: + raise ValueError( + f"Operator '{condition.comparison_operator}' is not supported for string conditions" + ) passed = _evaluate_condition( - operator=condition.comparison_operator, + operator=cast(SupportedComparisonOperator, condition.comparison_operator), value=actual_value, expected=resolved_value, ) @@ -191,51 +154,6 @@ class JudgmentProcessor: error=str(e), ) - @staticmethod - def _resolve_comparison_value( - condition: JudgmentCondition, - variable_values: dict[str, Any] | None, - ) -> str | Sequence[str] | None: - """Resolve the right-side comparison value. - - For ``value_source == "constant"``, returns ``condition.value`` as-is. - For ``value_source == "variable"``, looks up ``condition.value`` (as a key) - in ``variable_values`` and returns the resolved value (converted to string - for compatibility with the comparison engine). - - Raises: - ValueError: If the variable cannot be resolved. - """ - if condition.value_source == JudgmentValueSource.CONSTANT: - return condition.value - - # Variable resolution - if condition.value is None: - raise ValueError("Variable name (value) must be provided when value_source is 'variable'") - - if not variable_values: - raise ValueError( - f"Cannot resolve variable '{condition.value}': no variable values provided" - ) - - var_key = condition.value if isinstance(condition.value, str) else str(condition.value) - if var_key not in variable_values: - raise ValueError( - f"Variable '{var_key}' not found in evaluation target data. " - f"Available variables: {list(variable_values.keys())}" - ) - - resolved = variable_values[var_key] - # Convert to string for the comparison engine, unless it's already - # a str/Sequence[str]/None which the engine expects. 
- if resolved is None: - return None - if isinstance(resolved, str): - return resolved - if isinstance(resolved, Sequence) and all(isinstance(v, str) for v in resolved): - return resolved - return str(resolved) - _DATETIME_FORMATS = [ "%Y-%m-%dT%H:%M:%S", @@ -348,7 +266,11 @@ def _evaluate_number_condition( """ # Unary operators — delegate to workflow engine as-is if operator in _UNARY_OPERATORS: - return _evaluate_condition(operator=operator, value=actual, expected=expected) + return _evaluate_condition( + operator=cast(SupportedComparisonOperator, operator), + value=actual, + expected=cast(str | Sequence[str] | bool | Sequence[bool] | None, expected), + ) if actual is None: return False @@ -356,7 +278,7 @@ def _evaluate_number_condition( # Coerce actual to numeric if not isinstance(actual, (int, float)): try: - actual = float(actual) + actual = float(cast(str | int | float, actual)) except (TypeError, ValueError) as e: raise ValueError(f"Cannot convert actual value '{actual}' to number") from e @@ -365,4 +287,8 @@ def _evaluate_number_condition( if expected is not None and not isinstance(expected, str): expected = str(expected) - return _evaluate_condition(operator=operator, value=actual, expected=expected) + return _evaluate_condition( + operator=cast(SupportedComparisonOperator, operator), + value=actual, + expected=expected, + ) diff --git a/api/core/evaluation/runners/__init__.py b/api/core/evaluation/runners/__init__.py index 0137d6cdef..0a69087432 100644 --- a/api/core/evaluation/runners/__init__.py +++ b/api/core/evaluation/runners/__init__.py @@ -20,11 +20,7 @@ def get_service_account_for_app(session: Session, app_id: str) -> Account: if not account: raise ValueError(f"Creator account not found for app {app_id}") - current_tenant = ( - session.query(TenantAccountJoin) - .filter_by(account_id=account.id, current=True) - .first() - ) + current_tenant = session.query(TenantAccountJoin).filter_by(account_id=account.id, current=True).first() if not current_tenant: raise ValueError(f"Current tenant not found for account {account.id}") @@ -48,11 +44,7 @@ def get_service_account_for_snippet(session: Session, snippet_id: str) -> Accoun if not account: raise ValueError(f"Creator account not found for snippet {snippet_id}") - current_tenant = ( - session.query(TenantAccountJoin) - .filter_by(account_id=account.id, current=True) - .first() - ) + current_tenant = session.query(TenantAccountJoin).filter_by(account_id=account.id, current=True).first() if not current_tenant: raise ValueError(f"Current tenant not found for account {account.id}") diff --git a/api/core/evaluation/runners/agent_evaluation_runner.py b/api/core/evaluation/runners/agent_evaluation_runner.py index 8cc5c3cc5b..c050f061e7 100644 --- a/api/core/evaluation/runners/agent_evaluation_runner.py +++ b/api/core/evaluation/runners/agent_evaluation_runner.py @@ -141,10 +141,12 @@ class AgentEvaluationRunner(BaseEvaluationRunner): answer_parts.append(thought) tool = chunk.get("tool") if tool: - tool_calls.append({ - "tool": tool, - "tool_input": chunk.get("tool_input", ""), - }) + tool_calls.append( + { + "tool": tool, + "tool_input": chunk.get("tool_input", ""), + } + ) elif event == "message": answer = chunk.get("answer", "") if answer: diff --git a/api/core/evaluation/runners/base_evaluation_runner.py b/api/core/evaluation/runners/base_evaluation_runner.py index 984f6dd7b9..1aa6b7a8e0 100644 --- a/api/core/evaluation/runners/base_evaluation_runner.py +++ b/api/core/evaluation/runners/base_evaluation_runner.py @@ -1,11 +1,14 @@ 
"""Base evaluation runner. Orchestrates the evaluation lifecycle in four phases: - 1. execute_target — run the target and collect actual outputs (abstract) + 1. execute_target — run the target and collect actual outputs (abstract) 2. evaluate_metrics — compute metrics via framework or customized workflow 3. apply_judgment — evaluate pass/fail judgment conditions on metrics 4. persist — save results to the database +The persisted ``EvaluationRunItem.judgment`` payload must reflect the final +judgment result for each evaluated item, so judgment evaluation happens before +the persistence phase whenever a ``JudgmentConfig`` is supplied. """ import json @@ -30,7 +33,7 @@ logger = logging.getLogger(__name__) class BaseEvaluationRunner(ABC): - """Abstract base class for evaluation runners. """ + """Abstract base class for evaluation runners.""" def __init__(self, evaluation_instance: BaseEvaluationInstance, session: Session): self.evaluation_instance = evaluation_instance @@ -62,6 +65,7 @@ class BaseEvaluationRunner(ABC): model_provider: str = "", model_name: str = "", node_run_result_mapping_list: list[dict[str, NodeRunResult]] | None = None, + judgment_config: JudgmentConfig | None = None, ) -> list[EvaluationItemResult]: """Orchestrate target execution + metric evaluation + judgment for all items.""" evaluation_run = self.session.query(EvaluationRun).filter_by(id=evaluation_run_id).first() @@ -115,9 +119,16 @@ class BaseEvaluationRunner(ABC): results = list(results_by_index.values()) + if judgment_config is not None: + results = self._apply_judgment( + results=results, + judgment_config=judgment_config, + node_run_result_mapping_list=node_run_result_mapping_list, + ) + # Phase 4: Persist individual items for result in results: - item_input = next((item for item in items if item.index == result.index), None) + item_input = next((item for item in evaluation_run.input_list if item.index == result.index), None) run_item = EvaluationRunItem( evaluation_run_id=evaluation_run_id, item_index=result.index, @@ -129,7 +140,7 @@ class BaseEvaluationRunner(ABC): judgment=json.dumps(result.judgment.model_dump()) if result.judgment else None, metadata_json=json.dumps(result.metadata) if result.metadata else None, error=result.error, - overall_score=result.overall_score, + overall_score=getattr(result, "overall_score", None), ) self.session.add(run_item) @@ -145,54 +156,21 @@ class BaseEvaluationRunner(ABC): ) -> list[EvaluationItemResult]: """Apply judgment conditions to each result's metrics. - Left side (``metric_name``): looked up from evaluate-phase metrics only. - Right side: when ``value_source="variable"``, ``condition.value`` - contains an expression (e.g. ``{{#node_id.output_key#}}``). The - expression is parsed and resolved against the corresponding - ``node_run_result_mapping`` to obtain the actual comparison value. + Judgment is computed only from the per-item metric values and the + supplied ``JudgmentConfig``. ``metric_name`` selects the left-hand side + metric, and ``condition_value`` is used as the comparison target. 
""" - from core.evaluation.base_evaluation_instance import resolve_variable_selector - from core.evaluation.entities.judgment_entity import JudgmentValueSource - from core.workflow.nodes.base.variable_template_parser import REGEX as VARIABLE_REGEX judged_results: list[EvaluationItemResult] = [] - for idx, result in enumerate(results): + for result in results: if result.error is not None or not result.metrics: judged_results.append(result) continue # Left side: only metrics metric_values: dict[str, object] = {m.name: m.value for m in result.metrics} + judgment_result = JudgmentProcessor.evaluate(metric_values, judgment_config) - # Right side: pre-resolve variable expressions against node run results. - # Each condition.value expression (e.g. "{{#llm1.text#}}") is resolved - # and stored in variable_values keyed by the raw expression string, so - # that JudgmentProcessor._resolve_comparison_value can look it up. - variable_values: dict[str, object] = {} - node_run_result_mapping = ( - node_run_result_mapping_list[idx] - if node_run_result_mapping_list and idx < len(node_run_result_mapping_list) - else {} - ) - for condition in judgment_config.conditions: - if ( - condition.value_source == JudgmentValueSource.VARIABLE - and isinstance(condition.value, str) - and node_run_result_mapping - ): - match = VARIABLE_REGEX.fullmatch(condition.value) - if match: - resolved = resolve_variable_selector( - match.group(1), node_run_result_mapping - ) - variable_values[condition.value] = resolved - - judgment_result = JudgmentProcessor.evaluate( - metric_values, judgment_config, variable_values=variable_values - ) - - judged_results.append( - result.model_copy(update={"judgment": judgment_result}) - ) + judged_results.append(result.model_copy(update={"judgment": judgment_result})) return judged_results diff --git a/api/core/evaluation/runners/retrieval_evaluation_runner.py b/api/core/evaluation/runners/retrieval_evaluation_runner.py index d1dc2a4151..49fa01b026 100644 --- a/api/core/evaluation/runners/retrieval_evaluation_runner.py +++ b/api/core/evaluation/runners/retrieval_evaluation_runner.py @@ -63,7 +63,7 @@ class RetrievalEvaluationRunner(BaseEvaluationRunner): @staticmethod def _extract_query(inputs: dict[str, Any]) -> str: - for key in ("query"): + for key in "query": if key in inputs: return str(inputs[key]) return "" diff --git a/api/core/evaluation/runners/snippet_evaluation_runner.py b/api/core/evaluation/runners/snippet_evaluation_runner.py index 8bef78ce22..09aea22fd7 100644 --- a/api/core/evaluation/runners/snippet_evaluation_runner.py +++ b/api/core/evaluation/runners/snippet_evaluation_runner.py @@ -75,11 +75,15 @@ class SnippetEvaluationRunner(BaseEvaluationRunner): # Retrieve per-node execution records from DB workflow_run_id = self._extract_workflow_run_id(response) - node_executions = self._query_node_executions( - tenant_id=tenant_id, - app_id=target_id, - workflow_run_id=workflow_run_id, - ) if workflow_run_id else [] + node_executions = ( + self._query_node_executions( + tenant_id=tenant_id, + app_id=target_id, + workflow_run_id=workflow_run_id, + ) + if workflow_run_id + else [] + ) return EvaluationItemResult( index=item.index, @@ -189,18 +193,18 @@ class SnippetEvaluationRunner(BaseEvaluationRunner): Returns a list of serialisable dicts for storage in ``metadata``. 
""" - stmt = WorkflowNodeExecutionModel.preload_offload_data( - select(WorkflowNodeExecutionModel) - ).where( - WorkflowNodeExecutionModel.tenant_id == tenant_id, - WorkflowNodeExecutionModel.app_id == app_id, - WorkflowNodeExecutionModel.workflow_run_id == workflow_run_id, - ).order_by(asc(WorkflowNodeExecutionModel.created_at)) - - node_models: Sequence[WorkflowNodeExecutionModel] = ( - self.session.execute(stmt).scalars().all() + stmt = ( + WorkflowNodeExecutionModel.preload_offload_data(select(WorkflowNodeExecutionModel)) + .where( + WorkflowNodeExecutionModel.tenant_id == tenant_id, + WorkflowNodeExecutionModel.app_id == app_id, + WorkflowNodeExecutionModel.workflow_run_id == workflow_run_id, + ) + .order_by(asc(WorkflowNodeExecutionModel.created_at)) ) + node_models: Sequence[WorkflowNodeExecutionModel] = self.session.execute(stmt).scalars().all() + return [self._serialize_node_execution(node) for node in node_models] @staticmethod @@ -211,6 +215,7 @@ class SnippetEvaluationRunner(BaseEvaluationRunner): status, error, and elapsed_time. The virtual Start node injected by SnippetGenerateService is filtered out by the caller if needed. """ + def _safe_parse_json(value: str | None) -> Any: if not value: return None diff --git a/api/core/evaluation/runners/workflow_evaluation_runner.py b/api/core/evaluation/runners/workflow_evaluation_runner.py index c0fcfb4114..76e778ebdf 100644 --- a/api/core/evaluation/runners/workflow_evaluation_runner.py +++ b/api/core/evaluation/runners/workflow_evaluation_runner.py @@ -13,7 +13,6 @@ from core.evaluation.entities.evaluation_entity import ( ) from core.evaluation.runners.base_evaluation_runner import BaseEvaluationRunner from core.workflow.node_events import NodeRunResult -from models.model import App logger = logging.getLogger(__name__) diff --git a/api/models/__init__.py b/api/models/__init__.py index edd0001093..97a9f8ec6f 100644 --- a/api/models/__init__.py +++ b/api/models/__init__.py @@ -26,13 +26,6 @@ from .dataset import ( TidbAuthBinding, Whitelist, ) -from .evaluation import ( - EvaluationConfiguration, - EvaluationRun, - EvaluationRunItem, - EvaluationRunStatus, - EvaluationTargetType, -) from .enums import ( AppTriggerStatus, AppTriggerType, @@ -41,6 +34,13 @@ from .enums import ( WorkflowRunTriggeredFrom, WorkflowTriggerStatus, ) +from .evaluation import ( + EvaluationConfiguration, + EvaluationRun, + EvaluationRunItem, + EvaluationRunStatus, + EvaluationTargetType, +) from .execution_extra_content import ExecutionExtraContent, HumanInputContent from .human_input import HumanInputForm from .model import ( @@ -165,12 +165,12 @@ __all__ = [ "Document", "DocumentSegment", "Embedding", + "EndUser", "EvaluationConfiguration", "EvaluationRun", "EvaluationRunItem", "EvaluationRunStatus", "EvaluationTargetType", - "EndUser", "ExecutionExtraContent", "ExporleBanner", "ExternalKnowledgeApis", diff --git a/api/models/evaluation.py b/api/models/evaluation.py index 898cecb6a3..9737242b76 100644 --- a/api/models/evaluation.py +++ b/api/models/evaluation.py @@ -50,9 +50,7 @@ class EvaluationConfiguration(Base): created_by: Mapped[str] = mapped_column(StringUUID, nullable=False) updated_by: Mapped[str] = mapped_column(StringUUID, nullable=False) - created_at: Mapped[datetime] = mapped_column( - DateTime, nullable=False, server_default=func.current_timestamp() - ) + created_at: Mapped[datetime] = mapped_column(DateTime, nullable=False, server_default=func.current_timestamp()) updated_at: Mapped[datetime] = mapped_column( DateTime, nullable=False, 
server_default=func.current_timestamp(), onupdate=func.current_timestamp() ) @@ -97,17 +95,13 @@ class EvaluationRun(Base): target_id: Mapped[str] = mapped_column(StringUUID, nullable=False) evaluation_config_id: Mapped[str] = mapped_column(StringUUID, nullable=False) - status: Mapped[str] = mapped_column( - String(20), nullable=False, default=EvaluationRunStatus.PENDING - ) + status: Mapped[str] = mapped_column(String(20), nullable=False, default=EvaluationRunStatus.PENDING) dataset_file_id: Mapped[str | None] = mapped_column(StringUUID, nullable=True) result_file_id: Mapped[str | None] = mapped_column(StringUUID, nullable=True) total_items: Mapped[int] = mapped_column(Integer, nullable=False, default=0) completed_items: Mapped[int] = mapped_column(Integer, nullable=False, default=0) failed_items: Mapped[int] = mapped_column(Integer, nullable=False, default=0) - - metrics_summary: Mapped[str | None] = mapped_column(LongText, nullable=True) error: Mapped[str | None] = mapped_column(Text, nullable=True) celery_task_id: Mapped[str | None] = mapped_column(String(255), nullable=True) @@ -115,23 +109,11 @@ class EvaluationRun(Base): created_by: Mapped[str] = mapped_column(StringUUID, nullable=False) started_at: Mapped[datetime | None] = mapped_column(DateTime, nullable=True) completed_at: Mapped[datetime | None] = mapped_column(DateTime, nullable=True) - created_at: Mapped[datetime] = mapped_column( - DateTime, nullable=False, server_default=func.current_timestamp() - ) + created_at: Mapped[datetime] = mapped_column(DateTime, nullable=False, server_default=func.current_timestamp()) updated_at: Mapped[datetime] = mapped_column( DateTime, nullable=False, server_default=func.current_timestamp(), onupdate=func.current_timestamp() ) - @property - def metrics_summary_dict(self) -> dict[str, Any]: - if self.metrics_summary: - return json.loads(self.metrics_summary) - return {} - - @metrics_summary_dict.setter - def metrics_summary_dict(self, value: dict[str, Any]) -> None: - self.metrics_summary = json.dumps(value) - @property def progress(self) -> float: if self.total_items == 0: @@ -168,9 +150,7 @@ class EvaluationRunItem(Base): overall_score: Mapped[float | None] = mapped_column(Float, nullable=True) - created_at: Mapped[datetime] = mapped_column( - DateTime, nullable=False, server_default=func.current_timestamp() - ) + created_at: Mapped[datetime] = mapped_column(DateTime, nullable=False, server_default=func.current_timestamp()) @property def inputs_dict(self) -> dict[str, Any]: @@ -184,6 +164,12 @@ class EvaluationRunItem(Base): return json.loads(self.metrics) return [] + @property + def judgment_dict(self) -> dict[str, Any]: + if self.judgment: + return json.loads(self.judgment) + return {} + @property def metadata_dict(self) -> dict[str, Any]: if self.metadata_json: diff --git a/api/services/evaluation_service.py b/api/services/evaluation_service.py index 02046c62cb..16a4ee347d 100644 --- a/api/services/evaluation_service.py +++ b/api/services/evaluation_service.py @@ -245,13 +245,13 @@ class EvaluationService: config.evaluation_model_provider = data.evaluation_model_provider config.evaluation_model = data.evaluation_model - config.metrics_config = json.dumps({ - "default_metrics": [m.model_dump() for m in data.default_metrics], - "customized_metrics": data.customized_metrics.model_dump() if data.customized_metrics else None, - }) - config.judgement_conditions = json.dumps( - data.judgment_config.model_dump() if data.judgment_config else {} + config.metrics_config = json.dumps( + { + 
"default_metrics": [m.model_dump() for m in data.default_metrics], + "customized_metrics": data.customized_metrics.model_dump() if data.customized_metrics else None, + } ) + config.judgement_conditions = json.dumps(data.judgment_config.model_dump() if data.judgment_config else {}) config.updated_by = account_id session.commit() session.refresh(config) @@ -299,9 +299,7 @@ class EvaluationService: ) max_concurrent = dify_config.EVALUATION_MAX_CONCURRENT_RUNS if active_runs >= max_concurrent: - raise EvaluationMaxConcurrentRunsError( - f"Maximum concurrent runs ({max_concurrent}) reached." - ) + raise EvaluationMaxConcurrentRunsError(f"Maximum concurrent runs ({max_concurrent}) reached.") # Parse dataset items = cls._parse_dataset(dataset_file_content) @@ -373,11 +371,7 @@ class EvaluationService: tenant_id: str, run_id: str, ) -> EvaluationRun: - run = ( - session.query(EvaluationRun) - .filter_by(id=run_id, tenant_id=tenant_id) - .first() - ) + run = session.query(EvaluationRun).filter_by(id=run_id, tenant_id=tenant_id).first() if not run: raise EvaluationNotFoundError("Evaluation run not found.") return run @@ -587,8 +581,7 @@ class EvaluationService: @staticmethod def _extract_workflow_run_id(response: Mapping[str, object]) -> str | None: - """Extract ``workflow_run_id`` from a blocking workflow response. - """ + """Extract ``workflow_run_id`` from a blocking workflow response.""" wf_run_id = response.get("workflow_run_id") if wf_run_id: return str(wf_run_id) @@ -610,13 +603,15 @@ class EvaluationService: from core.workflow.enums import WorkflowNodeExecutionStatus from models.workflow import WorkflowNodeExecutionModel - stmt = WorkflowNodeExecutionModel.preload_offload_data( - select(WorkflowNodeExecutionModel) - ).where( - WorkflowNodeExecutionModel.tenant_id == tenant_id, - WorkflowNodeExecutionModel.app_id == app_id, - WorkflowNodeExecutionModel.workflow_run_id == workflow_run_id, - ).order_by(asc(WorkflowNodeExecutionModel.created_at)) + stmt = ( + WorkflowNodeExecutionModel.preload_offload_data(select(WorkflowNodeExecutionModel)) + .where( + WorkflowNodeExecutionModel.tenant_id == tenant_id, + WorkflowNodeExecutionModel.app_id == app_id, + WorkflowNodeExecutionModel.workflow_run_id == workflow_run_id, + ) + .order_by(asc(WorkflowNodeExecutionModel.created_at)) + ) node_models: list[WorkflowNodeExecutionModel] = list(session.execute(stmt).scalars().all()) diff --git a/api/services/snippet_dsl_service.py b/api/services/snippet_dsl_service.py index fb2a75c5d3..c6a31c3ceb 100644 --- a/api/services/snippet_dsl_service.py +++ b/api/services/snippet_dsl_service.py @@ -452,8 +452,9 @@ class SnippetDslService: }, } - self._append_workflow_export_data(export_data=export_data, snippet=snippet, workflow=workflow, - include_secret=include_secret) + self._append_workflow_export_data( + export_data=export_data, snippet=snippet, workflow=workflow, include_secret=include_secret + ) return yaml.dump(export_data, allow_unicode=True) # type: ignore diff --git a/api/tasks/evaluation_task.py b/api/tasks/evaluation_task.py index 742509ff53..3f4af7487d 100644 --- a/api/tasks/evaluation_task.py +++ b/api/tasks/evaluation_task.py @@ -15,6 +15,7 @@ from core.evaluation.entities.evaluation_entity import ( EvaluationItemResult, EvaluationRunData, ) +from core.evaluation.entities.judgment_entity import JudgmentConfig from core.evaluation.evaluation_manager import EvaluationManager from core.evaluation.runners.agent_evaluation_runner import AgentEvaluationRunner from core.evaluation.runners.base_evaluation_runner 
import BaseEvaluationRunner @@ -87,23 +88,20 @@ def _execute_evaluation(session: Any, run_data: EvaluationRunData) -> None: ) results: list[EvaluationItemResult] = _execute_evaluation_runner( - session, - run_data, - evaluation_instance, + session, + run_data, + evaluation_instance, node_run_result_mapping_list, ) - # Compute summary metrics - metrics_summary = _compute_metrics_summary(results) + metrics_summary = _compute_metrics_summary(results, run_data.judgment_config) # Generate result XLSX result_xlsx = _generate_result_xlsx(run_data.items, results) # Store result file - result_file_id = _store_result_file( - run_data.tenant_id, run_data.evaluation_run_id, result_xlsx, session - ) + result_file_id = _store_result_file(run_data.tenant_id, run_data.evaluation_run_id, result_xlsx, session) # Update run to completed evaluation_run = session.query(EvaluationRun).filter_by(id=run_data.evaluation_run_id).first() @@ -119,9 +117,9 @@ def _execute_evaluation(session: Any, run_data: EvaluationRunData) -> None: def _execute_evaluation_runner( - session: Any, - run_data: EvaluationRunData, - evaluation_instance: BaseEvaluationInstance, + session: Any, + run_data: EvaluationRunData, + evaluation_instance: BaseEvaluationInstance, node_run_result_mapping_list: list[dict[str, NodeRunResult]], ) -> list[EvaluationItemResult]: """Execute the evaluation runner.""" @@ -137,31 +135,37 @@ def _execute_evaluation_runner( node_run_result_list.append(node_run_result) if node_run_result_list: runner = _create_runner(EvaluationCategory(node_info.type), evaluation_instance, session) - results.extend(runner.run( - evaluation_run_id=run_data.evaluation_run_id, - tenant_id=run_data.tenant_id, - target_id=run_data.target_id, - target_type=run_data.target_type, - default_metric=default_metric, - customized_metrics=None, - model_provider=run_data.evaluation_model_provider, - model_name=run_data.evaluation_model, - node_run_result_list=node_run_result_list, - )) + results.extend( + runner.run( + evaluation_run_id=run_data.evaluation_run_id, + tenant_id=run_data.tenant_id, + target_id=run_data.target_id, + target_type=run_data.target_type, + default_metric=default_metric, + customized_metrics=None, + model_provider=run_data.evaluation_model_provider, + model_name=run_data.evaluation_model, + node_run_result_list=node_run_result_list, + judgment_config=run_data.judgment_config, + ) + ) if customized_metrics: runner = _create_runner(EvaluationCategory.WORKFLOW, evaluation_instance, session) - results.extend(runner.run( - evaluation_run_id=run_data.evaluation_run_id, - tenant_id=run_data.tenant_id, - target_id=run_data.target_id, - target_type=run_data.target_type, - default_metric=None, - customized_metrics=customized_metrics, - node_run_result_list=None, - node_run_result_mapping_list=node_run_result_mapping_list, - )) + results.extend( + runner.run( + evaluation_run_id=run_data.evaluation_run_id, + tenant_id=run_data.tenant_id, + target_id=run_data.target_id, + target_type=run_data.target_type, + default_metric=None, + customized_metrics=customized_metrics, + node_run_result_list=None, + node_run_result_mapping_list=node_run_result_mapping_list, + judgment_config=run_data.judgment_config, + ) + ) return results - + def _create_runner( category: EvaluationCategory, @@ -197,34 +201,32 @@ def _mark_run_failed(session: Any, run_id: str, error: str) -> None: logger.exception("Failed to mark run %s as failed", run_id) -def _compute_metrics_summary(results: list[EvaluationItemResult]) -> dict[str, Any]: - """Compute average 
scores per metric across all results.""" - metric_scores: dict[str, list[float]] = {} - for result in results: - if result.error: - continue - for metric in result.metrics: - if metric.name not in metric_scores: - metric_scores[metric.name] = [] - metric_scores[metric.name].append(float(metric.value)) +def _compute_metrics_summary( + results: list[EvaluationItemResult], + judgment_config: JudgmentConfig | None, +) -> dict[str, Any]: + """Compute aggregate metric and judgment summaries for an evaluation run. + + Metric statistics are calculated from successful item results only. When a + judgment config is present, the summary also reports how many successful + items passed or failed the configured judgment rules. + """ summary: dict[str, Any] = {} - for name, scores in metric_scores.items(): - summary[name] = { - "average": sum(scores) / len(scores) if scores else 0.0, - "min": min(scores) if scores else 0.0, - "max": max(scores) if scores else 0.0, - "count": len(scores), - } - # Overall average - all_scores = [s for scores in metric_scores.values() for s in scores] - summary["_overall"] = { - "average": sum(all_scores) / len(all_scores) if all_scores else 0.0, - "total_items": len(results), - "successful_items": sum(1 for r in results if r.error is None), - "failed_items": sum(1 for r in results if r.error is not None), - } + if judgment_config is not None and judgment_config.conditions: + evaluated_results: list[EvaluationItemResult] = [result for result in results if result.error is None and result.metrics] + passed_items = sum(1 for result in evaluated_results if result.judgment.passed) + evaluated_items = len(evaluated_results) + summary["_judgment"] = { + "enabled": True, + "logical_operator": judgment_config.logical_operator, + "configured_conditions": len(judgment_config.conditions), + "evaluated_items": evaluated_items, + "passed_items": passed_items, + "failed_items": evaluated_items - passed_items, + "pass_rate": passed_items / evaluated_items if evaluated_items else 0.0, + } return summary @@ -266,11 +268,7 @@ def _generate_result_xlsx( # Build headers headers = ( - ["index"] - + input_keys - + ["expected_output", "actual_output"] - + all_metric_names - + ["overall_score", "error"] + ["index"] + input_keys + ["expected_output", "actual_output"] + all_metric_names + ["overall_score", "error"] ) # Write header row @@ -320,9 +318,7 @@ def _generate_result_xlsx( col += 1 # Overall score - ws.cell( - row=row_idx, column=col, value=result.overall_score if result else "" - ).border = thin_border + ws.cell(row=row_idx, column=col, value=result.overall_score if result else "").border = thin_border col += 1 # Error
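
Reviewer note: a minimal usage sketch of the reworked judgment API in this patch, assuming the module paths laid out here (core.evaluation.entities.judgment_entity and core.evaluation.judgment.processor); the metric name and threshold are illustrative only.

from core.evaluation.entities.judgment_entity import JudgmentCondition, JudgmentConfig
from core.evaluation.judgment.processor import JudgmentProcessor

# The right-hand side is now always the literal condition_value; the old
# value / value_source variable indirection has been removed.
config = JudgmentConfig(
    logical_operator="and",
    conditions=[
        JudgmentCondition(
            metric_name="faithfulness",
            comparison_operator=">",
            condition_value="0.8",
            condition_type="number",
        ),
    ],
)

# Left-hand side values are looked up from per-item metric results only.
judgment = JudgmentProcessor.evaluate({"faithfulness": 0.85}, config)
print(judgment.passed)  # True: 0.85 > 0.8, so the single "and" condition holds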
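
For reference, the console payload built by _serialize_evaluation_run_item now carries the persisted judgment alongside metrics. The sketch below is illustrative data only; the judgment body is whatever JudgmentResult.model_dump() produced when the runner stored it, read back through the new EvaluationRunItem.judgment_dict property.

serialized_item = {
    "index": 0,
    "inputs": {"query": "What is Dify?"},
    "expected_output": "An example expected answer.",
    "actual_output": "An example generated answer.",
    "metrics": [{"name": "faithfulness", "value": 0.85}],
    "judgment": {"passed": True},  # full JudgmentResult dump via item.judgment_dict
    "metadata": {},
    "error": None,
    "overall_score": 0.85,
}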
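
Similarly hedged: with a judgment config present, _compute_metrics_summary now reports only the "_judgment" aggregate block introduced in this hunk; the counts below are made up to show the shape.

metrics_summary = {
    "_judgment": {
        "enabled": True,
        "logical_operator": "and",
        "configured_conditions": 1,
        "evaluated_items": 48,
        "passed_items": 41,
        "failed_items": 7,
        "pass_rate": 41 / 48,  # reported as 0.0 when no items were evaluated
    }
}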