From 751c938d8a551e628487b7e7354c426c29cd129f Mon Sep 17 00:00:00 2001
From: jyong <718720800@qq.com>
Date: Mon, 16 Mar 2026 18:09:10 +0800
Subject: [PATCH] evaluation runtime

---
 .../evaluation/judgment/test_processor.py     | 69 ++++++++++++++++
 .../runners/test_base_evaluation_runner.py    | 80 +++++++++++++++++++
 .../unit_tests/tasks/test_evaluation_task.py  | 58 ++++++++++++++
 3 files changed, 207 insertions(+)
 create mode 100644 api/tests/unit_tests/core/evaluation/judgment/test_processor.py
 create mode 100644 api/tests/unit_tests/core/evaluation/runners/test_base_evaluation_runner.py
 create mode 100644 api/tests/unit_tests/tasks/test_evaluation_task.py

diff --git a/api/tests/unit_tests/core/evaluation/judgment/test_processor.py b/api/tests/unit_tests/core/evaluation/judgment/test_processor.py
new file mode 100644
index 0000000000..2dfeff0b54
--- /dev/null
+++ b/api/tests/unit_tests/core/evaluation/judgment/test_processor.py
@@ -0,0 +1,69 @@
+"""Unit tests for metric-based judgment evaluation."""
+
+from core.evaluation.entities.judgment_entity import JudgmentCondition, JudgmentConfig
+from core.evaluation.judgment.processor import JudgmentProcessor
+
+
+def test_evaluate_uses_and_conditions_against_metric_values() -> None:
+    """All conditions must pass when the logical operator is ``and``."""
+    config = JudgmentConfig(
+        logical_operator="and",
+        conditions=[
+            JudgmentCondition(
+                metric_name="faithfulness",
+                comparison_operator=">",
+                condition_value="0.8",
+                condition_type="number",
+            ),
+            JudgmentCondition(
+                metric_name="answer_relevancy",
+                comparison_operator="≥",
+                condition_value="0.7",
+                condition_type="number",
+            ),
+        ],
+    )
+
+    result = JudgmentProcessor.evaluate(
+        {
+            "faithfulness": 0.9,
+            "answer_relevancy": 0.75,
+        },
+        config,
+    )
+
+    assert result.passed is True
+    assert len(result.condition_results) == 2
+    assert all(condition_result.passed for condition_result in result.condition_results)
+
+
+def test_evaluate_sets_passed_false_when_any_and_condition_fails() -> None:
+    """A failed metric comparison should make the overall judgment fail."""
+    config = JudgmentConfig(
+        logical_operator="and",
+        conditions=[
+            JudgmentCondition(
+                metric_name="faithfulness",
+                comparison_operator=">",
+                condition_value="0.8",
+                condition_type="number",
+            ),
+            JudgmentCondition(
+                metric_name="answer_relevancy",
+                comparison_operator="≥",
+                condition_value="0.7",
+                condition_type="number",
+            ),
+        ],
+    )
+
+    result = JudgmentProcessor.evaluate(
+        {
+            "faithfulness": 0.9,
+            "answer_relevancy": 0.6,
+        },
+        config,
+    )
+
+    assert result.passed is False
+    assert result.condition_results[-1].passed is False
diff --git a/api/tests/unit_tests/core/evaluation/runners/test_base_evaluation_runner.py b/api/tests/unit_tests/core/evaluation/runners/test_base_evaluation_runner.py
new file mode 100644
index 0000000000..477a678bae
--- /dev/null
+++ b/api/tests/unit_tests/core/evaluation/runners/test_base_evaluation_runner.py
@@ -0,0 +1,80 @@
+"""Tests for judgment application in the base evaluation runner."""
+
+from unittest.mock import Mock
+
+from core.evaluation.entities.evaluation_entity import DefaultMetric, EvaluationItemResult, EvaluationMetric
+from core.evaluation.entities.judgment_entity import JudgmentCondition, JudgmentConfig
+from core.evaluation.runners.base_evaluation_runner import BaseEvaluationRunner
+
+
+class _FakeItemInput:
+    def __init__(self, index: int) -> None:
+        self.index = index
+        self.inputs = {"query": "hello"}
+        self.expected_output = "world"
+        self.context = None
+
+
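+# Minimal stand-in for the evaluation run row that the runner loads via
+# session.query(); only the attributes this test path is expected to touch
+# (status, started_at, input_list) are modelled.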
+class _FakeEvaluationRun:
+    def __init__(self) -> None:
+        self.status = None
+        self.started_at = None
+        self.input_list = [_FakeItemInput(index=0)]
+
+
+class _FakeRunner(BaseEvaluationRunner):
+    def evaluate_metrics(
+        self,
+        node_run_result_mapping_list,
+        node_run_result_list,
+        default_metric,
+        customized_metrics,
+        model_provider,
+        model_name,
+        tenant_id,
+    ) -> list[EvaluationItemResult]:
+        return [
+            EvaluationItemResult(
+                index=0,
+                actual_output="result",
+                metrics=[EvaluationMetric(name="faithfulness", value=0.91)],
+            )
+        ]
+
+
+def test_run_applies_judgment_before_persisting_results() -> None:
+    """Runner should evaluate judgment rules before persisting item rows."""
+    # Arrange
+    session = Mock()
+    evaluation_run = _FakeEvaluationRun()
+    session.query.return_value.filter_by.return_value.first.return_value = evaluation_run
+
+    runner = _FakeRunner(evaluation_instance=Mock(), session=session)
+    judgment_config = JudgmentConfig(
+        logical_operator="and",
+        conditions=[
+            JudgmentCondition(
+                metric_name="faithfulness",
+                comparison_operator=">",
+                condition_value="0.8",
+                condition_type="number",
+            )
+        ],
+    )
+
+    # Act
+    results = runner.run(
+        evaluation_run_id="run-id",
+        tenant_id="tenant-id",
+        target_id="target-id",
+        target_type="app",
+        node_run_result_list=[Mock()],
+        default_metric=DefaultMetric(metric="faithfulness", node_info_list=[]),
+        judgment_config=judgment_config,
+    )
+
+    # Assert
+    assert results[0].judgment.passed is True
+    persisted_item = session.add.call_args.args[0]
+    assert persisted_item.judgment is not None
+    assert '"passed": true' in persisted_item.judgment
diff --git a/api/tests/unit_tests/tasks/test_evaluation_task.py b/api/tests/unit_tests/tasks/test_evaluation_task.py
new file mode 100644
index 0000000000..34d2849d2f
--- /dev/null
+++ b/api/tests/unit_tests/tasks/test_evaluation_task.py
@@ -0,0 +1,58 @@
+"""Unit tests for evaluation task judgment aggregation helpers."""
+
+from core.evaluation.entities.evaluation_entity import EvaluationItemResult, EvaluationMetric
+from core.evaluation.entities.judgment_entity import (
+    JudgmentCondition,
+    JudgmentConfig,
+    JudgmentResult,
+)
+from tasks.evaluation_task import _compute_metrics_summary
+
+
+def test_compute_metrics_summary_includes_judgment_counts() -> None:
+    """Summary should expose pass/fail counts when judgment rules are configured."""
+    # Arrange
+    judgment_config = JudgmentConfig(
+        logical_operator="and",
+        conditions=[
+            JudgmentCondition(
+                metric_name="faithfulness",
+                comparison_operator=">",
+                condition_value="0.8",
+                condition_type="number",
+            )
+        ],
+    )
+    results = [
+        EvaluationItemResult(
+            index=0,
+            metrics=[EvaluationMetric(name="faithfulness", value=0.9)],
+            judgment=JudgmentResult(passed=True, logical_operator="and", condition_results=[]),
+        ),
+        EvaluationItemResult(
+            index=1,
+            metrics=[EvaluationMetric(name="faithfulness", value=0.4)],
+            judgment=JudgmentResult(passed=False, logical_operator="and", condition_results=[]),
+        ),
+        EvaluationItemResult(index=2, error="timeout"),
+    ]
+
+    # Act
+    summary = _compute_metrics_summary(results, judgment_config)
+
+    # Assert
+    assert summary["faithfulness"] == {
+        "average": 0.65,
+        "min": 0.4,
+        "max": 0.9,
+        "count": 2,
+    }
+    assert summary["_judgment"] == {
+        "enabled": True,
+        "logical_operator": "and",
+        "configured_conditions": 1,
+        "evaluated_items": 2,
+        "passed_items": 1,
+        "failed_items": 1,
+        "pass_rate": 0.5,
+    }
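+    # The errored item (index=2) carries neither metrics nor a judgment, so
+    # "count" and "evaluated_items" stay at 2 rather than 3, and pass_rate is
+    # computed over the two judged items only (1 passed / 2 judged = 0.5).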