evaluation runtime

jyong 2026-03-16 18:09:10 +08:00
parent f81bcf53e3
commit 751c938d8a
3 changed files with 207 additions and 0 deletions

@@ -0,0 +1,69 @@
"""Unit tests for metric-based judgment evaluation."""
from core.evaluation.entities.judgment_entity import JudgmentCondition, JudgmentConfig
from core.evaluation.judgment.processor import JudgmentProcessor
def test_evaluate_uses_and_conditions_against_metric_values() -> None:
"""All conditions must pass when the logical operator is ``and``."""
config = JudgmentConfig(
logical_operator="and",
conditions=[
JudgmentCondition(
metric_name="faithfulness",
comparison_operator=">",
condition_value="0.8",
condition_type="number",
),
JudgmentCondition(
metric_name="answer_relevancy",
comparison_operator="",
condition_value="0.7",
condition_type="number",
),
],
)
result = JudgmentProcessor.evaluate(
{
"faithfulness": 0.9,
"answer_relevancy": 0.75,
},
config,
)
assert result.passed is True
assert len(result.condition_results) == 2
assert all(condition_result.passed for condition_result in result.condition_results)
def test_evaluate_sets_passed_false_when_any_and_condition_fails() -> None:
"""A failed metric comparison should make the overall judgment fail."""
config = JudgmentConfig(
logical_operator="and",
conditions=[
JudgmentCondition(
metric_name="faithfulness",
comparison_operator=">",
condition_value="0.8",
condition_type="number",
),
JudgmentCondition(
metric_name="answer_relevancy",
comparison_operator="",
condition_value="0.7",
condition_type="number",
),
],
)
result = JudgmentProcessor.evaluate(
{
"faithfulness": 0.9,
"answer_relevancy": 0.6,
},
config,
)
assert result.passed is False
assert result.condition_results[-1].passed is False
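Neither JudgmentProcessor nor the judgment entities appear in this commit, so the following is only a rough standalone sketch of the comparison-and-aggregation behavior these two tests pin down. The Condition, ConditionOutcome, JudgmentOutcome, and evaluate names, the operator table, and the and/or handling are illustrative assumptions, not the project's actual API.

"""Standalone sketch of a metric-judgment evaluator (assumed API, illustration only)."""
from dataclasses import dataclass, field


@dataclass
class Condition:
    metric_name: str
    comparison_operator: str  # one of ">", ">=", "<", "<=", "=="
    condition_value: str
    condition_type: str = "number"


@dataclass
class ConditionOutcome:
    metric_name: str
    passed: bool


@dataclass
class JudgmentOutcome:
    passed: bool
    logical_operator: str
    condition_results: list[ConditionOutcome] = field(default_factory=list)


_OPERATORS = {
    ">": lambda actual, expected: actual > expected,
    ">=": lambda actual, expected: actual >= expected,
    "<": lambda actual, expected: actual < expected,
    "<=": lambda actual, expected: actual <= expected,
    "==": lambda actual, expected: actual == expected,
}


def evaluate(metric_values: dict[str, float], logical_operator: str, conditions: list[Condition]) -> JudgmentOutcome:
    """Compare each configured condition against the computed metric values."""
    outcomes = []
    for condition in conditions:
        actual = metric_values.get(condition.metric_name)
        expected = float(condition.condition_value)
        # A missing metric counts as a failed condition rather than an error.
        passed = actual is not None and _OPERATORS[condition.comparison_operator](actual, expected)
        outcomes.append(ConditionOutcome(metric_name=condition.metric_name, passed=passed))

    # "and" requires every condition to pass; anything else is treated as "or".
    combine = all if logical_operator == "and" else any
    return JudgmentOutcome(
        passed=combine(outcome.passed for outcome in outcomes),
        logical_operator=logical_operator,
        condition_results=outcomes,
    )

With inputs mirroring the first test (faithfulness 0.9 > 0.8, answer_relevancy 0.75 >= 0.7), this sketch returns an outcome whose passed flag is True with two passing condition results, matching the assertions above.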

@@ -0,0 +1,80 @@
"""Tests for judgment application in the base evaluation runner."""
from unittest.mock import Mock
from core.evaluation.entities.evaluation_entity import DefaultMetric, EvaluationItemResult, EvaluationMetric
from core.evaluation.entities.judgment_entity import JudgmentCondition, JudgmentConfig
from core.evaluation.runners.base_evaluation_runner import BaseEvaluationRunner
class _FakeItemInput:
def __init__(self, index: int) -> None:
self.index = index
self.inputs = {"query": "hello"}
self.expected_output = "world"
self.context = None
class _FakeEvaluationRun:
def __init__(self) -> None:
self.status = None
self.started_at = None
self.input_list = [_FakeItemInput(index=0)]
class _FakeRunner(BaseEvaluationRunner):
def evaluate_metrics(
self,
node_run_result_mapping_list,
node_run_result_list,
default_metric,
customized_metrics,
model_provider,
model_name,
tenant_id,
) -> list[EvaluationItemResult]:
return [
EvaluationItemResult(
index=0,
actual_output="result",
metrics=[EvaluationMetric(name="faithfulness", value=0.91)],
)
]
def test_run_applies_judgment_before_persisting_results() -> None:
"""Runner should evaluate judgment rules before persisting item rows."""
# Arrange
session = Mock()
evaluation_run = _FakeEvaluationRun()
session.query.return_value.filter_by.return_value.first.return_value = evaluation_run
runner = _FakeRunner(evaluation_instance=Mock(), session=session)
judgment_config = JudgmentConfig(
logical_operator="and",
conditions=[
JudgmentCondition(
metric_name="faithfulness",
comparison_operator=">",
condition_value="0.8",
condition_type="number",
)
],
)
# Act
results = runner.run(
evaluation_run_id="run-id",
tenant_id="tenant-id",
target_id="target-id",
target_type="app",
node_run_result_list=[Mock()],
default_metric=DefaultMetric(metric="faithfulness", node_info_list=[]),
judgment_config=judgment_config,
)
# Assert
assert results[0].judgment.passed is True
persisted_item = session.add.call_args.args[0]
assert persisted_item.judgment is not None
assert '"passed": true' in persisted_item.judgment

@@ -0,0 +1,58 @@
"""Unit tests for evaluation task judgment aggregation helpers."""
from core.evaluation.entities.evaluation_entity import EvaluationItemResult, EvaluationMetric
from core.evaluation.entities.judgment_entity import (
JudgmentCondition,
JudgmentConfig,
JudgmentResult,
)
from tasks.evaluation_task import _compute_metrics_summary
def test_compute_metrics_summary_includes_judgment_counts() -> None:
"""Summary should expose pass/fail counts when judgment rules are configured."""
# Arrange
judgment_config = JudgmentConfig(
logical_operator="and",
conditions=[
JudgmentCondition(
metric_name="faithfulness",
comparison_operator=">",
condition_value="0.8",
condition_type="number",
)
],
)
results = [
EvaluationItemResult(
index=0,
metrics=[EvaluationMetric(name="faithfulness", value=0.9)],
judgment=JudgmentResult(passed=True, logical_operator="and", condition_results=[]),
),
EvaluationItemResult(
index=1,
metrics=[EvaluationMetric(name="faithfulness", value=0.4)],
judgment=JudgmentResult(passed=False, logical_operator="and", condition_results=[]),
),
EvaluationItemResult(index=2, error="timeout"),
]
# Act
summary = _compute_metrics_summary(results, judgment_config)
# Assert
assert summary["faithfulness"] == {
"average": 0.65,
"min": 0.4,
"max": 0.9,
"count": 2,
}
assert summary["_judgment"] == {
"enabled": True,
"logical_operator": "and",
"configured_conditions": 1,
"evaluated_items": 2,
"passed_items": 1,
"failed_items": 1,
"pass_rate": 0.5,
}
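
The expected dictionaries above constrain the aggregation fairly tightly: errored items are excluded, per-metric statistics cover only scored items, and the "_judgment" block counts pass/fail outcomes. The following is a rough standalone sketch of how such a helper could produce that shape, using simplified dict inputs as stand-ins for the entity classes; compute_summary and its parameters are assumptions, not the real _compute_metrics_summary signature.

"""Standalone sketch of the metrics/judgment aggregation the test above expects."""
from collections import defaultdict


def compute_summary(
    results: list[dict],
    judgment_enabled: bool,
    logical_operator: str,
    configured_conditions: int,
) -> dict:
    """Aggregate metric statistics and judgment pass/fail counts across item results.

    Each result dict is a simplified stand-in:
    {"metrics": {name: value}, "judgment_passed": bool | None, "error": str | None}.
    """
    values_by_metric: dict[str, list[float]] = defaultdict(list)
    passed_items = failed_items = 0

    for result in results:
        if result.get("error"):
            continue  # errored items contribute neither metric values nor judgment counts
        for name, value in result["metrics"].items():
            values_by_metric[name].append(value)
        if result.get("judgment_passed") is True:
            passed_items += 1
        elif result.get("judgment_passed") is False:
            failed_items += 1

    summary: dict = {
        name: {
            "average": round(sum(values) / len(values), 4),
            "min": min(values),
            "max": max(values),
            "count": len(values),
        }
        for name, values in values_by_metric.items()
    }

    if judgment_enabled:
        evaluated = passed_items + failed_items
        summary["_judgment"] = {
            "enabled": True,
            "logical_operator": logical_operator,
            "configured_conditions": configured_conditions,
            "evaluated_items": evaluated,
            "passed_items": passed_items,
            "failed_items": failed_items,
            "pass_rate": round(passed_items / evaluated, 4) if evaluated else 0.0,
        }
    return summary


# Example mirroring the test above:
# compute_summary(
#     [
#         {"metrics": {"faithfulness": 0.9}, "judgment_passed": True},
#         {"metrics": {"faithfulness": 0.4}, "judgment_passed": False},
#         {"error": "timeout"},
#     ],
#     judgment_enabled=True, logical_operator="and", configured_conditions=1,
# )
# -> {"faithfulness": {"average": 0.65, "min": 0.4, "max": 0.9, "count": 2},
#     "_judgment": {..., "evaluated_items": 2, "passed_items": 1, "failed_items": 1, "pass_rate": 0.5}}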