evaluation runtime
This commit is contained in:
parent
f81bcf53e3
commit
751c938d8a
@@ -0,0 +1,69 @@
"""Unit tests for metric-based judgment evaluation."""

from core.evaluation.entities.judgment_entity import JudgmentCondition, JudgmentConfig
from core.evaluation.judgment.processor import JudgmentProcessor


def test_evaluate_uses_and_conditions_against_metric_values() -> None:
    """All conditions must pass when the logical operator is ``and``."""
    config = JudgmentConfig(
        logical_operator="and",
        conditions=[
            JudgmentCondition(
                metric_name="faithfulness",
                comparison_operator=">",
                condition_value="0.8",
                condition_type="number",
            ),
            JudgmentCondition(
                metric_name="answer_relevancy",
                comparison_operator="≥",
                condition_value="0.7",
                condition_type="number",
            ),
        ],
    )

    result = JudgmentProcessor.evaluate(
        {
            "faithfulness": 0.9,
            "answer_relevancy": 0.75,
        },
        config,
    )

    assert result.passed is True
    assert len(result.condition_results) == 2
    assert all(condition_result.passed for condition_result in result.condition_results)


def test_evaluate_sets_passed_false_when_any_and_condition_fails() -> None:
    """A failed metric comparison should make the overall judgment fail."""
    config = JudgmentConfig(
        logical_operator="and",
        conditions=[
            JudgmentCondition(
                metric_name="faithfulness",
                comparison_operator=">",
                condition_value="0.8",
                condition_type="number",
            ),
            JudgmentCondition(
                metric_name="answer_relevancy",
                comparison_operator="≥",
                condition_value="0.7",
                condition_type="number",
            ),
        ],
    )

    result = JudgmentProcessor.evaluate(
        {
            "faithfulness": 0.9,
            "answer_relevancy": 0.6,
        },
        config,
    )

    assert result.passed is False
    assert result.condition_results[-1].passed is False
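
The two tests above pin down the ``and`` semantics: every configured condition is compared against the reported metric value, and a single failing comparison flips the overall result. As a point of reference only, a minimal sketch consistent with those assertions might look like the following; the names Condition, ConditionResult, Result, and evaluate are illustrative stand-ins, not the actual core.evaluation implementation.

from dataclasses import dataclass, field


@dataclass
class Condition:
    metric_name: str
    comparison_operator: str  # ">", "≥", "<", "≤", "="
    condition_value: str  # thresholds arrive as strings and are cast per condition_type
    condition_type: str = "number"


@dataclass
class ConditionResult:
    metric_name: str
    passed: bool


@dataclass
class Result:
    passed: bool
    condition_results: list[ConditionResult] = field(default_factory=list)


def evaluate(metric_values: dict[str, float], conditions: list[Condition], logical_operator: str = "and") -> Result:
    """Compare each metric against its threshold, then aggregate with and/or."""
    ops = {
        ">": lambda a, b: a > b,
        "≥": lambda a, b: a >= b,
        "<": lambda a, b: a < b,
        "≤": lambda a, b: a <= b,
        "=": lambda a, b: a == b,
    }
    condition_results = []
    for condition in conditions:
        actual = metric_values.get(condition.metric_name)
        threshold = float(condition.condition_value)
        ok = actual is not None and ops[condition.comparison_operator](actual, threshold)
        condition_results.append(ConditionResult(metric_name=condition.metric_name, passed=ok))
    aggregate = all if logical_operator == "and" else any
    return Result(passed=aggregate(r.passed for r in condition_results), condition_results=condition_results)

Fed the first test's inputs, this sketch passes both comparisons (0.9 > 0.8, 0.75 ≥ 0.7); with the second test's 0.6 relevancy it fails the last condition and the overall result, matching the assertions.
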
@@ -0,0 +1,80 @@
"""Tests for judgment application in the base evaluation runner."""

from unittest.mock import Mock

from core.evaluation.entities.evaluation_entity import DefaultMetric, EvaluationItemResult, EvaluationMetric
from core.evaluation.entities.judgment_entity import JudgmentCondition, JudgmentConfig
from core.evaluation.runners.base_evaluation_runner import BaseEvaluationRunner


class _FakeItemInput:
    def __init__(self, index: int) -> None:
        self.index = index
        self.inputs = {"query": "hello"}
        self.expected_output = "world"
        self.context = None


class _FakeEvaluationRun:
    def __init__(self) -> None:
        self.status = None
        self.started_at = None
        self.input_list = [_FakeItemInput(index=0)]


class _FakeRunner(BaseEvaluationRunner):
    def evaluate_metrics(
        self,
        node_run_result_mapping_list,
        node_run_result_list,
        default_metric,
        customized_metrics,
        model_provider,
        model_name,
        tenant_id,
    ) -> list[EvaluationItemResult]:
        return [
            EvaluationItemResult(
                index=0,
                actual_output="result",
                metrics=[EvaluationMetric(name="faithfulness", value=0.91)],
            )
        ]


def test_run_applies_judgment_before_persisting_results() -> None:
    """Runner should evaluate judgment rules before persisting item rows."""
    # Arrange
    session = Mock()
    evaluation_run = _FakeEvaluationRun()
    session.query.return_value.filter_by.return_value.first.return_value = evaluation_run

    runner = _FakeRunner(evaluation_instance=Mock(), session=session)
    judgment_config = JudgmentConfig(
        logical_operator="and",
        conditions=[
            JudgmentCondition(
                metric_name="faithfulness",
                comparison_operator=">",
                condition_value="0.8",
                condition_type="number",
            )
        ],
    )

    # Act
    results = runner.run(
        evaluation_run_id="run-id",
        tenant_id="tenant-id",
        target_id="target-id",
        target_type="app",
        node_run_result_list=[Mock()],
        default_metric=DefaultMetric(metric="faithfulness", node_info_list=[]),
        judgment_config=judgment_config,
    )

    # Assert
    assert results[0].judgment.passed is True
    persisted_item = session.add.call_args.args[0]
    assert persisted_item.judgment is not None
    assert '"passed": true' in persisted_item.judgment
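
The last two assertions inspect the row handed to session.add rather than the in-memory result, which implies the runner serializes the judgment to JSON before persisting it. A minimal sketch of why the substring check works, assuming standard-library json (the runner's actual column handling is not shown in this diff):

import json

# json.dumps renders Python's True as lowercase "true", so a JSON-serialized
# judgment column can be checked with the substring '"passed": true'.
judgment = {"passed": True, "logical_operator": "and", "condition_results": []}
persisted_judgment = json.dumps(judgment)
assert '"passed": true' in persisted_judgment
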
58
api/tests/unit_tests/tasks/test_evaluation_task.py
Normal file
@@ -0,0 +1,58 @@
"""Unit tests for evaluation task judgment aggregation helpers."""

from core.evaluation.entities.evaluation_entity import EvaluationItemResult, EvaluationMetric
from core.evaluation.entities.judgment_entity import (
    JudgmentCondition,
    JudgmentConfig,
    JudgmentResult,
)
from tasks.evaluation_task import _compute_metrics_summary


def test_compute_metrics_summary_includes_judgment_counts() -> None:
    """Summary should expose pass/fail counts when judgment rules are configured."""
    # Arrange
    judgment_config = JudgmentConfig(
        logical_operator="and",
        conditions=[
            JudgmentCondition(
                metric_name="faithfulness",
                comparison_operator=">",
                condition_value="0.8",
                condition_type="number",
            )
        ],
    )
    results = [
        EvaluationItemResult(
            index=0,
            metrics=[EvaluationMetric(name="faithfulness", value=0.9)],
            judgment=JudgmentResult(passed=True, logical_operator="and", condition_results=[]),
        ),
        EvaluationItemResult(
            index=1,
            metrics=[EvaluationMetric(name="faithfulness", value=0.4)],
            judgment=JudgmentResult(passed=False, logical_operator="and", condition_results=[]),
        ),
        EvaluationItemResult(index=2, error="timeout"),
    ]

    # Act
    summary = _compute_metrics_summary(results, judgment_config)

    # Assert
    assert summary["faithfulness"] == {
        "average": 0.65,
        "min": 0.4,
        "max": 0.9,
        "count": 2,
    }
    assert summary["_judgment"] == {
        "enabled": True,
        "logical_operator": "and",
        "configured_conditions": 1,
        "evaluated_items": 2,
        "passed_items": 1,
        "failed_items": 1,
        "pass_rate": 0.5,
    }
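
The expected "_judgment" block implies that items which errored out (and therefore carry no JudgmentResult) are excluded from the pass/fail counts, which is why only two of the three items contribute and the pass rate is 0.5. A sketch of that aggregation under those assumptions; summarize_judgment is a hypothetical stand-in for the relevant part of _compute_metrics_summary:

def summarize_judgment(results, judgment_config) -> dict:
    """Aggregate per-item judgment outcomes into a summary block (assumed shape)."""
    # Items without a judgment (e.g. errored items) are left out of the counts.
    judged = [r for r in results if getattr(r, "judgment", None) is not None]
    passed_items = sum(1 for r in judged if r.judgment.passed)
    return {
        "enabled": True,
        "logical_operator": judgment_config.logical_operator,
        "configured_conditions": len(judgment_config.conditions),
        "evaluated_items": len(judged),
        "passed_items": passed_items,
        "failed_items": len(judged) - passed_items,
        "pass_rate": passed_items / len(judged) if judged else 0.0,
    }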