mirror of https://github.com/langgenius/dify.git
synced 2026-05-11 14:58:23 +08:00

evaluation runtime

parent 2b3f5adfab
commit dabad46393
@@ -6,10 +6,13 @@ from sqlalchemy.orm import Session
 
 from core.evaluation.base_evaluation_instance import BaseEvaluationInstance
 from core.evaluation.entities.evaluation_entity import (
+    CustomizedMetrics,
+    DefaultMetric,
     EvaluationItemInput,
     EvaluationItemResult,
 )
 from core.evaluation.runners.base_evaluation_runner import BaseEvaluationRunner
+from core.workflow.node_events import NodeRunResult
 from models.model import App
 
 logger = logging.getLogger(__name__)

@@ -66,9 +69,10 @@ class AgentEvaluationRunner(BaseEvaluationRunner):
 
     def evaluate_metrics(
         self,
-        items: list[EvaluationItemInput],
-        results: list[EvaluationItemResult],
-        default_metrics: list[dict[str, Any]],
+        node_run_result_mapping: dict[str, NodeRunResult] | None,
+        node_run_result: NodeRunResult | None,
+        default_metric: DefaultMetric | None,
+        customized_metrics: CustomizedMetrics | None,
         model_provider: str,
         model_name: str,
         tenant_id: str,

@@ -42,9 +42,10 @@ class BaseEvaluationRunner(ABC):
     @abstractmethod
     def evaluate_metrics(
         self,
-        items: list[EvaluationItemInput],
-        results: list[EvaluationItemResult],
-        default_metrics: list[dict[str, Any]],
+        node_run_result_mapping: dict[str, NodeRunResult] | None,
+        node_run_result: NodeRunResult | None,
+        default_metric: DefaultMetric | None,
+        customized_metrics: CustomizedMetrics | None,
         model_provider: str,
         model_name: str,
         tenant_id: str,

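Read together with the runner hunks, this abstract signature is the contract every concrete runner (agent, LLM, retrieval, snippet, workflow) now implements: the pre-collected `items`/`results`/`default_metrics` lists are gone, and the evaluator instead receives the raw node results plus the metric definitions. Below is a minimal sketch of a conforming subclass, assuming only the imports shown in the import hunks and assuming the method still returns `list[EvaluationItemResult]`, as the merge step in the base class suggests; the class name and body are illustrative, not project code:

```python
from core.evaluation.entities.evaluation_entity import (
    CustomizedMetrics,
    DefaultMetric,
    EvaluationItemResult,
)
from core.evaluation.runners.base_evaluation_runner import BaseEvaluationRunner
from core.workflow.node_events import NodeRunResult


class NoopEvaluationRunner(BaseEvaluationRunner):
    """Hypothetical runner that scores nothing; shows the new contract only."""

    def evaluate_metrics(
        self,
        node_run_result_mapping: dict[str, NodeRunResult] | None,
        node_run_result: NodeRunResult | None,
        default_metric: DefaultMetric | None,
        customized_metrics: CustomizedMetrics | None,
        model_provider: str,
        model_name: str,
        tenant_id: str,
    ) -> list[EvaluationItemResult]:
        # Prefer the per-node mapping when one is supplied; otherwise fall
        # back to the single top-level node result.
        node_results = node_run_result_mapping or {}
        if not node_results and node_run_result is not None:
            node_results = {"_root": node_run_result}
        # A real runner would score `node_results` against default_metric /
        # customized_metrics here; this sketch returns an empty batch.
        return []
```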
@@ -58,11 +59,12 @@ class BaseEvaluationRunner(ABC):
         tenant_id: str,
         target_id: str,
         target_type: str,
-        node_run_result: NodeRunResult,
+        node_run_result: NodeRunResult | None = None,
         default_metric: DefaultMetric | None = None,
         customized_metrics: CustomizedMetrics | None = None,
         model_provider: str = "",
         model_name: str = "",
+        node_run_result_mapping: dict[str, NodeRunResult] | None = None,
     ) -> list[EvaluationItemResult]:
         """Orchestrate target execution + metric evaluation + judgment for all items."""
         evaluation_run = self.session.query(EvaluationRun).filter_by(id=evaluation_run_id).first()

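The point of the loosened `run()` signature is that everything varying by evaluation category is now optional with a neutral default, so call sites only pass what their path needs. A standalone sketch of that surface, with simplified stand-in types (the `FakeNodeRunResult` class and literal values are placeholders, not project data):

```python
from dataclasses import dataclass


@dataclass
class FakeNodeRunResult:
    # Hypothetical stand-in for NodeRunResult, for illustration only.
    status: str = "succeeded"


def run(
    evaluation_run_id: str,
    tenant_id: str,
    target_id: str,
    target_type: str,
    node_run_result: FakeNodeRunResult | None = None,
    default_metric: dict | None = None,
    customized_metrics: list[dict] | None = None,
    model_provider: str = "",
    model_name: str = "",
    node_run_result_mapping: dict[str, FakeNodeRunResult] | None = None,
) -> str:
    # The default-metric path passes a single node result; the customized
    # path passes None plus a node-id -> result mapping instead.
    return "customized" if customized_metrics else "default"


# Default-metric call shape:
run("run-1", "tenant-1", "target-1", "workflow",
    node_run_result=FakeNodeRunResult(), default_metric={"name": "accuracy"})

# Customized-metrics call shape (mirrors _execute_evaluation_runner below):
run("run-1", "tenant-1", "target-1", "workflow",
    default_metric=None, customized_metrics=[{"name": "tone"}],
    node_run_result=None, node_run_result_mapping={"n1": FakeNodeRunResult()})
```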
@@ -82,17 +84,15 @@ class BaseEvaluationRunner(ABC):
         # Phase 1: run evaluation
         if node_run_result.status == WorkflowNodeExecutionStatus.SUCCEEDED:
             try:
-                if customized_metrics is not None:
-                    # Customized workflow evaluation — target-type agnostic
-                    evaluated_results = self._evaluate_customized(
-                        successful_items, successful_results, customized_metrics, tenant_id,
-                    )
-                else:
-                    # Framework-specific evaluation — delegate to subclass
-                    evaluated_results = self.evaluate_metrics(
-                        successful_items, successful_results, default_metrics,
-                        model_provider, model_name, tenant_id,
-                    )
+                evaluated_results = self.evaluate_metrics(
+                    node_run_result_mapping=node_run_result_mapping,
+                    node_run_result=node_run_result,
+                    default_metric=default_metric,
+                    customized_metrics=customized_metrics,
+                    model_provider=model_provider,
+                    model_name=model_name,
+                    tenant_id=tenant_id,
+                )
                 # Merge evaluated metrics back into results
                 evaluated_by_index = {r.index: r for r in evaluated_results}
                 for i, result in enumerate(results):

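The merge step keyed on `r.index` is worth pausing on: the evaluator may return fewer results than were submitted, so the base class reconciles by each result's index field rather than by list position. A self-contained sketch of the same pattern, with a hypothetical stand-in for `EvaluationItemResult` (the diff's own loop enumerates `results`; reconciling via the index field is the essential idea):

```python
from dataclasses import dataclass


@dataclass
class ItemResult:
    # Hypothetical stand-in for EvaluationItemResult: `index` ties a
    # result back to its original position in the submitted batch.
    index: int
    score: float | None = None


def merge_by_index(results: list[ItemResult], evaluated: list[ItemResult]) -> None:
    # Same shape as the diff: build an index -> result lookup, then copy
    # scores back onto the original list in place. Items the evaluator
    # skipped keep their existing (unset) score.
    evaluated_by_index = {r.index: r for r in evaluated}
    for result in results:
        hit = evaluated_by_index.get(result.index)
        if hit is not None:
            result.score = hit.score


batch = [ItemResult(0), ItemResult(1), ItemResult(2)]
merge_by_index(batch, [ItemResult(2, score=0.9), ItemResult(0, score=0.4)])
assert [r.score for r in batch] == [0.4, None, 0.9]
```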
@@ -6,10 +6,13 @@ from sqlalchemy.orm import Session
 
 from core.evaluation.base_evaluation_instance import BaseEvaluationInstance
 from core.evaluation.entities.evaluation_entity import (
+    CustomizedMetrics,
+    DefaultMetric,
     EvaluationItemInput,
     EvaluationItemResult,
 )
 from core.evaluation.runners.base_evaluation_runner import BaseEvaluationRunner
+from core.workflow.node_events import NodeRunResult
 from models.model import App, AppMode
 
 logger = logging.getLogger(__name__)

@@ -23,9 +26,10 @@ class LLMEvaluationRunner(BaseEvaluationRunner):
 
     def evaluate_metrics(
         self,
-        items: list[EvaluationItemInput],
-        results: list[EvaluationItemResult],
-        default_metrics: list[dict[str, Any]],
+        node_run_result_mapping: dict[str, NodeRunResult] | None,
+        node_run_result: NodeRunResult | None,
+        default_metric: DefaultMetric | None,
+        customized_metrics: CustomizedMetrics | None,
         model_provider: str,
         model_name: str,
         tenant_id: str,

@@ -5,10 +5,13 @@ from sqlalchemy.orm import Session
 
 from core.evaluation.base_evaluation_instance import BaseEvaluationInstance
 from core.evaluation.entities.evaluation_entity import (
+    CustomizedMetrics,
+    DefaultMetric,
     EvaluationItemInput,
     EvaluationItemResult,
 )
 from core.evaluation.runners.base_evaluation_runner import BaseEvaluationRunner
+from core.workflow.node_events import NodeRunResult
 
 logger = logging.getLogger(__name__)
 
@@ -21,9 +24,10 @@ class RetrievalEvaluationRunner(BaseEvaluationRunner):
 
     def evaluate_metrics(
         self,
-        items: list[EvaluationItemInput],
-        results: list[EvaluationItemResult],
-        default_metrics: list[dict[str, Any]],
+        node_run_result_mapping: dict[str, NodeRunResult] | None,
+        node_run_result: NodeRunResult | None,
+        default_metric: DefaultMetric | None,
+        customized_metrics: CustomizedMetrics | None,
         model_provider: str,
         model_name: str,
         tenant_id: str,

@@ -16,10 +16,13 @@ from sqlalchemy.orm import Session
 
 from core.evaluation.base_evaluation_instance import BaseEvaluationInstance
 from core.evaluation.entities.evaluation_entity import (
+    CustomizedMetrics,
+    DefaultMetric,
     EvaluationItemInput,
     EvaluationItemResult,
 )
 from core.evaluation.runners.base_evaluation_runner import BaseEvaluationRunner
+from core.workflow.node_events import NodeRunResult
 from models.snippet import CustomizedSnippet
 from models.workflow import WorkflowNodeExecutionModel
 
@@ -89,9 +92,10 @@ class SnippetEvaluationRunner(BaseEvaluationRunner):
 
     def evaluate_metrics(
         self,
-        items: list[EvaluationItemInput],
-        results: list[EvaluationItemResult],
-        default_metrics: list[dict[str, Any]],
+        node_run_result_mapping: dict[str, NodeRunResult] | None,
+        node_run_result: NodeRunResult | None,
+        default_metric: DefaultMetric | None,
+        customized_metrics: CustomizedMetrics | None,
         model_provider: str,
         model_name: str,
         tenant_id: str,

@@ -6,10 +6,13 @@ from sqlalchemy.orm import Session
 
 from core.evaluation.base_evaluation_instance import BaseEvaluationInstance
 from core.evaluation.entities.evaluation_entity import (
+    CustomizedMetrics,
+    DefaultMetric,
     EvaluationItemInput,
     EvaluationItemResult,
 )
 from core.evaluation.runners.base_evaluation_runner import BaseEvaluationRunner
+from core.workflow.node_events import NodeRunResult
 from models.model import App
 
 logger = logging.getLogger(__name__)

@@ -23,9 +26,10 @@ class WorkflowEvaluationRunner(BaseEvaluationRunner):
 
     def evaluate_metrics(
         self,
-        items: list[EvaluationItemInput],
-        results: list[EvaluationItemResult],
-        default_metrics: list[dict[str, Any]],
+        node_run_result_mapping: dict[str, NodeRunResult] | None,
+        node_run_result: NodeRunResult | None,
+        default_metric: DefaultMetric | None,
+        customized_metrics: CustomizedMetrics | None,
         model_provider: str,
         model_name: str,
         tenant_id: str,

@@ -129,13 +129,18 @@ def _execute_evaluation_runner(
             )
         else:
             default_metric.score = 0
-    for customized_metric in customized_metrics:
-        runner = _create_runner(run_data.evaluation_category, evaluation_instance, session)
-        runner.run(
-            evaluation_run_id=run_data.evaluation_run_id,
-            tenant_id=run_data.tenant_id,
-            target_id=run_data.target_id,
-        )
+    if customized_metrics:
+        runner = _create_runner(EvaluationCategory.WORKFLOW, evaluation_instance, session)
+        runner.run(
+            evaluation_run_id=run_data.evaluation_run_id,
+            tenant_id=run_data.tenant_id,
+            target_id=run_data.target_id,
+            target_type=run_data.target_type,
+            default_metric=None,
+            customized_metrics=customized_metrics,
+            node_run_result=None,
+            node_run_result_mapping=node_run_result_mapping,
+        )
 
 def _create_runner(
     category: EvaluationCategory,
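The routing rule after this change: default metrics still go through the runner for the run's own `evaluation_category`, while customized metrics, being target-type agnostic, are funnelled through the WORKFLOW runner in a single batch instead of once per metric. A minimal sketch of that dispatch, with a hypothetical enum subset standing in for the project's `EvaluationCategory`:

```python
from enum import Enum


class EvaluationCategory(Enum):
    # Hypothetical subset of the project's enum, for illustration.
    AGENT = "agent"
    LLM = "llm"
    WORKFLOW = "workflow"


def pick_category(run_category: EvaluationCategory, has_customized: bool) -> EvaluationCategory:
    # Mirrors the diff's routing: customized metrics always use the
    # workflow runner; default metrics stay on the category the
    # evaluation run was created with.
    return EvaluationCategory.WORKFLOW if has_customized else run_category


assert pick_category(EvaluationCategory.AGENT, has_customized=True) is EvaluationCategory.WORKFLOW
assert pick_category(EvaluationCategory.AGENT, has_customized=False) is EvaluationCategory.AGENT
```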