evaluation runtime

jyong 2026-03-09 15:56:03 +08:00
parent 2b3f5adfab
commit dabad46393
7 changed files with 63 additions and 38 deletions


@@ -6,10 +6,13 @@ from sqlalchemy.orm import Session
from core.evaluation.base_evaluation_instance import BaseEvaluationInstance
from core.evaluation.entities.evaluation_entity import (
CustomizedMetrics,
DefaultMetric,
EvaluationItemInput,
EvaluationItemResult,
)
from core.evaluation.runners.base_evaluation_runner import BaseEvaluationRunner
from core.workflow.node_events import NodeRunResult
from models.model import App
logger = logging.getLogger(__name__)
@@ -66,9 +69,10 @@ class AgentEvaluationRunner(BaseEvaluationRunner):
def evaluate_metrics(
self,
items: list[EvaluationItemInput],
results: list[EvaluationItemResult],
default_metrics: list[dict[str, Any]],
node_run_result_mapping: dict[str, NodeRunResult] | None,
node_run_result: NodeRunResult | None,
default_metric: DefaultMetric | None,
customized_metrics: CustomizedMetrics | None,
model_provider: str,
model_name: str,
tenant_id: str,

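Every runner below picks up the same import change: metric configuration now arrives as the typed CustomizedMetrics and DefaultMetric entities instead of a raw list[dict[str, Any]]. A minimal sketch of the difference at the call boundary, with assumed field names (only score is visible in this commit, where the task code near the end sets default_metric.score = 0); the real entity definitions live in core.evaluation.entities.evaluation_entity and are not part of this diff.

from dataclasses import dataclass
from typing import Any


@dataclass
class DefaultMetric:
    # Assumed shape for illustration; only `score` is visible in this commit.
    name: str = ""
    score: float = 0.0


# Old style: metric config travels as untyped dicts.
def score_from_dicts(default_metrics: list[dict[str, Any]]) -> float:
    return sum(m.get("score", 0.0) for m in default_metrics)


# New style: a single optional, typed entity.
def score_from_entity(default_metric: DefaultMetric | None) -> float:
    return default_metric.score if default_metric else 0.0


print(score_from_dicts([{"name": "accuracy", "score": 0.9}]))        # 0.9
print(score_from_entity(DefaultMetric(name="accuracy", score=0.9)))  # 0.9
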

@@ -42,9 +42,10 @@ class BaseEvaluationRunner(ABC):
@abstractmethod
def evaluate_metrics(
self,
items: list[EvaluationItemInput],
results: list[EvaluationItemResult],
default_metrics: list[dict[str, Any]],
node_run_result_mapping: dict[str, NodeRunResult] | None,
node_run_result: NodeRunResult | None,
default_metric: DefaultMetric | None,
customized_metrics: CustomizedMetrics | None,
model_provider: str,
model_name: str,
tenant_id: str,
@@ -58,11 +59,12 @@ class BaseEvaluationRunner(ABC):
tenant_id: str,
target_id: str,
target_type: str,
node_run_result: NodeRunResult,
node_run_result: NodeRunResult | None = None,
default_metric: DefaultMetric | None = None,
customized_metrics: CustomizedMetrics | None = None,
model_provider: str = "",
model_name: str = "",
model_name: str = "",
node_run_result_mapping: dict[str, NodeRunResult] | None = None,
) -> list[EvaluationItemResult]:
"""Orchestrate target execution + metric evaluation + judgment for all items."""
evaluation_run = self.session.query(EvaluationRun).filter_by(id=evaluation_run_id).first()
@@ -82,17 +84,15 @@ class BaseEvaluationRunner(ABC):
# Phase 1: run evaluation
if node_run_result.status == WorkflowNodeExecutionStatus.SUCCEEDED:
try:
if customized_metrics is not None:
# Customized workflow evaluation — target-type agnostic
evaluated_results = self._evaluate_customized(
successful_items, successful_results, customized_metrics, tenant_id,
)
else:
# Framework-specific evaluation — delegate to subclass
evaluated_results = self.evaluate_metrics(
successful_items, successful_results, default_metrics,
model_provider, model_name, tenant_id,
)
evaluated_results = self.evaluate_metrics(
node_run_result_mapping=node_run_result_mapping,
node_run_result=node_run_result,
default_metric=default_metric,
customized_metrics=customized_metrics,
model_provider=model_provider,
model_name=model_name,
tenant_id=tenant_id,
)
# Merge evaluated metrics back into results
evaluated_by_index = {r.index: r for r in evaluated_results}
for i, result in enumerate(results):

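With the customized-versus-default branch removed from run(), the base class now calls evaluate_metrics unconditionally and by keyword, so each concrete runner decides internally how to handle the two cases. A minimal sketch of a concrete runner under the new contract, assuming the parameter list shown in the hunk above is complete; the helper methods and the return annotation are assumptions (run() merges the returned results by index, so a list of EvaluationItemResult is implied), and the imports mirror the ones added in this commit.

from core.evaluation.entities.evaluation_entity import (
    CustomizedMetrics,
    DefaultMetric,
    EvaluationItemResult,
)
from core.evaluation.runners.base_evaluation_runner import BaseEvaluationRunner
from core.workflow.node_events import NodeRunResult


class ExampleEvaluationRunner(BaseEvaluationRunner):
    def evaluate_metrics(
        self,
        node_run_result_mapping: dict[str, NodeRunResult] | None,
        node_run_result: NodeRunResult | None,
        default_metric: DefaultMetric | None,
        customized_metrics: CustomizedMetrics | None,
        model_provider: str,
        model_name: str,
        tenant_id: str,
    ) -> list[EvaluationItemResult]:
        # The branch that run() used to own now lives inside the subclass:
        # customized workflow metrics evaluate against the per-node results,
        # everything else against the single node_run_result.
        if customized_metrics is not None:
            return self._evaluate_customized(   # hypothetical helper
                node_run_result_mapping, customized_metrics, tenant_id
            )
        return self._evaluate_default(          # hypothetical helper
            node_run_result, default_metric, model_provider, model_name, tenant_id
        )
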

@@ -6,10 +6,13 @@ from sqlalchemy.orm import Session
from core.evaluation.base_evaluation_instance import BaseEvaluationInstance
from core.evaluation.entities.evaluation_entity import (
CustomizedMetrics,
DefaultMetric,
EvaluationItemInput,
EvaluationItemResult,
)
from core.evaluation.runners.base_evaluation_runner import BaseEvaluationRunner
from core.workflow.node_events import NodeRunResult
from models.model import App, AppMode
logger = logging.getLogger(__name__)
@@ -23,9 +26,10 @@ class LLMEvaluationRunner(BaseEvaluationRunner):
def evaluate_metrics(
self,
items: list[EvaluationItemInput],
results: list[EvaluationItemResult],
default_metrics: list[dict[str, Any]],
node_run_result_mapping: dict[str, NodeRunResult] | None,
node_run_result: NodeRunResult | None,
default_metric: DefaultMetric | None,
customized_metrics: CustomizedMetrics | None,
model_provider: str,
model_name: str,
tenant_id: str,


@@ -5,10 +5,13 @@ from sqlalchemy.orm import Session
from core.evaluation.base_evaluation_instance import BaseEvaluationInstance
from core.evaluation.entities.evaluation_entity import (
CustomizedMetrics,
DefaultMetric,
EvaluationItemInput,
EvaluationItemResult,
)
from core.evaluation.runners.base_evaluation_runner import BaseEvaluationRunner
from core.workflow.node_events import NodeRunResult
logger = logging.getLogger(__name__)
@@ -21,9 +24,10 @@ class RetrievalEvaluationRunner(BaseEvaluationRunner):
def evaluate_metrics(
self,
items: list[EvaluationItemInput],
results: list[EvaluationItemResult],
default_metrics: list[dict[str, Any]],
node_run_result_mapping: dict[str, NodeRunResult] | None,
node_run_result: NodeRunResult | None,
default_metric: DefaultMetric | None,
customized_metrics: CustomizedMetrics | None,
model_provider: str,
model_name: str,
tenant_id: str,


@@ -16,10 +16,13 @@ from sqlalchemy.orm import Session
from core.evaluation.base_evaluation_instance import BaseEvaluationInstance
from core.evaluation.entities.evaluation_entity import (
CustomizedMetrics,
DefaultMetric,
EvaluationItemInput,
EvaluationItemResult,
)
from core.evaluation.runners.base_evaluation_runner import BaseEvaluationRunner
from core.workflow.node_events import NodeRunResult
from models.snippet import CustomizedSnippet
from models.workflow import WorkflowNodeExecutionModel
@@ -89,9 +92,10 @@ class SnippetEvaluationRunner(BaseEvaluationRunner):
def evaluate_metrics(
self,
items: list[EvaluationItemInput],
results: list[EvaluationItemResult],
default_metrics: list[dict[str, Any]],
node_run_result_mapping: dict[str, NodeRunResult] | None,
node_run_result: NodeRunResult | None,
default_metric: DefaultMetric | None,
customized_metrics: CustomizedMetrics | None,
model_provider: str,
model_name: str,
tenant_id: str,


@@ -6,10 +6,13 @@ from sqlalchemy.orm import Session
from core.evaluation.base_evaluation_instance import BaseEvaluationInstance
from core.evaluation.entities.evaluation_entity import (
CustomizedMetrics,
DefaultMetric,
EvaluationItemInput,
EvaluationItemResult,
)
from core.evaluation.runners.base_evaluation_runner import BaseEvaluationRunner
from core.workflow.node_events import NodeRunResult
from models.model import App
logger = logging.getLogger(__name__)
@@ -23,9 +26,10 @@ class WorkflowEvaluationRunner(BaseEvaluationRunner):
def evaluate_metrics(
self,
items: list[EvaluationItemInput],
results: list[EvaluationItemResult],
default_metrics: list[dict[str, Any]],
node_run_result_mapping: dict[str, NodeRunResult] | None,
node_run_result: NodeRunResult | None,
default_metric: DefaultMetric | None,
customized_metrics: CustomizedMetrics | None,
model_provider: str,
model_name: str,
tenant_id: str,


@@ -129,13 +129,18 @@ def _execute_evaluation_runner(
)
else:
default_metric.score = 0
for customized_metric in customized_metrics:
runner = _create_runner(run_data.evaluation_category, evaluation_instance, session)
runner.run(
evaluation_run_id=run_data.evaluation_run_id,
tenant_id=run_data.tenant_id,
target_id=run_data.target_id,
)
if customized_metrics:
runner = _create_runner(EvaluationCategory.WORKFLOW, evaluation_instance, session)
runner.run(
evaluation_run_id=run_data.evaluation_run_id,
tenant_id=run_data.tenant_id,
target_id=run_data.target_id,
target_type=run_data.target_type,
default_metric=None,
customized_metrics=customized_metrics,
node_run_result=None,
node_run_result_mapping=node_run_result_mapping,
)
def _create_runner(
category: EvaluationCategory,
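
The removed loop created one runner per customized metric and called run() without any metric payload; after this commit a single workflow-category runner handles the whole bundle. An annotated restatement of the new call, noting which of run()'s defaults apply on this path:

# Customized metrics path: one WORKFLOW runner, no single node_run_result,
# all node results passed as a mapping.
if customized_metrics:
    runner = _create_runner(EvaluationCategory.WORKFLOW, evaluation_instance, session)
    runner.run(
        evaluation_run_id=run_data.evaluation_run_id,
        tenant_id=run_data.tenant_id,
        target_id=run_data.target_id,
        target_type=run_data.target_type,
        default_metric=None,                    # no framework metric on this path
        customized_metrics=customized_metrics,  # whole bundle, not one run per metric
        node_run_result=None,                   # run() now defaults this to None
        node_run_result_mapping=node_run_result_mapping,
    )
    # model_provider and model_name are omitted, so run()'s "" defaults apply.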