feat: Implement customized evaluation in BaseEvaluationInstance.

This commit is contained in:
FFXN 2026-03-05 14:30:39 +08:00
parent 7149af3dac
commit b160dce4db
8 changed files with 71 additions and 58 deletions

View File

@@ -21,7 +21,7 @@ class BaseEvaluationInstance(ABC):
     def evaluate_llm(
         self,
         items: list[EvaluationItemInput],
-        metrics_config: dict,
+        default_metrics: list[dict[str, Any]],
         model_provider: str,
         model_name: str,
         tenant_id: str,
@@ -33,7 +33,7 @@ class BaseEvaluationInstance(ABC):
     def evaluate_retrieval(
         self,
         items: list[EvaluationItemInput],
-        metrics_config: dict,
+        default_metrics: list[dict[str, Any]],
         model_provider: str,
         model_name: str,
         tenant_id: str,
@@ -45,7 +45,7 @@ class BaseEvaluationInstance(ABC):
     def evaluate_agent(
         self,
         items: list[EvaluationItemInput],
-        metrics_config: dict,
+        default_metrics: list[dict[str, Any]],
         model_provider: str,
         model_name: str,
         tenant_id: str,
@@ -57,7 +57,7 @@ class BaseEvaluationInstance(ABC):
     def evaluate_workflow(
         self,
         items: list[EvaluationItemInput],
-        metrics_config: dict,
+        default_metrics: list[dict[str, Any]],
         model_provider: str,
         model_name: str,
         tenant_id: str,
@@ -74,7 +74,7 @@ class BaseEvaluationInstance(ABC):
         self,
         items: list[EvaluationItemInput],
         results: list[EvaluationItemResult],
-        metrics_config: dict,
+        customized_metrics: dict[str, Any],
         tenant_id: str,
     ) -> list[EvaluationItemResult]:
         """Evaluate using a published workflow as the evaluator.
@@ -86,8 +86,8 @@ class BaseEvaluationInstance(ABC):
         Args:
             items: Evaluation items with inputs, expected_output, context.
             results: Results from Phase 1 (with actual_output populated).
-            metrics_config: Must contain ``workflow_id`` pointing to a
-                published WORKFLOW-type App.
+            customized_metrics: Must contain ``evaluation_workflow_id``
+                pointing to a published WORKFLOW-type App.
             tenant_id: Tenant scope.

         Returns:
@@ -103,10 +103,10 @@ class BaseEvaluationInstance(ABC):
         from models.model import App
         from services.workflow_service import WorkflowService

-        workflow_id = metrics_config.get("workflow_id")
+        workflow_id = customized_metrics.get("evaluation_workflow_id")
         if not workflow_id:
             raise ValueError(
-                "metrics_config must contain 'workflow_id' for customized evaluator"
+                "customized_metrics must contain 'evaluation_workflow_id' for customized evaluator"
             )

         # Load the evaluator workflow resources using a dedicated session
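
A minimal sketch of how the two metric arguments introduced above might be populated by a caller. The concrete metric names and values are illustrative assumptions; only the "metric" and "evaluation_workflow_id" keys come from this commit:

# Hypothetical payloads matching the shapes in the signatures above.
default_metrics = [
    {"metric": "faithfulness"},       # framework-scored metric (name assumed)
    {"metric": "answer_relevancy"},   # each entry carries a "metric" key
]
customized_metrics = {
    # Must reference a published WORKFLOW-type App that acts as the evaluator.
    "evaluation_workflow_id": "example-workflow-id",
}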

View File

@@ -42,51 +42,51 @@ class RagasEvaluator(BaseEvaluationInstance):
     def evaluate_llm(
         self,
         items: list[EvaluationItemInput],
-        metrics_config: dict,
+        default_metrics: list[dict[str, Any]],
         model_provider: str,
         model_name: str,
         tenant_id: str,
     ) -> list[EvaluationItemResult]:
-        return self._evaluate(items, metrics_config, model_provider, model_name, tenant_id, EvaluationCategory.LLM)
+        return self._evaluate(items, default_metrics, model_provider, model_name, tenant_id, EvaluationCategory.LLM)

     def evaluate_retrieval(
         self,
         items: list[EvaluationItemInput],
-        metrics_config: dict,
+        default_metrics: list[dict[str, Any]],
         model_provider: str,
         model_name: str,
         tenant_id: str,
     ) -> list[EvaluationItemResult]:
         return self._evaluate(
-            items, metrics_config, model_provider, model_name, tenant_id, EvaluationCategory.RETRIEVAL
+            items, default_metrics, model_provider, model_name, tenant_id, EvaluationCategory.RETRIEVAL
         )

     def evaluate_agent(
         self,
         items: list[EvaluationItemInput],
-        metrics_config: dict,
+        default_metrics: list[dict[str, Any]],
         model_provider: str,
         model_name: str,
         tenant_id: str,
     ) -> list[EvaluationItemResult]:
-        return self._evaluate(items, metrics_config, model_provider, model_name, tenant_id, EvaluationCategory.AGENT)
+        return self._evaluate(items, default_metrics, model_provider, model_name, tenant_id, EvaluationCategory.AGENT)

     def evaluate_workflow(
         self,
         items: list[EvaluationItemInput],
-        metrics_config: dict,
+        default_metrics: list[dict[str, Any]],
         model_provider: str,
         model_name: str,
         tenant_id: str,
     ) -> list[EvaluationItemResult]:
         return self._evaluate(
-            items, metrics_config, model_provider, model_name, tenant_id, EvaluationCategory.WORKFLOW
+            items, default_metrics, model_provider, model_name, tenant_id, EvaluationCategory.WORKFLOW
         )

     def _evaluate(
         self,
         items: list[EvaluationItemInput],
-        metrics_config: dict,
+        default_metrics: list[dict[str, Any]],
         model_provider: str,
         model_name: str,
         tenant_id: str,
@@ -98,7 +98,12 @@ class RagasEvaluator(BaseEvaluationInstance):
         string similarity if RAGAS import fails.
         """
         model_wrapper = DifyModelWrapper(model_provider, model_name, tenant_id)
-        requested_metrics = metrics_config.get("metrics", self.get_supported_metrics(category))
+        # Extract metric names from default_metrics list; each item has a "metric" key.
+        requested_metrics = (
+            [m["metric"] for m in default_metrics if "metric" in m]
+            if default_metrics
+            else self.get_supported_metrics(category)
+        )

         try:
             return self._evaluate_with_ragas(items, requested_metrics, model_wrapper, category)
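
The comprehension above reduces default_metrics to bare metric names and falls back to the category's supported metrics when the list is empty. A self-contained sketch of that extraction (the metric names and the extra "weight" field are placeholders, not taken from this commit):

default_metrics = [{"metric": "faithfulness", "weight": 0.5}, {"metric": "context_recall"}]
requested_metrics = (
    [m["metric"] for m in default_metrics if "metric" in m]
    if default_metrics
    else ["some_supported_metric"]  # stands in for self.get_supported_metrics(category)
)
assert requested_metrics == ["faithfulness", "context_recall"]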
@@ -116,11 +121,6 @@ class RagasEvaluator(BaseEvaluationInstance):
         """Evaluate using RAGAS library."""
         from ragas import evaluate as ragas_evaluate
         from ragas.dataset_schema import EvaluationDataset, SingleTurnSample
-        from ragas.llms import LangchainLLMWrapper
-        from ragas.metrics import (
-            Faithfulness,
-            ResponseRelevancy,
-        )

         # Build RAGAS dataset
         samples = []

View File

@@ -1,5 +1,6 @@
 import logging
-from typing import Any, Mapping, Union
+from collections.abc import Mapping
+from typing import Any

 from sqlalchemy.orm import Session

@@ -9,7 +10,7 @@ from core.evaluation.entities.evaluation_entity import (
     EvaluationItemResult,
 )
 from core.evaluation.runners.base_evaluation_runner import BaseEvaluationRunner
-from models.model import App, AppMode
+from models.model import App

 logger = logging.getLogger(__name__)

@@ -29,8 +30,8 @@ class AgentEvaluationRunner(BaseEvaluationRunner):
     ) -> EvaluationItemResult:
         """Execute agent app and collect response with tool call information."""
         from core.app.apps.agent_chat.app_generator import AgentChatAppGenerator
-        from core.evaluation.runners import get_service_account_for_app
         from core.app.entities.app_invoke_entities import InvokeFrom
+        from core.evaluation.runners import get_service_account_for_app

         app = self.session.query(App).filter_by(id=target_id).first()
         if not app:
@@ -67,7 +68,7 @@ class AgentEvaluationRunner(BaseEvaluationRunner):
         self,
         items: list[EvaluationItemInput],
         results: list[EvaluationItemResult],
-        metrics_config: dict,
+        default_metrics: list[dict[str, Any]],
         model_provider: str,
         model_name: str,
         tenant_id: str,
@@ -90,7 +91,7 @@ class AgentEvaluationRunner(BaseEvaluationRunner):
         )

         evaluated = self.evaluation_instance.evaluate_agent(
-            merged_items, metrics_config, model_provider, model_name, tenant_id
+            merged_items, default_metrics, model_provider, model_name, tenant_id
         )

         # Merge metrics back preserving metadata

View File

@@ -51,7 +51,7 @@ class BaseEvaluationRunner(ABC):
         self,
         items: list[EvaluationItemInput],
         results: list[EvaluationItemResult],
-        metrics_config: dict,
+        default_metrics: list[dict[str, Any]],
         model_provider: str,
         model_name: str,
         tenant_id: str,
@@ -66,9 +66,10 @@ class BaseEvaluationRunner(ABC):
         target_id: str,
         target_type: str,
         items: list[EvaluationItemInput],
-        metrics_config: dict,
-        model_provider: str,
-        model_name: str,
+        default_metrics: list[dict[str, Any]],
+        customized_metrics: dict[str, Any] | None = None,
+        model_provider: str = "",
+        model_name: str = "",
         judgment_config: JudgmentConfig | None = None,
     ) -> list[EvaluationItemResult]:
         """Orchestrate target execution + metric evaluation + judgment for all items."""
@@ -106,13 +107,15 @@ class BaseEvaluationRunner(ABC):
         if successful_items and successful_results:
             try:
-                if _is_customized_evaluation(metrics_config):
+                if customized_metrics is not None:
+                    # Customized workflow evaluation — target-type agnostic
                     evaluated_results = self._evaluate_customized(
-                        successful_items, successful_results, metrics_config, tenant_id,
+                        successful_items, successful_results, customized_metrics, tenant_id,
                     )
                 else:
+                    # Framework-specific evaluation — delegate to subclass
                     evaluated_results = self.evaluate_metrics(
-                        successful_items, successful_results, metrics_config,
+                        successful_items, successful_results, default_metrics,
                         model_provider, model_name, tenant_id,
                     )

                 # Merge evaluated metrics back into results
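
With customized_metrics promoted to an explicit optional parameter, the dispatch now hinges on whether it was passed at all. A hedged sketch of the two call shapes for run(); argument values are illustrative and other required arguments (e.g. tenant scope) are omitted:

# Framework path: per-metric config plus an explicit judge model.
runner.run(
    target_id=app_id,
    target_type="app",                             # assumed target_type value
    items=items,
    default_metrics=[{"metric": "faithfulness"}],  # metric name assumed
    model_provider="openai",                       # assumed provider/model
    model_name="gpt-4o",
)

# Customized path: a published evaluator workflow scores the items,
# so model_provider/model_name can stay at their new "" defaults.
runner.run(
    target_id=app_id,
    target_type="app",
    items=items,
    default_metrics=[],
    customized_metrics={"evaluation_workflow_id": workflow_id},
)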
@@ -153,12 +156,18 @@ class BaseEvaluationRunner(ABC):
         self,
         items: list[EvaluationItemInput],
         results: list[EvaluationItemResult],
-        metrics_config: dict,
+        customized_metrics: dict[str, Any],
         tenant_id: str,
     ) -> list[EvaluationItemResult]:
-        """Delegate to the instance's customized workflow evaluator."""
+        """Delegate to the instance's customized workflow evaluator.
+
+        Unlike the framework path (which merges ``actual_output`` into
+        ``context``), here we pass ``results`` directly; the instance's
+        ``evaluate_with_customized_workflow()`` reads ``actual_output``
+        from each ``EvaluationItemResult``.
+        """
         evaluated = self.evaluation_instance.evaluate_with_customized_workflow(
-            items, results, metrics_config, tenant_id,
+            items, results, customized_metrics, tenant_id,
         )

         # Merge metrics back preserving actual_output and metadata from Phase 1
@@ -180,7 +189,6 @@ class BaseEvaluationRunner(ABC):
             final_results.append(result)
         return final_results

-
     @staticmethod
     def _apply_judgment(
         results: list[EvaluationItemResult],
@@ -225,8 +233,3 @@ class BaseEvaluationRunner(ABC):
                     result.model_copy(update={"judgment": judgment_result})
                 )
         return judged_results
-
-
-def _is_customized_evaluation(metrics_config: dict[str, Any]) -> bool:
-    """Check if metrics_config indicates a customized workflow evaluation."""
-    return bool(metrics_config.get("workflow_id"))

View File

@@ -1,5 +1,6 @@
 import logging
-from typing import Any, Mapping, Union
+from collections.abc import Mapping
+from typing import Any, Union

 from sqlalchemy.orm import Session

@@ -30,8 +31,8 @@ class LLMEvaluationRunner(BaseEvaluationRunner):
         """Execute the App/Snippet with the given inputs and collect the response."""
         from core.app.apps.completion.app_generator import CompletionAppGenerator
         from core.app.apps.workflow.app_generator import WorkflowAppGenerator
-        from core.evaluation.runners import get_service_account_for_app
         from core.app.entities.app_invoke_entities import InvokeFrom
+        from core.evaluation.runners import get_service_account_for_app
         from services.workflow_service import WorkflowService

         app = self.session.query(App).filter_by(id=target_id).first()
@@ -89,7 +90,7 @@ class LLMEvaluationRunner(BaseEvaluationRunner):
         self,
         items: list[EvaluationItemInput],
         results: list[EvaluationItemResult],
-        metrics_config: dict,
+        default_metrics: list[dict[str, Any]],
         model_provider: str,
         model_name: str,
         tenant_id: str,
@@ -98,7 +99,7 @@ class LLMEvaluationRunner(BaseEvaluationRunner):
         # Merge actual_output into items for evaluation
         merged_items = self._merge_results_into_items(items, results)
         return self.evaluation_instance.evaluate_llm(
-            merged_items, metrics_config, model_provider, model_name, tenant_id
+            merged_items, default_metrics, model_provider, model_name, tenant_id
         )

     @staticmethod

View File

@@ -58,7 +58,7 @@ class RetrievalEvaluationRunner(BaseEvaluationRunner):
         self,
         items: list[EvaluationItemInput],
         results: list[EvaluationItemResult],
-        metrics_config: dict,
+        default_metrics: list[dict[str, Any]],
         model_provider: str,
         model_name: str,
         tenant_id: str,
@@ -80,7 +80,7 @@ class RetrievalEvaluationRunner(BaseEvaluationRunner):
         )

         evaluated = self.evaluation_instance.evaluate_retrieval(
-            merged_items, metrics_config, model_provider, model_name, tenant_id
+            merged_items, default_metrics, model_provider, model_name, tenant_id
         )

         # Merge metrics back into original results (preserve actual_output and metadata)

View File

@@ -1,5 +1,6 @@
 import logging
-from typing import Any, Mapping
+from collections.abc import Mapping
+from typing import Any

 from sqlalchemy.orm import Session

@@ -29,8 +30,8 @@ class WorkflowEvaluationRunner(BaseEvaluationRunner):
     ) -> EvaluationItemResult:
         """Execute workflow and collect outputs."""
         from core.app.apps.workflow.app_generator import WorkflowAppGenerator
-        from core.evaluation.runners import get_service_account_for_app
         from core.app.entities.app_invoke_entities import InvokeFrom
+        from core.evaluation.runners import get_service_account_for_app
         from services.workflow_service import WorkflowService

         app = self.session.query(App).filter_by(id=target_id).first()
@@ -68,7 +69,7 @@ class WorkflowEvaluationRunner(BaseEvaluationRunner):
         self,
         items: list[EvaluationItemInput],
         results: list[EvaluationItemResult],
-        metrics_config: dict,
+        default_metrics: list[dict[str, Any]],
         model_provider: str,
         model_name: str,
         tenant_id: str,
@@ -91,7 +92,7 @@ class WorkflowEvaluationRunner(BaseEvaluationRunner):
         )

         evaluated = self.evaluation_instance.evaluate_workflow(
-            merged_items, metrics_config, model_provider, model_name, tenant_id
+            merged_items, default_metrics, model_provider, model_name, tenant_id
         )

         # Merge metrics back preserving metadata

View File

@@ -1,8 +1,6 @@
 import io
 import json
 import logging
-from configs import dify_config
-from models.model import UploadFile
 from typing import Any

 from celery import shared_task
@@ -10,6 +8,7 @@ from openpyxl import Workbook
 from openpyxl.styles import Alignment, Border, Font, PatternFill, Side
 from openpyxl.utils import get_column_letter

+from configs import dify_config
 from core.evaluation.entities.evaluation_entity import (
     EvaluationCategory,
     EvaluationItemResult,
@@ -23,6 +22,7 @@ from core.evaluation.runners.workflow_evaluation_runner import WorkflowEvaluationRunner
 from extensions.ext_database import db
 from libs.datetime_utils import naive_utc_now
 from models.evaluation import EvaluationRun, EvaluationRunStatus
+from models.model import UploadFile

 logger = logging.getLogger(__name__)

@@ -86,6 +86,7 @@ def _execute_evaluation(session: Any, run_data: EvaluationRunData) -> None:
         customized_metrics=run_data.customized_metrics,
         model_provider=run_data.evaluation_model_provider,
         model_name=run_data.evaluation_model,
+        judgment_config=run_data.judgment_config,
     )

     # Compute summary metrics
@@ -210,7 +211,13 @@ def _generate_result_xlsx(
                 input_keys.append(key)

     # Build headers
-    headers = ["index"] + input_keys + ["expected_output", "actual_output"] + all_metric_names + ["overall_score", "error"]
+    headers = (
+        ["index"]
+        + input_keys
+        + ["expected_output", "actual_output"]
+        + all_metric_names
+        + ["overall_score", "error"]
+    )

     # Write header row
     for col_idx, header in enumerate(headers, start=1):
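
To make the header layout concrete: assuming an item with a single input key "query" and metrics "faithfulness" and "context_recall" (all illustrative names), the concatenation above yields a row in this order:

headers == [
    "index", "query",
    "expected_output", "actual_output",
    "faithfulness", "context_recall",
    "overall_score", "error",
]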