mirror of
https://github.com/langgenius/dify.git
synced 2026-05-07 02:46:32 +08:00
Co-authored-by: jyong <718720800@qq.com> Co-authored-by: Yansong Zhang <916125788@qq.com> Co-authored-by: autofix-ci[bot] <114827586+autofix-ci[bot]@users.noreply.github.com> Co-authored-by: hj24 <mambahj24@gmail.com> Co-authored-by: hj24 <huangjian@dify.ai> Co-authored-by: Joel <iamjoel007@gmail.com> Co-authored-by: Stephen Zhou <38493346+hyoban@users.noreply.github.com> Co-authored-by: CodingOnStar <hanxujiang@dify.com> Co-authored-by: copilot-swe-agent[bot] <198982749+Copilot@users.noreply.github.com>
1414 lines
54 KiB
Python
1414 lines
54 KiB
Python
import csv
|
|
import io
|
|
import json
|
|
import logging
|
|
from collections.abc import Mapping
|
|
from typing import Any, Union
|
|
|
|
from openpyxl import Workbook, load_workbook
|
|
from openpyxl.styles import Alignment, Border, Font, PatternFill, Side
|
|
from openpyxl.utils import get_column_letter
|
|
from sqlalchemy.orm import Session
|
|
|
|
from configs import dify_config
|
|
from core.evaluation.entities.evaluation_entity import (
|
|
METRIC_NODE_TYPE_MAPPING,
|
|
METRIC_VALUE_TYPE_MAPPING,
|
|
DefaultMetric,
|
|
EvaluationCategory,
|
|
EvaluationConfigData,
|
|
EvaluationDatasetInput,
|
|
EvaluationItemResult,
|
|
EvaluationMetric,
|
|
EvaluationMetricName,
|
|
EvaluationRunData,
|
|
EvaluationRunRequest,
|
|
NodeInfo,
|
|
)
|
|
from core.evaluation.entities.judgment_entity import JudgmentConfig, JudgmentResult
|
|
from core.evaluation.evaluation_manager import EvaluationManager
|
|
from core.evaluation.judgment.processor import JudgmentProcessor
|
|
from graphon.enums import WorkflowNodeExecutionMetadataKey
|
|
from graphon.node_events.base import NodeRunResult
|
|
from libs.datetime_utils import naive_utc_now
|
|
from models.evaluation import (
|
|
EvaluationConfiguration,
|
|
EvaluationRun,
|
|
EvaluationRunItem,
|
|
EvaluationRunStatus,
|
|
EvaluationTargetType,
|
|
)
|
|
from models.model import App, AppMode
|
|
from models.snippet import CustomizedSnippet
|
|
from models.workflow import Workflow
|
|
from services.errors.evaluation import (
|
|
EvaluationDatasetInvalidError,
|
|
EvaluationFrameworkNotConfiguredError,
|
|
EvaluationMaxConcurrentRunsError,
|
|
EvaluationNotFoundError,
|
|
)
|
|
from services.snippet_service import SnippetService
|
|
from services.workflow_service import WorkflowService
|
|
|
|
logger = logging.getLogger(__name__)
|
|
|
|
|
|
class EvaluationService:
|
|
"""
|
|
Service for evaluation-related operations.
|
|
|
|
Provides functionality to generate evaluation dataset templates
|
|
based on App or Snippet input parameters.
|
|
"""
|
|
|
|
# Excluded app modes that don't support evaluation templates
|
|
EXCLUDED_APP_MODES = {AppMode.RAG_PIPELINE}
|
|
CONSOLE_DISABLED_CATEGORIES = {EvaluationCategory.AGENT}
|
|
CONSOLE_DISABLED_METRICS = {
|
|
EvaluationMetricName.TOOL_CORRECTNESS.value,
|
|
EvaluationMetricName.TASK_COMPLETION.value,
|
|
}
|
|
METRICS_REQUIRING_EXPECTED_OUTPUT = {
|
|
EvaluationMetricName.ANSWER_CORRECTNESS.value,
|
|
EvaluationMetricName.SEMANTIC_SIMILARITY.value,
|
|
EvaluationMetricName.CONTEXT_PRECISION.value,
|
|
EvaluationMetricName.CONTEXT_RECALL.value,
|
|
}
|
|
|
|
@classmethod
def generate_dataset_template(
    cls,
    target: Union[App, CustomizedSnippet],
    target_type: str,
) -> tuple[bytes, str]:
    """
    Build the evaluation dataset template (XLSX) for an App or Snippet.

    The target's input field definitions are resolved and handed to
    ``cls._generate_xlsx_template``. The download filename is derived from the
    target name, shortened to its first 10 characters plus ``...`` when the
    name is longer than 10 characters.

    :param target: App or CustomizedSnippet instance
    :param target_type: Target type string; must equal
        ``EvaluationTargetType.APPS.value`` or ``EvaluationTargetType.SNIPPETS.value``
    :return: Tuple of (xlsx_content_bytes, filename)
    :raises ValueError: If the target type is unsupported, the target instance does
        not match the target type, or the app mode is in ``EXCLUDED_APP_MODES``
    """
    apps_type = EvaluationTargetType.APPS.value
    snippets_type = EvaluationTargetType.SNIPPETS.value

    # Reject unknown target types up front so the branches below only deal
    # with the two supported kinds.
    if target_type != apps_type and target_type != snippets_type:
        raise ValueError(f"Unsupported target type: {target_type}")

    if target_type == apps_type:
        if not isinstance(target, App):
            raise ValueError("Invalid target: expected App instance")
        if AppMode.value_of(target.mode) in cls.EXCLUDED_APP_MODES:
            raise ValueError(f"App mode '{target.mode}' does not support evaluation templates")
        fields = cls._get_app_input_fields(target)
    else:
        if not isinstance(target, CustomizedSnippet):
            raise ValueError("Invalid target: expected CustomizedSnippet instance")
        fields = cls._get_snippet_input_fields(target)

    workbook_bytes = cls._generate_xlsx_template(fields, target.name)

    # Keep the download filename short: long names are cut to 10 characters.
    display_name = target.name
    if len(display_name) > 10:
        display_name = display_name[:10] + "..."

    return workbook_bytes, f"{display_name}-evaluation-dataset.xlsx"
|
|
|
|
@classmethod
def _get_app_input_fields(cls, app: App) -> list[dict]:
    """
    Return the user input form of the App's workflow.

    The published workflow is preferred; the draft workflow is used when no
    published one is returned.

    :param app: App instance
    :return: List of input field definitions, or an empty list when the app
        has neither a published nor a draft workflow
    """
    service = WorkflowService()
    workflow = service.get_published_workflow(app_model=app) or service.get_draft_workflow(app_model=app)
    return workflow.user_input_form() if workflow else []
|
|
|
|
@classmethod
def _get_snippet_input_fields(cls, snippet: CustomizedSnippet) -> list[dict]:
    """
    Return the input field definitions of a Snippet.

    Resolution order:

    1. the snippet's own ``input_fields_list`` when it is non-empty;
    2. the ``user_input_form()`` of the published workflow;
    3. the ``user_input_form()`` of the draft workflow.

    :param snippet: CustomizedSnippet instance
    :return: List of input field definitions (empty when nothing is available)
    """
    declared_fields = snippet.input_fields_list
    if declared_fields:
        return declared_fields

    # Nothing declared on the snippet itself: derive the fields from its workflow.
    service = SnippetService()
    workflow = service.get_published_workflow(snippet=snippet) or service.get_draft_workflow(snippet=snippet)
    if not workflow:
        return []
    return workflow.user_input_form()
|
|
|
|
@classmethod
def _generate_xlsx_template(cls, input_fields: list[dict], target_name: str) -> bytes:
    """
    Render the dataset template workbook and return it as XLSX bytes.

    The single sheet is titled ``"Evaluation Dataset"`` and contains:

    - a styled header row: ``index`` followed by one column per input field,
      named after the field's ``label`` (falling back to ``variable``);
    - one bordered data row whose ``index`` cell is pre-filled with ``1``.

    :param input_fields: List of input field definitions
    :param target_name: Name of the target; accepted for interface
        compatibility but not referenced by this implementation
    :return: XLSX file content as bytes
    """
    workbook = Workbook()
    sheet = workbook.active
    if sheet is None:
        sheet = workbook.create_sheet("Evaluation Dataset")
    sheet.title = "Evaluation Dataset"

    # Shared styles
    cell_border = Border(
        left=Side(style="thin"),
        right=Side(style="thin"),
        top=Side(style="thin"),
        bottom=Side(style="thin"),
    )
    title_font = Font(bold=True, color="FFFFFF")
    title_fill = PatternFill(start_color="4472C4", end_color="4472C4", fill_type="solid")
    title_alignment = Alignment(horizontal="center", vertical="center")

    # Header row: "index" first, then one title per input field
    column_titles = ["index"]
    column_titles.extend(str(spec.get("label") or spec.get("variable") or "") for spec in input_fields)
    column_count = len(column_titles)

    for position, title in enumerate(column_titles, start=1):
        header_cell = sheet.cell(row=1, column=position, value=title)
        header_cell.font = title_font
        header_cell.fill = title_fill
        header_cell.alignment = title_alignment
        header_cell.border = cell_border

    # Column widths: narrow index column, wider input columns
    sheet.column_dimensions["A"].width = 10
    for position in range(2, column_count + 1):
        sheet.column_dimensions[get_column_letter(position)].width = 20

    # One empty, bordered data row for the user to start from
    for position in range(1, column_count + 1):
        sheet.cell(row=2, column=position, value="").border = cell_border

    # The index cell of that row carries the row number, centred
    index_cell = sheet.cell(row=2, column=1)
    index_cell.value = 1
    index_cell.alignment = Alignment(horizontal="center")

    # Serialize the workbook in memory
    buffer = io.BytesIO()
    workbook.save(buffer)
    return buffer.getvalue()
|
|
|
|
@classmethod
def generate_retrieval_dataset_template(cls) -> tuple[bytes, str]:
    """Generate the evaluation dataset XLSX template for knowledge base retrieval.

    The sheet is titled ``"Evaluation Dataset"`` and has three columns:
    ``index``, ``query`` and ``expected_output``. A single bordered data row
    is added with its ``index`` cell pre-filled with ``1``.

    :returns: (xlsx_content_bytes, filename) where the filename is always
        ``retrieval-evaluation-dataset.xlsx``
    """
    workbook = Workbook()
    sheet = workbook.active
    if sheet is None:
        sheet = workbook.create_sheet("Evaluation Dataset")
    sheet.title = "Evaluation Dataset"

    # Shared styles
    cell_border = Border(
        left=Side(style="thin"),
        right=Side(style="thin"),
        top=Side(style="thin"),
        bottom=Side(style="thin"),
    )
    title_font = Font(bold=True, color="FFFFFF")
    title_fill = PatternFill(start_color="4472C4", end_color="4472C4", fill_type="solid")
    title_alignment = Alignment(horizontal="center", vertical="center")

    column_titles = ["index", "query", "expected_output"]
    for position, title in enumerate(column_titles, start=1):
        header_cell = sheet.cell(row=1, column=position, value=title)
        header_cell.font = title_font
        header_cell.fill = title_fill
        header_cell.alignment = title_alignment
        header_cell.border = cell_border

    # Fixed widths: narrow index column, wide text columns
    for letter, width in (("A", 10), ("B", 30), ("C", 30)):
        sheet.column_dimensions[letter].width = width

    # One empty, bordered sample row
    for position in range(1, len(column_titles) + 1):
        sheet.cell(row=2, column=position, value="").border = cell_border

    # The index cell of that row carries the row number, centred
    index_cell = sheet.cell(row=2, column=1)
    index_cell.value = 1
    index_cell.alignment = Alignment(horizontal="center")

    buffer = io.BytesIO()
    workbook.save(buffer)
    return buffer.getvalue(), "retrieval-evaluation-dataset.xlsx"
|
|
|
|
# ---- Evaluation Configuration CRUD ----
|
|
|
|
@classmethod
def get_evaluation_config(
    cls,
    session: Session,
    tenant_id: str,
    target_type: str,
    target_id: str,
) -> EvaluationConfiguration | None:
    """Look up the evaluation configuration stored for one target.

    :param session: SQLAlchemy session used for the query
    :param tenant_id: Tenant the configuration belongs to
    :param target_type: Target type string of the evaluated target
    :param target_id: ID of the evaluated target
    :return: The first matching ``EvaluationConfiguration``, or ``None``
    """
    lookup = session.query(EvaluationConfiguration).filter_by(
        tenant_id=tenant_id,
        target_type=target_type,
        target_id=target_id,
    )
    return lookup.first()
|
|
|
|
@classmethod
def save_evaluation_config(
    cls,
    session: Session,
    tenant_id: str,
    target_type: str,
    target_id: str,
    account_id: str,
    data: EvaluationConfigData,
) -> EvaluationConfiguration:
    """Create or update the evaluation configuration of a target.

    An existing configuration (looked up via :meth:`get_evaluation_config`) is
    updated in place; otherwise a new row is added. The change is committed and
    the instance refreshed before it is returned.

    :param session: SQLAlchemy session; this method commits on it
    :param tenant_id: Tenant the configuration belongs to
    :param target_type: Target type string of the evaluated target
    :param target_id: ID of the evaluated target
    :param account_id: Account recorded as ``updated_by`` (and ``created_by``
        when the row is new)
    :param data: Evaluation settings to persist
    :return: The persisted ``EvaluationConfiguration``
    """
    config = cls.get_evaluation_config(session, tenant_id, target_type, target_id)
    if config is None:
        config = EvaluationConfiguration(
            tenant_id=tenant_id,
            target_type=target_type,
            target_id=target_id,
            created_by=account_id,
            updated_by=account_id,
        )
        session.add(config)

    customized = data.customized_metrics
    judgment = data.judgment_config

    # Metrics and judgment settings are stored as JSON text.
    metrics_payload = {
        "default_metrics": [metric.model_dump() for metric in data.default_metrics],
        "customized_metrics": customized.model_dump() if customized else None,
    }
    judgment_payload = judgment.model_dump() if judgment else {}

    config.evaluation_model_provider = data.evaluation_model_provider
    config.evaluation_model = data.evaluation_model
    config.metrics_config = json.dumps(metrics_payload)
    config.judgement_conditions = json.dumps(judgment_payload)
    config.customized_workflow_id = customized.evaluation_workflow_id if customized else None
    config.updated_by = account_id

    session.commit()
    session.refresh(config)
    return config
|
|
|
|
@classmethod
def list_targets_by_customized_workflow(
    cls,
    session: Session,
    tenant_id: str,
    customized_workflow_id: str,
) -> list[EvaluationConfiguration]:
    """Return every evaluation config of the tenant whose ``customized_workflow_id`` matches.

    :param session: SQLAlchemy session used for the query
    :param tenant_id: Tenant to search in
    :param customized_workflow_id: Workflow ID referenced as customized metrics
    :return: List of matching ``EvaluationConfiguration`` rows
    """
    from sqlalchemy import select

    statement = select(EvaluationConfiguration).where(
        EvaluationConfiguration.tenant_id == tenant_id,
        EvaluationConfiguration.customized_workflow_id == customized_workflow_id,
    )
    matches = session.scalars(statement).all()
    return list(matches)
|
|
|
|
# ---- Evaluation Run Management ----
|
|
|
|
@classmethod
def start_evaluation_run(
    cls,
    session: Session,
    tenant_id: str,
    target_type: str,
    target_id: str,
    account_id: str,
    dataset_file_content: bytes,
    dataset_filename: str,
    run_request: EvaluationRunRequest,
) -> EvaluationRun:
    """Validate the request, create a PENDING run and dispatch the Celery task.

    Steps, in order:

    1. fail if no evaluation framework instance is configured;
    2. save ``run_request`` as the latest EvaluationConfiguration (committed);
    3. enforce ``EVALUATION_MAX_CONCURRENT_RUNS`` over the tenant's
       PENDING/RUNNING runs;
    4. parse the dataset and enforce ``EVALUATION_MAX_DATASET_ROWS``;
    5. persist the run, dispatch ``run_evaluation`` and store the task id.

    :param session: SQLAlchemy session; this method commits on it
    :param tenant_id: Tenant starting the run
    :param target_type: Target type string of the evaluated target
    :param target_id: ID of the evaluated target
    :param account_id: Account recorded as creator of the run
    :param dataset_file_content: Raw bytes of the uploaded dataset file
    :param dataset_filename: Name of the uploaded dataset file
    :param run_request: Evaluation settings for this run
    :return: The created ``EvaluationRun`` with ``celery_task_id`` set
    :raises EvaluationFrameworkNotConfiguredError: If no framework is configured
    :raises EvaluationMaxConcurrentRunsError: If the concurrent run limit is reached
    :raises EvaluationDatasetInvalidError: If the dataset has too many rows
    """
    if EvaluationManager.get_evaluation_instance() is None:
        raise EvaluationFrameworkNotConfiguredError()

    # The request doubles as the latest saved configuration for this target.
    saved_config = cls.save_evaluation_config(
        session=session,
        tenant_id=tenant_id,
        target_type=target_type,
        target_id=target_id,
        account_id=account_id,
        data=run_request,
    )

    # Concurrency limit counts runs that are still pending or running.
    in_flight_statuses = [EvaluationRunStatus.PENDING, EvaluationRunStatus.RUNNING]
    in_flight_count = (
        session.query(EvaluationRun)
        .filter_by(tenant_id=tenant_id)
        .filter(EvaluationRun.status.in_(in_flight_statuses))
        .count()
    )
    run_limit = dify_config.EVALUATION_MAX_CONCURRENT_RUNS
    if in_flight_count >= run_limit:
        raise EvaluationMaxConcurrentRunsError(f"Maximum concurrent runs ({run_limit}) reached.")

    # Dataset parsing and size limit
    dataset_items = cls._parse_dataset(dataset_file_content, dataset_filename)
    row_limit = dify_config.EVALUATION_MAX_DATASET_ROWS
    row_count = len(dataset_items)
    if row_count > row_limit:
        raise EvaluationDatasetInvalidError(f"Dataset has {row_count} rows, max is {row_limit}.")

    # Persist the run first so the task payload can carry its id.
    evaluation_run = EvaluationRun(
        tenant_id=tenant_id,
        target_type=target_type,
        target_id=target_id,
        evaluation_config_id=saved_config.id,
        status=EvaluationRunStatus.PENDING,
        dataset_file_id=run_request.file_id,
        total_items=row_count,
        created_by=account_id,
    )
    session.add(evaluation_run)
    session.commit()
    session.refresh(evaluation_run)

    task_payload = EvaluationRunData(
        evaluation_run_id=evaluation_run.id,
        tenant_id=tenant_id,
        target_type=target_type,
        target_id=target_id,
        evaluation_model_provider=run_request.evaluation_model_provider,
        evaluation_model=run_request.evaluation_model,
        default_metrics=run_request.default_metrics,
        customized_metrics=run_request.customized_metrics,
        judgment_config=run_request.judgment_config,
        input_list=dataset_items,
    )

    # Imported at call time, as in the rest of this service.
    from tasks.evaluation_task import run_evaluation

    async_result = run_evaluation.delay(task_payload.model_dump())
    evaluation_run.celery_task_id = async_result.id
    session.commit()

    return evaluation_run
|
|
|
|
@classmethod
def start_stub_evaluation_run(
    cls,
    session: Session,
    tenant_id: str,
    target_type: str,
    target_id: str,
    account_id: str,
    dataset_file_content: bytes,
    dataset_filename: str,
    run_request: EvaluationRunRequest,
) -> EvaluationRun:
    """Persist an already-COMPLETED synthetic run for frontend integration testing.

    This temporary path keeps the existing read flows (`logs`, `run detail`,
    and result-file download) working for app evaluations while the real
    execution logic is moved to `/evaluation/run1` for backend iteration.

    No Celery task is dispatched: results come from ``cls._build_stub_results``
    and are persisted and exported synchronously with the helpers imported
    from ``tasks.evaluation_task``.

    :param session: SQLAlchemy session; this method commits on it
    :param tenant_id: Tenant starting the run
    :param target_type: Target type string of the evaluated target
    :param target_id: ID of the evaluated target
    :param account_id: Account recorded as creator of the run and result file
    :param dataset_file_content: Raw bytes of the uploaded dataset file
    :param dataset_filename: Name of the uploaded dataset file
    :param run_request: Evaluation settings for this run
    :return: The persisted, completed ``EvaluationRun``
    :raises EvaluationDatasetInvalidError: If the dataset has too many rows
    """
    from tasks.evaluation_task import (
        _compute_metrics_summary,
        _generate_result_xlsx,
        _persist_results,
        _store_result_file,
    )

    saved_config = cls.save_evaluation_config(
        session=session,
        tenant_id=tenant_id,
        target_type=target_type,
        target_id=target_id,
        account_id=account_id,
        data=run_request,
    )

    dataset_items = cls._parse_dataset(dataset_file_content, dataset_filename)
    row_limit = dify_config.EVALUATION_MAX_DATASET_ROWS
    row_count = len(dataset_items)
    if row_count > row_limit:
        raise EvaluationDatasetInvalidError(f"Dataset has {row_count} rows, max is {row_limit}.")

    # Start and completion share one timestamp: nothing is actually executed.
    timestamp = naive_utc_now()
    stub_results = cls._build_stub_results(input_list=dataset_items, run_request=run_request)
    summary = _compute_metrics_summary(stub_results, run_request.judgment_config)

    evaluation_run = EvaluationRun(
        tenant_id=tenant_id,
        target_type=target_type,
        target_id=target_id,
        evaluation_config_id=saved_config.id,
        status=EvaluationRunStatus.COMPLETED,
        dataset_file_id=run_request.file_id,
        total_items=row_count,
        completed_items=row_count,
        failed_items=0,
        metrics_summary=json.dumps(summary),
        created_by=account_id,
        started_at=timestamp,
        completed_at=timestamp,
    )
    session.add(evaluation_run)
    session.commit()
    session.refresh(evaluation_run)

    _persist_results(session, evaluation_run.id, stub_results, dataset_items)

    workbook_bytes = _generate_result_xlsx(dataset_items, stub_results)
    stored_file_id = _store_result_file(
        tenant_id,
        evaluation_run.id,
        workbook_bytes,
        session,
        created_by=account_id,
    )
    if stored_file_id:
        evaluation_run.result_file_id = stored_file_id
    # NOTE(review): the commit/refresh below are assumed to run whether or not a
    # result file was stored — confirm against the original indentation.
    session.commit()
    session.refresh(evaluation_run)

    return evaluation_run
|
|
|
|
@classmethod
def get_evaluation_runs(
    cls,
    session: Session,
    tenant_id: str,
    target_type: str,
    target_id: str,
    page: int = 1,
    page_size: int = 20,
) -> tuple[list[EvaluationRun], int]:
    """Query the evaluation run history of one target, newest first, with pagination.

    Out-of-range paging arguments are clamped instead of being passed to the
    database as a negative OFFSET/LIMIT: ``page`` values below 1 are treated
    as the first page and negative ``page_size`` values as 0.

    :param session: SQLAlchemy session used for the query
    :param tenant_id: Tenant the runs belong to
    :param target_type: Target type string of the evaluated target
    :param target_id: ID of the evaluated target
    :param page: 1-based page number
    :param page_size: Maximum number of runs per page
    :return: Tuple of (runs on the requested page, total number of matching runs)
    """
    limit = max(page_size, 0)
    offset = max(page - 1, 0) * limit

    query = (
        session.query(EvaluationRun)
        .filter_by(tenant_id=tenant_id, target_type=target_type, target_id=target_id)
        .order_by(EvaluationRun.created_at.desc())
    )
    total = query.count()
    runs = query.offset(offset).limit(limit).all()
    return runs, total
|
|
|
|
@classmethod
def get_evaluation_run_detail(
    cls,
    session: Session,
    tenant_id: str,
    run_id: str,
) -> EvaluationRun:
    """Fetch one evaluation run of a tenant.

    :param session: SQLAlchemy session used for the query
    :param tenant_id: Tenant the run must belong to
    :param run_id: ID of the evaluation run
    :return: The matching ``EvaluationRun``
    :raises EvaluationNotFoundError: If no run matches both IDs
    """
    found = session.query(EvaluationRun).filter_by(id=run_id, tenant_id=tenant_id).first()
    if found:
        return found
    raise EvaluationNotFoundError("Evaluation run not found.")
|
|
|
|
@classmethod
def get_evaluation_run_items(
    cls,
    session: Session,
    run_id: str,
    page: int = 1,
    page_size: int = 50,
) -> tuple[list[EvaluationRunItem], int]:
    """Query the items of one evaluation run, ordered by ``item_index``, with pagination.

    Out-of-range paging arguments are clamped instead of being passed to the
    database as a negative OFFSET/LIMIT: ``page`` values below 1 are treated
    as the first page and negative ``page_size`` values as 0.

    The query filters on ``run_id`` only; no tenant filter is applied here.

    :param session: SQLAlchemy session used for the query
    :param run_id: ID of the evaluation run
    :param page: 1-based page number
    :param page_size: Maximum number of items per page
    :return: Tuple of (items on the requested page, total number of items)
    """
    limit = max(page_size, 0)
    offset = max(page - 1, 0) * limit

    query = (
        session.query(EvaluationRunItem)
        .filter_by(evaluation_run_id=run_id)
        .order_by(EvaluationRunItem.item_index.asc())
    )
    total = query.count()
    items = query.offset(offset).limit(limit).all()
    return items, total
|
|
|
|
@classmethod
def cancel_evaluation_run(
    cls,
    session: Session,
    tenant_id: str,
    run_id: str,
) -> EvaluationRun:
    """Mark a pending or running evaluation run as cancelled.

    When the run has a Celery task id, the task is revoked with
    ``terminate=True``. A failure to revoke is logged and does not prevent
    the status change from being committed.

    :param session: SQLAlchemy session; this method commits on it
    :param tenant_id: Tenant the run must belong to
    :param run_id: ID of the evaluation run
    :return: The updated ``EvaluationRun``
    :raises EvaluationNotFoundError: If the run does not exist for the tenant
    :raises ValueError: If the run is neither PENDING nor RUNNING
    """
    run = cls.get_evaluation_run_detail(session, tenant_id, run_id)

    cancellable = (EvaluationRunStatus.PENDING, EvaluationRunStatus.RUNNING)
    if run.status not in cancellable:
        raise ValueError(f"Cannot cancel evaluation run in status: {run.status}")

    run.status = EvaluationRunStatus.CANCELLED

    task_id = run.celery_task_id
    if task_id:
        # Best effort: the run is cancelled in the database even if the
        # worker cannot be reached.
        try:
            from celery import current_app as celery_app

            celery_app.control.revoke(task_id, terminate=True)
        except Exception:
            logger.exception("Failed to revoke Celery task %s", task_id)

    session.commit()
    return run
|
|
|
|
@classmethod
def get_supported_metrics(cls, category: EvaluationCategory) -> list[str]:
    """Return the metrics ``EvaluationManager`` reports as supported for *category*."""
    supported = EvaluationManager.get_supported_metrics(category)
    return supported
|
|
|
|
@classmethod
def get_available_metrics(cls) -> list[str]:
    """Return the value of every ``EvaluationMetricName`` member, in definition order."""
    names: list[str] = []
    for member in EvaluationMetricName:
        names.append(member.value)
    return names
|
|
|
|
@classmethod
def filter_console_default_metrics(
    cls,
    default_metrics: list[DefaultMetric | Mapping[str, Any]],
) -> list[DefaultMetric]:
    """Drop console-disabled metrics and nodes from a list of default metrics.

    A metric is removed when its name is in ``CONSOLE_DISABLED_METRICS`` or
    when none of its nodes remain after removing nodes whose type is in
    ``CONSOLE_DISABLED_CATEGORIES``. Surviving metrics are returned as copies
    carrying only the remaining nodes.

    :param default_metrics: ``DefaultMetric`` instances or raw mappings that
        validate as ``DefaultMetric``
    :return: Filtered list of ``DefaultMetric`` copies, in input order
    """
    kept: list[DefaultMetric] = []
    for entry in default_metrics:
        if isinstance(entry, DefaultMetric):
            metric = entry
        else:
            metric = DefaultMetric.model_validate(entry)

        if metric.metric in cls.CONSOLE_DISABLED_METRICS:
            continue

        remaining_nodes = [
            node for node in metric.node_info_list if node.type not in cls.CONSOLE_DISABLED_CATEGORIES
        ]
        if remaining_nodes:
            kept.append(metric.model_copy(update={"node_info_list": remaining_nodes}))
    return kept
|
|
|
|
@classmethod
def filter_console_judgment_config(
    cls,
    judgment_config: JudgmentConfig | Mapping[str, Any] | None,
) -> JudgmentConfig | None:
    """Remove judgment conditions that reference console-disabled metrics.

    A condition is dropped when its ``variable_selector`` has at least two
    entries and the second one is in ``CONSOLE_DISABLED_METRICS``.

    :param judgment_config: ``JudgmentConfig`` instance, raw mapping that
        validates as one, or ``None``
    :return: Copy of the config with the remaining conditions, or ``None``
        when the input is ``None`` or no condition remains
    """
    if judgment_config is None:
        return None

    if isinstance(judgment_config, JudgmentConfig):
        config = judgment_config
    else:
        config = JudgmentConfig.model_validate(judgment_config)

    hidden_metrics = cls.CONSOLE_DISABLED_METRICS
    kept = []
    for condition in config.conditions:
        selector = condition.variable_selector
        references_hidden = len(selector) >= 2 and selector[1] in hidden_metrics
        if not references_hidden:
            kept.append(condition)

    if kept:
        return config.model_copy(update={"conditions": kept})
    return None
|
|
|
|
@classmethod
def serialize_console_default_metrics(
    cls,
    default_metrics: list[Mapping[str, Any]],
) -> list[dict[str, Any]]:
    """Filter default metrics for the console and dump them to plain dicts.

    :param default_metrics: Raw default metric mappings
    :return: ``model_dump()`` of every metric kept by
        :meth:`filter_console_default_metrics`, in the same order
    """
    visible_metrics = cls.filter_console_default_metrics(default_metrics)
    payload: list[dict[str, Any]] = []
    for metric in visible_metrics:
        payload.append(metric.model_dump())
    return payload
|
|
|
|
@classmethod
|
|
def serialize_console_judgment_config(
|
|
cls,
|
|
judgment_config: Mapping[str, Any] | None,
|
|
) -> dict[str, Any] | None:
|
|
"""Return sanitized judgment config payload for console responses."""
|
|
filtered = cls.filter_console_judgment_config(judgment_config)
|
|
return filtered.model_dump() if filtered else None
|
|
|
|
@classmethod
def get_dataset_column_names(
    cls,
    target: Union[App, CustomizedSnippet],
    target_type: str,
    data: EvaluationConfigData,
) -> list[str]:
    """Build the dataset column names for a target and evaluation config.

    :param target: App or CustomizedSnippet instance
    :param target_type: Target type string of *target*
    :param data: Evaluation config whose ``default_metrics`` decide which
        expected-output columns are needed
    :return: ``"index"``, then the target's input columns, then the
        expected-output columns
    """
    columns = ["index"]
    columns.extend(cls._get_target_input_column_names(target, target_type))
    columns.extend(cls._get_expected_output_column_names(data.default_metrics))
    return columns
|
|
|
|
@classmethod
def _get_target_input_column_names(
    cls,
    target: Union[App, CustomizedSnippet],
    target_type: str,
) -> list[str]:
    """Resolve the target's input field names as dataset column names.

    Each field contributes its ``variable`` (falling back to ``label``),
    stripped of surrounding whitespace. Empty names and repeats are skipped;
    the order of first occurrence is kept.

    :param target: App or CustomizedSnippet instance
    :param target_type: Target type string of *target*
    :return: De-duplicated column names
    :raises ValueError: If the target type is unsupported or does not match
        the type of *target*
    """
    if target_type == EvaluationTargetType.APPS.value and isinstance(target, App):
        fields = cls._get_app_input_fields(target)
    elif target_type == EvaluationTargetType.SNIPPETS.value and isinstance(target, CustomizedSnippet):
        fields = cls._get_snippet_input_fields(target)
    else:
        raise ValueError(f"Unsupported target type: {target_type}")

    # A dict keeps insertion order, giving an ordered de-duplication.
    unique_names: dict[str, None] = {}
    for spec in fields:
        name = str(spec.get("variable") or spec.get("label") or "").strip()
        if name:
            unique_names.setdefault(name, None)
    return list(unique_names)
|
|
|
|
@classmethod
def _get_expected_output_column_names(
    cls,
    default_metrics: list[DefaultMetric | Mapping[str, Any]],
) -> list[str]:
    """Build the ``expected_output`` column names required by the selected metrics.

    Only metrics that survive :meth:`filter_console_default_metrics` and are
    listed in ``METRICS_REQUIRING_EXPECTED_OUTPUT`` contribute. Each of their
    nodes yields ``"<title or node_id> : expected_output"``; repeated names
    are emitted once, in order of first occurrence.

    :param default_metrics: ``DefaultMetric`` instances or raw mappings
    :return: De-duplicated expected-output column names
    """
    unique_columns: dict[str, None] = {}
    for metric in cls.filter_console_default_metrics(default_metrics):
        if metric.metric not in cls.METRICS_REQUIRING_EXPECTED_OUTPUT:
            continue
        for node in metric.node_info_list:
            heading = node.title or node.node_id
            unique_columns.setdefault(f"{heading} : expected_output", None)
    return list(unique_columns)
|
|
|
|
@classmethod
def _nodes_for_metrics_from_workflow(
    cls,
    workflow: Workflow,
    metrics: list[str],
) -> dict[str, list[dict[str, str]]]:
    """Group the workflow's nodes by the metrics that apply to them.

    Nodes are first bucketed by their ``type`` (nodes whose type is in
    ``CONSOLE_DISABLED_CATEGORIES`` are skipped). Each requested metric is
    then mapped, via ``METRIC_NODE_TYPE_MAPPING``, to the bucket of its
    required node type.

    Every metric receives its own list, so mutating the list of one metric
    never affects another metric that maps to the same node type.

    :param workflow: Workflow whose nodes are walked
    :param metrics: Metric names to resolve
    :return: ``{metric_name: [NodeInfo dict, ...]}``; metrics without a
        mapping entry or without matching nodes map to an empty list
    """
    nodes_by_type: dict[str, list[dict[str, str]]] = {}
    for node_id, node_data in workflow.walk_nodes():
        node_type = node_data.get("type", "")
        if node_type in cls.CONSOLE_DISABLED_CATEGORIES:
            continue
        node_dict = NodeInfo(node_id=node_id, type=node_type, title=node_data.get("title", "")).model_dump()
        nodes_by_type.setdefault(node_type, []).append(node_dict)

    nodes_by_metric: dict[str, list[dict[str, str]]] = {}
    for metric in metrics:
        required_type = METRIC_NODE_TYPE_MAPPING.get(metric)
        if required_type is None:
            nodes_by_metric[metric] = []
            continue
        # Copy the bucket: several metrics can share one node type and must
        # not end up holding the very same list object.
        nodes_by_metric[metric] = list(nodes_by_type.get(required_type, []))
    return nodes_by_metric
|
|
|
|
@classmethod
def _union_supported_metric_names(cls) -> list[str]:
    """Return the metric names supported for any :class:`EvaluationCategory`.

    Categories are visited in enum order; each name appears once, at the
    position where it was first reported by :meth:`get_supported_metrics`.
    """
    every_name = (
        metric_name for category in EvaluationCategory for metric_name in cls.get_supported_metrics(category)
    )
    # dict.fromkeys removes repeats while keeping first-seen order.
    return list(dict.fromkeys(every_name))
|
|
|
|
@classmethod
def get_default_metrics_with_nodes_for_published_target(
    cls,
    target: Union[App, CustomizedSnippet],
    target_type: str,
) -> list[DefaultMetric]:
    """List default metrics with their matching nodes from the *published* workflow.

    Candidate metrics are the names returned by
    :meth:`_union_supported_metric_names` that also appear in
    :data:`METRIC_NODE_TYPE_MAPPING`, sorted by name. Node lists come from the
    published workflow only (no draft fallback). The value type of each metric
    is taken from :data:`METRIC_VALUE_TYPE_MAPPING`, defaulting to ``"number"``.

    :param target: App or CustomizedSnippet instance
    :param target_type: Target type string of *target*
    :return: One ``DefaultMetric`` per candidate metric; empty when there is
        no published workflow or no candidate metric
    """
    published = cls._resolve_published_workflow(target, target_type)
    if not published:
        return []

    candidates = sorted(name for name in cls._union_supported_metric_names() if name in METRIC_NODE_TYPE_MAPPING)
    if not candidates:
        return []

    nodes_by_metric = cls._nodes_for_metrics_from_workflow(published, candidates)

    defaults: list[DefaultMetric] = []
    for name in candidates:
        node_infos = [NodeInfo.model_validate(raw) for raw in nodes_by_metric.get(name, [])]
        defaults.append(
            DefaultMetric(
                metric=name,
                value_type=METRIC_VALUE_TYPE_MAPPING.get(name, "number"),
                node_info_list=node_infos,
            )
        )
    return defaults
|
|
|
|
@classmethod
def get_nodes_for_metrics(
    cls,
    target: Union[App, CustomizedSnippet],
    target_type: str,
    metrics: list[str] | None = None,
) -> dict[str, list[dict[str, str]]]:
    """Return node info grouped by metric, or all nodes when no metrics are given.

    The workflow is resolved with :meth:`_resolve_workflow` (published
    preferred, draft as fallback). Nodes whose type is in
    ``CONSOLE_DISABLED_CATEGORIES`` are never included.

    :param target: App or CustomizedSnippet instance.
    :param target_type: ``"apps"`` or ``"snippets"``.
    :param metrics: Optional list of metric names to filter by.
        When *None* or empty, returns ``{"all": [<every node>]}``.
    :returns: ``{metric_name: [NodeInfo dict, ...]}`` or
        ``{"all": [NodeInfo dict, ...]}``; all lists are empty when no
        workflow can be resolved.
    """
    workflow = cls._resolve_workflow(target, target_type)
    wants_all = not metrics

    if not workflow:
        if wants_all:
            return {"all": []}
        return {name: [] for name in metrics}

    if not wants_all:
        return cls._nodes_for_metrics_from_workflow(workflow, metrics)

    visible_nodes = []
    for node_id, node_data in workflow.walk_nodes():
        node_type = node_data.get("type", "")
        if node_type in cls.CONSOLE_DISABLED_CATEGORIES:
            continue
        info = NodeInfo(node_id=node_id, type=node_type, title=node_data.get("title", ""))
        visible_nodes.append(info.model_dump())
    return {"all": visible_nodes}
|
|
|
|
@classmethod
def _resolve_published_workflow(
    cls,
    target: Union[App, CustomizedSnippet],
    target_type: str,
) -> Workflow | None:
    """Resolve only the published workflow of the target (no draft fallback).

    :param target: App or CustomizedSnippet instance
    :param target_type: Target type string of *target*
    :return: Whatever the matching service returns for the published workflow,
        or ``None`` when the target type is unsupported or does not match the
        type of *target*
    """
    is_app = target_type == EvaluationTargetType.APPS.value and isinstance(target, App)
    is_snippet = target_type == EvaluationTargetType.SNIPPETS.value and isinstance(target, CustomizedSnippet)

    if is_snippet:
        return SnippetService().get_published_workflow(snippet=target)
    if is_app:
        return WorkflowService().get_published_workflow(app_model=target)
    return None
|
|
|
|
@classmethod
def _resolve_workflow(
    cls,
    target: Union[App, CustomizedSnippet],
    target_type: str,
) -> Workflow | None:
    """Resolve the *published* workflow of the target, falling back to the *draft*.

    :param target: App or CustomizedSnippet instance
    :param target_type: Target type string of *target*
    :return: The published workflow when there is one, otherwise whatever the
        draft lookup returns; ``None`` when the target type is unsupported or
        does not match the type of *target*
    """
    if target_type == EvaluationTargetType.SNIPPETS.value and isinstance(target, CustomizedSnippet):
        snippets = SnippetService()
        return snippets.get_published_workflow(snippet=target) or snippets.get_draft_workflow(snippet=target)

    if target_type == EvaluationTargetType.APPS.value and isinstance(target, App):
        workflows = WorkflowService()
        return workflows.get_published_workflow(app_model=target) or workflows.get_draft_workflow(app_model=target)

    return None
|
|
|
|
# ---- Category Resolution ----
|
|
|
|
@classmethod
def _resolve_evaluation_category(cls, default_metrics: list[DefaultMetric]) -> EvaluationCategory:
    """Derive the evaluation category from the node types in ``default_metrics``.

    Node infos are scanned metric by metric, in order. The first node type
    that is a valid :class:`EvaluationCategory` value wins; types that are not
    valid categories are skipped.

    :param default_metrics: Metrics whose ``node_info_list`` entries are inspected
    :return: The first matching category, or ``EvaluationCategory.LLM`` when
        no node type matches (including when no metrics are provided)
    """
    node_types = (info.type for metric in default_metrics for info in metric.node_info_list)
    for node_type in node_types:
        try:
            return EvaluationCategory(node_type)
        except ValueError:
            # Not a category value; keep looking.
            continue
    return EvaluationCategory.LLM
|
|
|
|
@classmethod
def execute_targets(
    cls,
    tenant_id: str,
    target_type: str,
    target_id: str,
    input_list: list[EvaluationDatasetInput],
    max_workers: int = 5,
) -> tuple[list[dict[str, NodeRunResult]], list[str | None]]:
    """Run the evaluation target once per dataset item using a thread pool.

    Each item is handled in its own worker call, which enters the Flask
    application context and opens a dedicated SQLAlchemy ``Session``. A
    failing item never aborts the batch: the failure is logged and the item
    yields ``({}, None)``.

    :param tenant_id: Workspace / tenant ID.
    :param target_type: ``"apps"`` or ``"snippets"``.
    :param target_id: ID of the App or CustomizedSnippet.
    :param input_list: All test-data items parsed from the dataset.
    :param max_workers: Maximum number of parallel worker threads.
    :return: Tuple of (node_results, workflow_run_ids).
        node_results: ordered list of ``{node_id: NodeRunResult}`` mappings;
        the *i*-th element corresponds to ``input_list[i]``.
        workflow_run_ids: ordered list of workflow_run_id strings (or None)
        for each input item.
    """
    from concurrent.futures import ThreadPoolExecutor

    from flask import Flask, current_app

    # Capture the concrete app object so worker threads can push their own
    # application context.
    flask_app: Flask = current_app._get_current_object()  # type: ignore

    def _execute_item(item: EvaluationDatasetInput) -> tuple[dict[str, NodeRunResult], str | None]:
        with flask_app.app_context():
            from models.engine import db

            with Session(db.engine, expire_on_commit=False) as worker_session:
                try:
                    response = cls._run_single_target(
                        session=worker_session,
                        target_type=target_type,
                        target_id=target_id,
                        item=item,
                    )

                    run_id = cls._extract_workflow_run_id(response)
                    if not run_id:
                        logger.warning(
                            "No workflow_run_id for item %d (target=%s)",
                            item.index,
                            target_id,
                        )
                        return {}, None

                    per_node = cls._query_node_run_results(
                        session=worker_session,
                        tenant_id=tenant_id,
                        app_id=target_id,
                        workflow_run_id=run_id,
                    )
                    return per_node, run_id
                except Exception:
                    logger.exception(
                        "Target execution failed for item %d (target=%s)",
                        item.index,
                        target_id,
                    )
                    return {}, None

    results_in_order: list[dict[str, NodeRunResult]] = []
    run_ids_in_order: list[str | None] = []

    with ThreadPoolExecutor(max_workers=max_workers) as pool:
        # Futures are collected in submission order so outputs line up with input_list.
        pending = [pool.submit(_execute_item, item) for item in input_list]
        for handle in pending:
            try:
                per_node, run_id = handle.result()
            except Exception:
                logger.exception("Unexpected error collecting target execution result")
                per_node, run_id = {}, None
            results_in_order.append(per_node)
            run_ids_in_order.append(run_id)

    return results_in_order, run_ids_in_order
|
|
|
|
@classmethod
def _run_single_target(
    cls,
    session: Session,
    target_type: str,
    target_id: str,
    item: EvaluationDatasetInput,
) -> Mapping[str, object]:
    """Run one evaluation target against a single dataset item.

    The execution path is chosen from ``target_type``:

    * the snippets target type → ``SnippetGenerateService.run_published``
    * any other value (handled as an app) → ``WorkflowAppGenerator().generate``
      called with ``streaming=False``

    In both paths the item's ``inputs`` are passed as ``args["inputs"]`` and
    the call is made on behalf of the target's service account.

    :returns: The mapping returned by the selected execution service.
    :raises ValueError: If the snippet or app row cannot be found, or the app
        has no published workflow.
    """
    from core.app.apps.workflow.app_generator import WorkflowAppGenerator
    from core.app.entities.app_invoke_entities import InvokeFrom
    from core.evaluation.runners import get_service_account_for_app, get_service_account_for_snippet

    if target_type == EvaluationTargetType.SNIPPETS.value:
        from services.snippet_generate_service import SnippetGenerateService

        snippet_row = session.query(CustomizedSnippet).filter_by(id=target_id).first()
        if not snippet_row:
            raise ValueError(f"Snippet {target_id} not found")

        snippet_account = get_service_account_for_snippet(session, target_id)
        return SnippetGenerateService.run_published(
            snippet=snippet_row,
            user=snippet_account,
            args={"inputs": item.inputs},
            invoke_from=InvokeFrom.SERVICE_API,
        )

    # Every non-snippet target type is executed as an app workflow.
    app_row = session.query(App).filter_by(id=target_id).first()
    if not app_row:
        raise ValueError(f"App {target_id} not found")

    app_account = get_service_account_for_app(session, target_id)

    published_workflow = WorkflowService().get_published_workflow(app_model=app_row)
    if not published_workflow:
        raise ValueError(f"No published workflow for app {target_id}")

    blocking_response: Mapping[str, object] = WorkflowAppGenerator().generate(
        app_model=app_row,
        workflow=published_workflow,
        user=app_account,
        args={"inputs": item.inputs},
        invoke_from=InvokeFrom.SERVICE_API,
        streaming=False,
        call_depth=0,
    )
    return blocking_response
|
|
|
|
@staticmethod
|
|
def _extract_workflow_run_id(response: Mapping[str, object]) -> str | None:
|
|
"""Extract ``workflow_run_id`` from a blocking workflow response."""
|
|
wf_run_id = response.get("workflow_run_id")
|
|
if wf_run_id:
|
|
return str(wf_run_id)
|
|
data = response.get("data")
|
|
if isinstance(data, Mapping) and data.get("id"):
|
|
return str(data["id"])
|
|
return None
|
|
|
|
@staticmethod
def _query_node_run_results(
    session: Session,
    tenant_id: str,
    app_id: str,
    workflow_run_id: str,
) -> dict[str, NodeRunResult]:
    """Load the node execution rows of one workflow run as ``NodeRunResult`` objects.

    Rows are filtered by tenant, app and workflow run, and read in ascending
    ``created_at`` order. The returned dict is keyed by ``node_id``; when
    several rows share a ``node_id`` the one read last replaces earlier ones.
    """
    from sqlalchemy import asc, select

    from graphon.enums import WorkflowNodeExecutionStatus
    from models.workflow import WorkflowNodeExecutionModel

    base_query = WorkflowNodeExecutionModel.preload_offload_data(select(WorkflowNodeExecutionModel))
    filtered_query = base_query.where(
        WorkflowNodeExecutionModel.tenant_id == tenant_id,
        WorkflowNodeExecutionModel.app_id == app_id,
        WorkflowNodeExecutionModel.workflow_run_id == workflow_run_id,
    )
    statement = filtered_query.order_by(asc(WorkflowNodeExecutionModel.created_at))

    executions: list[WorkflowNodeExecutionModel] = list(session.execute(statement).scalars().all())

    results_by_node: dict[str, NodeRunResult] = {}
    for execution in executions:
        # Re-key the string-keyed metadata by enum member; keys that are not
        # valid WorkflowNodeExecutionMetadataKey values are dropped.
        metadata: dict[WorkflowNodeExecutionMetadataKey, object] = {}
        for raw_key, raw_value in execution.execution_metadata_dict.items():
            try:
                metadata_key = WorkflowNodeExecutionMetadataKey(raw_key)
            except ValueError:
                continue
            metadata[metadata_key] = raw_value

        results_by_node[execution.node_id] = NodeRunResult(
            status=WorkflowNodeExecutionStatus(execution.status),
            inputs=execution.inputs_dict or {},
            process_data=execution.process_data_dict or {},
            outputs=execution.outputs_dict or {},
            metadata=metadata,
            error=execution.error or "",
        )

    return results_by_node
|
|
|
|
# ---- Dataset Parsing ----
|
|
|
|
@classmethod
def _parse_dataset(cls, file_content: bytes, filename: str) -> list[EvaluationDatasetInput]:
    """Turn uploaded CSV or XLSX bytes into evaluation dataset items.

    Files whose name ends with ``.csv`` (case-insensitive) go to the CSV
    parser; every other file name is handed to the XLSX parser.

    Accepted column layouts:

    - ``index,<input...>,expected_output`` (legacy single reference)
    - ``index,<input...>,<node title> : expected_output`` (per-node reference)

    For node-scoped columns the *last* ``:`` separates the node title from
    the ``expected_output`` suffix, so titles may contain ``:`` themselves.
    """
    is_csv = filename.lower().endswith(".csv")
    parser = cls._parse_csv_dataset if is_csv else cls._parse_xlsx_dataset
    return parser(file_content)
|
|
|
|
@classmethod
def _parse_xlsx_dataset(cls, xlsx_content: bytes) -> list[EvaluationDatasetInput]:
    """Parse evaluation dataset from XLSX bytes.

    Only the active worksheet is read. The first row is the header row and
    its first column must be ``index`` (case-insensitive).

    :raises EvaluationDatasetInvalidError: If there is no active worksheet,
        fewer than two rows, or the first header is not ``index``.
    """
    wb = load_workbook(io.BytesIO(xlsx_content), read_only=True)
    # A read-only workbook must be closed explicitly; ``finally`` guarantees
    # that also when validation below rejects the file.
    try:
        ws = wb.active
        if ws is None:
            raise EvaluationDatasetInvalidError("XLSX file has no active worksheet.")

        rows = list(ws.iter_rows(values_only=True))
        if len(rows) < 2:
            raise EvaluationDatasetInvalidError("Dataset must have at least a header row and one data row.")

        headers = [str(h).strip() if h is not None else "" for h in rows[0]]
        if not headers or headers[0].lower() != "index":
            raise EvaluationDatasetInvalidError("First column header must be 'index'.")

        return cls._rows_to_dataset_items(rows)
    finally:
        wb.close()
|
|
|
|
@classmethod
def _parse_csv_dataset(cls, csv_content: bytes) -> list[EvaluationDatasetInput]:
    """Build dataset items from CSV bytes.

    The bytes are decoded as ``utf-8-sig`` and must use the same layout as
    the XLSX format: an ``index`` first column, followed by input columns
    and ``expected_output`` / ``<node title> : expected_output`` columns.

    :raises EvaluationDatasetInvalidError: If the bytes cannot be decoded,
        there are fewer than two rows, or the first header is not ``index``.
    """
    try:
        text = csv_content.decode("utf-8-sig")
    except UnicodeDecodeError as e:
        raise EvaluationDatasetInvalidError("CSV file must be UTF-8 encoded.") from e

    rows = list(csv.reader(io.StringIO(text)))
    if len(rows) < 2:
        raise EvaluationDatasetInvalidError("Dataset must have at least a header row and one data row.")

    # Only the first header matters for validation here; an empty header row
    # is treated like a missing ``index`` column.
    header_row = rows[0]
    first_header = ""
    if header_row and header_row[0] is not None:
        first_header = str(header_row[0]).strip()
    if first_header.lower() != "index":
        raise EvaluationDatasetInvalidError("First column header must be 'index'.")

    return cls._rows_to_dataset_items(rows)
|
|
|
|
@classmethod
def _rows_to_dataset_items(cls, rows: list[list[Any] | tuple[Any, ...]]) -> list[EvaluationDatasetInput]:
    """Normalize spreadsheet rows into dataset items with index validation.

    ``rows[0]`` is the header row. Its first column is the index column;
    each remaining column is classified once via
    ``_parse_expected_output_header`` as a plain input, the legacy
    ``expected_output`` column, or a node-scoped expected-output column.

    Data rows whose cells are all ``None`` or blank are skipped. Cell values
    are stored as strings (``None`` becomes ``""``), and cells missing from
    a short row are treated as ``None``.

    :raises EvaluationDatasetInvalidError: If two rows resolve to the same index.
    """
    headers = [str(h).strip() if h is not None else "" for h in rows[0]]

    # Header classification does not depend on the row, so resolve it once
    # instead of re-parsing every header for every data row.
    column_targets = [(header, cls._parse_expected_output_header(header)) for header in headers[1:]]

    items: list[EvaluationDatasetInput] = []
    seen_indices: set[int] = set()
    for row_idx, row in enumerate(rows[1:], start=1):
        values = list(row)
        if all(v is None or str(v).strip() == "" for v in values):
            continue

        # ``values`` is non-empty here (an empty row is skipped above). An
        # index cell that is not an integer falls back to the 1-based
        # position of the data row.
        try:
            index = int(str(values[0]))
        except (TypeError, ValueError):
            index = row_idx

        if index in seen_indices:
            raise EvaluationDatasetInvalidError(f"Dataset index '{index}' is duplicated.")
        seen_indices.add(index)

        inputs: dict[str, Any] = {}
        expected_output: str | None = None
        expected_outputs: dict[str, str] = {}
        for col_idx, (header, expected_output_target) in enumerate(column_targets, start=1):
            val = values[col_idx] if col_idx < len(values) else None
            string_value = str(val) if val is not None else ""

            if expected_output_target is None:
                inputs[header] = string_value
            elif expected_output_target == "":
                # Empty-string marker: the legacy single ``expected_output`` column.
                expected_output = string_value
            else:
                expected_outputs[expected_output_target] = string_value

        items.append(
            EvaluationDatasetInput(
                index=index,
                inputs=inputs,
                expected_output=expected_output,
                expected_outputs=expected_outputs,
            )
        )

    return items
|
|
|
|
@staticmethod
|
|
def _parse_expected_output_header(header: str) -> str | None:
|
|
"""Return the node title for an expected-output column, if any.
|
|
|
|
``expected_output`` keeps the legacy single-reference behaviour and
|
|
returns an empty string marker. Node-specific columns split on the last
|
|
``:`` so titles such as ``RAG: final judge`` remain valid.
|
|
"""
|
|
normalized_header = header.strip()
|
|
if normalized_header == "expected_output":
|
|
return ""
|
|
|
|
if ":" not in normalized_header:
|
|
return None
|
|
|
|
node_title, suffix = normalized_header.rsplit(":", 1)
|
|
if suffix.strip() != "expected_output":
|
|
return None
|
|
|
|
normalized_title = node_title.strip()
|
|
return normalized_title or None
|
|
|
|
@classmethod
def _build_stub_results(
    cls,
    input_list: list[EvaluationDatasetInput],
    run_request: EvaluationRunRequest,
) -> list[EvaluationItemResult]:
    """Produce one synthetic ``EvaluationItemResult`` per dataset item.

    For every item a stub metric is generated for each
    (default metric, node) pair and for each customized output field. A
    judgment is then computed over those metrics. The item's expected output
    is used as ``actual_output`` when present, otherwise a generated
    placeholder string. All metrics and results are tagged ``"stubbed": True``.
    """
    customized = run_request.customized_metrics
    stub_results: list[EvaluationItemResult] = []

    for position, dataset_item in enumerate(input_list):
        item_metrics: list[EvaluationMetric] = []

        for metric_idx, metric_cfg in enumerate(run_request.default_metrics):
            # Fall back to the mapped value type when none is declared.
            declared_type = metric_cfg.value_type or METRIC_VALUE_TYPE_MAPPING.get(metric_cfg.metric, "")
            item_metrics.extend(
                [
                    EvaluationMetric(
                        name=metric_cfg.metric,
                        value=cls._build_stub_metric_value(
                            item_index=dataset_item.index,
                            metric_position=metric_idx,
                            node_position=node_idx,
                            value_type=declared_type,
                            metric_name=metric_cfg.metric,
                        ),
                        details={
                            "stubbed": True,
                            "source": "console-evaluation-run",
                            "value_type": declared_type,
                        },
                        node_info=node,
                    )
                    for node_idx, node in enumerate(metric_cfg.node_info_list)
                ]
            )

        if customized:
            for field_idx, field in enumerate(customized.output_fields):
                # The metric position is the number of metrics collected so
                # far, so it advances with every customized field appended.
                stub_value = cls._build_stub_metric_value(
                    item_index=dataset_item.index,
                    metric_position=len(item_metrics),
                    node_position=field_idx,
                    value_type=field.value_type,
                    metric_name=field.variable,
                )
                customized_node = NodeInfo(
                    node_id=customized.evaluation_workflow_id,
                    type="customized",
                    title="customized",
                )
                item_metrics.append(
                    EvaluationMetric(
                        name=field.variable,
                        value=stub_value,
                        details={
                            "stubbed": True,
                            "source": "console-evaluation-run",
                            "value_type": field.value_type,
                            "customized": True,
                        },
                        node_info=customized_node,
                    )
                )

        item_judgment = cls._evaluate_stub_judgment(item_metrics, run_request.judgment_config)

        if isinstance(dataset_item, EvaluationDatasetInput):
            reference_output = dataset_item.get_expected_output_for_node(None)
        else:
            reference_output = getattr(dataset_item, "expected_output", None)
        if not reference_output:
            reference_output = cls._build_stub_output(position, dataset_item.inputs)

        stub_results.append(
            EvaluationItemResult(
                index=dataset_item.index,
                actual_output=reference_output,
                metrics=item_metrics,
                metadata={
                    "stubbed": True,
                    "source": "console-evaluation-run",
                    "row": position + 1,
                },
                judgment=item_judgment,
            )
        )

    return stub_results
|
|
|
|
@staticmethod
|
|
def _build_stub_output(item_position: int, inputs: dict[str, Any]) -> str:
|
|
"""Build a readable synthetic output for one dataset row."""
|
|
first_key = next(iter(inputs.keys()), "input")
|
|
first_value = inputs.get(first_key, "")
|
|
return f"Stub output #{item_position + 1}: processed {first_key}={first_value}"
|
|
|
|
@staticmethod
|
|
def _build_stub_metric_value(
|
|
item_index: int,
|
|
metric_position: int,
|
|
node_position: int,
|
|
value_type: str,
|
|
metric_name: str,
|
|
) -> Any:
|
|
"""Return a deterministic placeholder metric value by declared type."""
|
|
base_seed = item_index + metric_position + node_position
|
|
normalized_type = value_type.lower()
|
|
|
|
if normalized_type == "number" or not normalized_type:
|
|
return round(0.72 + (base_seed % 18) / 100, 3)
|
|
if normalized_type == "boolean":
|
|
return base_seed % 2 == 0
|
|
|
|
return f"stub-{metric_name}-{item_index}"
|
|
|
|
@staticmethod
def _evaluate_stub_judgment(
    metrics: list[EvaluationMetric],
    judgment_config: JudgmentConfig | None,
) -> JudgmentResult:
    """Judge stub metrics by delegating to ``JudgmentProcessor.evaluate``.

    Without a config, or with a config that has no conditions, a default
    ``JudgmentResult()`` is returned. Otherwise metric values are keyed by
    ``(node_id, metric name)``; metrics lacking ``node_info`` are left out.
    """
    if judgment_config and judgment_config.conditions:
        values_by_key: dict[tuple[str, str], object] = {}
        for metric in metrics:
            if metric.node_info:
                values_by_key[(metric.node_info.node_id, metric.name)] = metric.value
        return JudgmentProcessor.evaluate(values_by_key, judgment_config)

    return JudgmentResult()
|
|
|
|
@classmethod
def execute_retrieval_test_targets(
    cls,
    dataset_id: str,
    account_id: str,
    input_list: list[EvaluationDatasetInput],
    max_workers: int = 5,
) -> list[NodeRunResult]:
    """Hit-test a knowledge base once per input item using a thread pool.

    The ``query`` entry of each item's ``inputs`` (empty string when absent)
    is sent to ``HitTestingService.retrieve``. Returned records that carry
    content are reduced to ``content``/``score`` pairs:

    .. code-block:: python

        NodeRunResult(
            inputs={"query": "..."},
            outputs={"result": [{"content": "...", "score": ...}, ...]},
        )

    :returns: One ``NodeRunResult`` per input item, in input order. When a
        worker raises, the error is logged and that item's slot holds
        ``NodeRunResult(inputs={}, outputs={"result": []})``.
    """
    from concurrent.futures import ThreadPoolExecutor

    from flask import current_app

    flask_app = current_app._get_current_object()  # type: ignore

    def _worker(item: EvaluationDatasetInput) -> NodeRunResult:
        # Each worker thread enters the captured Flask app's context itself.
        with flask_app.app_context():
            from extensions.ext_database import db as flask_db
            from models.account import Account
            from models.dataset import Dataset
            from services.hit_testing_service import HitTestingService

            dataset_row = flask_db.session.query(Dataset).filter_by(id=dataset_id).first()
            if not dataset_row:
                raise ValueError(f"Dataset {dataset_id} not found")

            account_row = flask_db.session.query(Account).filter_by(id=account_id).first()
            if not account_row:
                raise ValueError(f"Account {account_id} not found")

            query_text = str(item.inputs.get("query", ""))
            hit_response = HitTestingService.retrieve(
                dataset=dataset_row,
                query=query_text,
                account=account_row,
                retrieval_model=None,  # Use dataset's configured retrieval model
                external_retrieval_model={},
                limit=10,
            )

            hits = []
            for record in hit_response.get("records", []):
                # Prefer the nested segment content, then the record's own
                # content; records with neither are skipped.
                content = record.get("segment", {}).get("content") or record.get("content")
                if content:
                    hits.append({"content": content, "score": record.get("score")})

            return NodeRunResult(
                inputs={"query": query_text},
                outputs={"result": hits},
            )

    with ThreadPoolExecutor(max_workers=max_workers) as executor:
        pending = [executor.submit(_worker, entry) for entry in input_list]
        collected: list[NodeRunResult] = []
        # Futures are consumed in submission order so results line up with input_list.
        for entry, future in zip(input_list, pending):
            try:
                outcome = future.result()
            except Exception:
                logger.exception("Retrieval test failed for item %d (dataset=%s)", entry.index, dataset_id)
                outcome = NodeRunResult(inputs={}, outputs={"result": []})
            collected.append(outcome)

    return collected
|