mirror of
https://github.com/langgenius/dify.git
synced 2026-05-13 08:57:28 +08:00
feat: Implement customized evaluation in BaseEvaluationInstance.
This commit is contained in:
parent
7149af3dac
commit
b160dce4db
@ -21,7 +21,7 @@ class BaseEvaluationInstance(ABC):
|
|||||||
def evaluate_llm(
|
def evaluate_llm(
|
||||||
self,
|
self,
|
||||||
items: list[EvaluationItemInput],
|
items: list[EvaluationItemInput],
|
||||||
metrics_config: dict,
|
default_metrics: list[dict[str, Any]],
|
||||||
model_provider: str,
|
model_provider: str,
|
||||||
model_name: str,
|
model_name: str,
|
||||||
tenant_id: str,
|
tenant_id: str,
|
||||||
@ -33,7 +33,7 @@ class BaseEvaluationInstance(ABC):
|
|||||||
def evaluate_retrieval(
|
def evaluate_retrieval(
|
||||||
self,
|
self,
|
||||||
items: list[EvaluationItemInput],
|
items: list[EvaluationItemInput],
|
||||||
metrics_config: dict,
|
default_metrics: list[dict[str, Any]],
|
||||||
model_provider: str,
|
model_provider: str,
|
||||||
model_name: str,
|
model_name: str,
|
||||||
tenant_id: str,
|
tenant_id: str,
|
||||||
@ -45,7 +45,7 @@ class BaseEvaluationInstance(ABC):
|
|||||||
def evaluate_agent(
|
def evaluate_agent(
|
||||||
self,
|
self,
|
||||||
items: list[EvaluationItemInput],
|
items: list[EvaluationItemInput],
|
||||||
metrics_config: dict,
|
default_metrics: list[dict[str, Any]],
|
||||||
model_provider: str,
|
model_provider: str,
|
||||||
model_name: str,
|
model_name: str,
|
||||||
tenant_id: str,
|
tenant_id: str,
|
||||||
@ -57,7 +57,7 @@ class BaseEvaluationInstance(ABC):
|
|||||||
def evaluate_workflow(
|
def evaluate_workflow(
|
||||||
self,
|
self,
|
||||||
items: list[EvaluationItemInput],
|
items: list[EvaluationItemInput],
|
||||||
metrics_config: dict,
|
default_metrics: list[dict[str, Any]],
|
||||||
model_provider: str,
|
model_provider: str,
|
||||||
model_name: str,
|
model_name: str,
|
||||||
tenant_id: str,
|
tenant_id: str,
|
||||||
@ -74,7 +74,7 @@ class BaseEvaluationInstance(ABC):
|
|||||||
self,
|
self,
|
||||||
items: list[EvaluationItemInput],
|
items: list[EvaluationItemInput],
|
||||||
results: list[EvaluationItemResult],
|
results: list[EvaluationItemResult],
|
||||||
metrics_config: dict,
|
customized_metrics: dict[str, Any],
|
||||||
tenant_id: str,
|
tenant_id: str,
|
||||||
) -> list[EvaluationItemResult]:
|
) -> list[EvaluationItemResult]:
|
||||||
"""Evaluate using a published workflow as the evaluator.
|
"""Evaluate using a published workflow as the evaluator.
|
||||||
@ -86,8 +86,8 @@ class BaseEvaluationInstance(ABC):
|
|||||||
Args:
|
Args:
|
||||||
items: Evaluation items with inputs, expected_output, context.
|
items: Evaluation items with inputs, expected_output, context.
|
||||||
results: Results from Phase 1 (with actual_output populated).
|
results: Results from Phase 1 (with actual_output populated).
|
||||||
metrics_config: Must contain ``workflow_id`` pointing to a
|
customized_metrics: Must contain ``evaluation_workflow_id``
|
||||||
published WORKFLOW-type App.
|
pointing to a published WORKFLOW-type App.
|
||||||
tenant_id: Tenant scope.
|
tenant_id: Tenant scope.
|
||||||
|
|
||||||
Returns:
|
Returns:
|
||||||
@ -103,10 +103,10 @@ class BaseEvaluationInstance(ABC):
|
|||||||
from models.model import App
|
from models.model import App
|
||||||
from services.workflow_service import WorkflowService
|
from services.workflow_service import WorkflowService
|
||||||
|
|
||||||
workflow_id = metrics_config.get("workflow_id")
|
workflow_id = customized_metrics.get("evaluation_workflow_id")
|
||||||
if not workflow_id:
|
if not workflow_id:
|
||||||
raise ValueError(
|
raise ValueError(
|
||||||
"metrics_config must contain 'workflow_id' for customized evaluator"
|
"customized_metrics must contain 'evaluation_workflow_id' for customized evaluator"
|
||||||
)
|
)
|
||||||
|
|
||||||
# Load the evaluator workflow resources using a dedicated session
|
# Load the evaluator workflow resources using a dedicated session
|
||||||
|
|||||||
@ -42,51 +42,51 @@ class RagasEvaluator(BaseEvaluationInstance):
|
|||||||
def evaluate_llm(
|
def evaluate_llm(
|
||||||
self,
|
self,
|
||||||
items: list[EvaluationItemInput],
|
items: list[EvaluationItemInput],
|
||||||
metrics_config: dict,
|
default_metrics: list[dict[str, Any]],
|
||||||
model_provider: str,
|
model_provider: str,
|
||||||
model_name: str,
|
model_name: str,
|
||||||
tenant_id: str,
|
tenant_id: str,
|
||||||
) -> list[EvaluationItemResult]:
|
) -> list[EvaluationItemResult]:
|
||||||
return self._evaluate(items, metrics_config, model_provider, model_name, tenant_id, EvaluationCategory.LLM)
|
return self._evaluate(items, default_metrics, model_provider, model_name, tenant_id, EvaluationCategory.LLM)
|
||||||
|
|
||||||
def evaluate_retrieval(
|
def evaluate_retrieval(
|
||||||
self,
|
self,
|
||||||
items: list[EvaluationItemInput],
|
items: list[EvaluationItemInput],
|
||||||
metrics_config: dict,
|
default_metrics: list[dict[str, Any]],
|
||||||
model_provider: str,
|
model_provider: str,
|
||||||
model_name: str,
|
model_name: str,
|
||||||
tenant_id: str,
|
tenant_id: str,
|
||||||
) -> list[EvaluationItemResult]:
|
) -> list[EvaluationItemResult]:
|
||||||
return self._evaluate(
|
return self._evaluate(
|
||||||
items, metrics_config, model_provider, model_name, tenant_id, EvaluationCategory.RETRIEVAL
|
items, default_metrics, model_provider, model_name, tenant_id, EvaluationCategory.RETRIEVAL
|
||||||
)
|
)
|
||||||
|
|
||||||
def evaluate_agent(
|
def evaluate_agent(
|
||||||
self,
|
self,
|
||||||
items: list[EvaluationItemInput],
|
items: list[EvaluationItemInput],
|
||||||
metrics_config: dict,
|
default_metrics: list[dict[str, Any]],
|
||||||
model_provider: str,
|
model_provider: str,
|
||||||
model_name: str,
|
model_name: str,
|
||||||
tenant_id: str,
|
tenant_id: str,
|
||||||
) -> list[EvaluationItemResult]:
|
) -> list[EvaluationItemResult]:
|
||||||
return self._evaluate(items, metrics_config, model_provider, model_name, tenant_id, EvaluationCategory.AGENT)
|
return self._evaluate(items, default_metrics, model_provider, model_name, tenant_id, EvaluationCategory.AGENT)
|
||||||
|
|
||||||
def evaluate_workflow(
|
def evaluate_workflow(
|
||||||
self,
|
self,
|
||||||
items: list[EvaluationItemInput],
|
items: list[EvaluationItemInput],
|
||||||
metrics_config: dict,
|
default_metrics: list[dict[str, Any]],
|
||||||
model_provider: str,
|
model_provider: str,
|
||||||
model_name: str,
|
model_name: str,
|
||||||
tenant_id: str,
|
tenant_id: str,
|
||||||
) -> list[EvaluationItemResult]:
|
) -> list[EvaluationItemResult]:
|
||||||
return self._evaluate(
|
return self._evaluate(
|
||||||
items, metrics_config, model_provider, model_name, tenant_id, EvaluationCategory.WORKFLOW
|
items, default_metrics, model_provider, model_name, tenant_id, EvaluationCategory.WORKFLOW
|
||||||
)
|
)
|
||||||
|
|
||||||
def _evaluate(
|
def _evaluate(
|
||||||
self,
|
self,
|
||||||
items: list[EvaluationItemInput],
|
items: list[EvaluationItemInput],
|
||||||
metrics_config: dict,
|
default_metrics: list[dict[str, Any]],
|
||||||
model_provider: str,
|
model_provider: str,
|
||||||
model_name: str,
|
model_name: str,
|
||||||
tenant_id: str,
|
tenant_id: str,
|
||||||
@ -98,7 +98,12 @@ class RagasEvaluator(BaseEvaluationInstance):
|
|||||||
string similarity if RAGAS import fails.
|
string similarity if RAGAS import fails.
|
||||||
"""
|
"""
|
||||||
model_wrapper = DifyModelWrapper(model_provider, model_name, tenant_id)
|
model_wrapper = DifyModelWrapper(model_provider, model_name, tenant_id)
|
||||||
requested_metrics = metrics_config.get("metrics", self.get_supported_metrics(category))
|
# Extract metric names from default_metrics list; each item has a "metric" key.
|
||||||
|
requested_metrics = (
|
||||||
|
[m["metric"] for m in default_metrics if "metric" in m]
|
||||||
|
if default_metrics
|
||||||
|
else self.get_supported_metrics(category)
|
||||||
|
)
|
||||||
|
|
||||||
try:
|
try:
|
||||||
return self._evaluate_with_ragas(items, requested_metrics, model_wrapper, category)
|
return self._evaluate_with_ragas(items, requested_metrics, model_wrapper, category)
|
||||||
@ -116,11 +121,6 @@ class RagasEvaluator(BaseEvaluationInstance):
|
|||||||
"""Evaluate using RAGAS library."""
|
"""Evaluate using RAGAS library."""
|
||||||
from ragas import evaluate as ragas_evaluate
|
from ragas import evaluate as ragas_evaluate
|
||||||
from ragas.dataset_schema import EvaluationDataset, SingleTurnSample
|
from ragas.dataset_schema import EvaluationDataset, SingleTurnSample
|
||||||
from ragas.llms import LangchainLLMWrapper
|
|
||||||
from ragas.metrics import (
|
|
||||||
Faithfulness,
|
|
||||||
ResponseRelevancy,
|
|
||||||
)
|
|
||||||
|
|
||||||
# Build RAGAS dataset
|
# Build RAGAS dataset
|
||||||
samples = []
|
samples = []
|
||||||
|
|||||||
@ -1,5 +1,6 @@
|
|||||||
import logging
|
import logging
|
||||||
from typing import Any, Mapping, Union
|
from collections.abc import Mapping
|
||||||
|
from typing import Any
|
||||||
|
|
||||||
from sqlalchemy.orm import Session
|
from sqlalchemy.orm import Session
|
||||||
|
|
||||||
@ -9,7 +10,7 @@ from core.evaluation.entities.evaluation_entity import (
|
|||||||
EvaluationItemResult,
|
EvaluationItemResult,
|
||||||
)
|
)
|
||||||
from core.evaluation.runners.base_evaluation_runner import BaseEvaluationRunner
|
from core.evaluation.runners.base_evaluation_runner import BaseEvaluationRunner
|
||||||
from models.model import App, AppMode
|
from models.model import App
|
||||||
|
|
||||||
logger = logging.getLogger(__name__)
|
logger = logging.getLogger(__name__)
|
||||||
|
|
||||||
@ -29,8 +30,8 @@ class AgentEvaluationRunner(BaseEvaluationRunner):
|
|||||||
) -> EvaluationItemResult:
|
) -> EvaluationItemResult:
|
||||||
"""Execute agent app and collect response with tool call information."""
|
"""Execute agent app and collect response with tool call information."""
|
||||||
from core.app.apps.agent_chat.app_generator import AgentChatAppGenerator
|
from core.app.apps.agent_chat.app_generator import AgentChatAppGenerator
|
||||||
from core.evaluation.runners import get_service_account_for_app
|
|
||||||
from core.app.entities.app_invoke_entities import InvokeFrom
|
from core.app.entities.app_invoke_entities import InvokeFrom
|
||||||
|
from core.evaluation.runners import get_service_account_for_app
|
||||||
|
|
||||||
app = self.session.query(App).filter_by(id=target_id).first()
|
app = self.session.query(App).filter_by(id=target_id).first()
|
||||||
if not app:
|
if not app:
|
||||||
@ -67,7 +68,7 @@ class AgentEvaluationRunner(BaseEvaluationRunner):
|
|||||||
self,
|
self,
|
||||||
items: list[EvaluationItemInput],
|
items: list[EvaluationItemInput],
|
||||||
results: list[EvaluationItemResult],
|
results: list[EvaluationItemResult],
|
||||||
metrics_config: dict,
|
default_metrics: list[dict[str, Any]],
|
||||||
model_provider: str,
|
model_provider: str,
|
||||||
model_name: str,
|
model_name: str,
|
||||||
tenant_id: str,
|
tenant_id: str,
|
||||||
@ -90,7 +91,7 @@ class AgentEvaluationRunner(BaseEvaluationRunner):
|
|||||||
)
|
)
|
||||||
|
|
||||||
evaluated = self.evaluation_instance.evaluate_agent(
|
evaluated = self.evaluation_instance.evaluate_agent(
|
||||||
merged_items, metrics_config, model_provider, model_name, tenant_id
|
merged_items, default_metrics, model_provider, model_name, tenant_id
|
||||||
)
|
)
|
||||||
|
|
||||||
# Merge metrics back preserving metadata
|
# Merge metrics back preserving metadata
|
||||||
|
|||||||
@ -51,7 +51,7 @@ class BaseEvaluationRunner(ABC):
|
|||||||
self,
|
self,
|
||||||
items: list[EvaluationItemInput],
|
items: list[EvaluationItemInput],
|
||||||
results: list[EvaluationItemResult],
|
results: list[EvaluationItemResult],
|
||||||
metrics_config: dict,
|
default_metrics: list[dict[str, Any]],
|
||||||
model_provider: str,
|
model_provider: str,
|
||||||
model_name: str,
|
model_name: str,
|
||||||
tenant_id: str,
|
tenant_id: str,
|
||||||
@ -66,9 +66,10 @@ class BaseEvaluationRunner(ABC):
|
|||||||
target_id: str,
|
target_id: str,
|
||||||
target_type: str,
|
target_type: str,
|
||||||
items: list[EvaluationItemInput],
|
items: list[EvaluationItemInput],
|
||||||
metrics_config: dict,
|
default_metrics: list[dict[str, Any]],
|
||||||
model_provider: str,
|
customized_metrics: dict[str, Any] | None = None,
|
||||||
model_name: str,
|
model_provider: str = "",
|
||||||
|
model_name: str = "",
|
||||||
judgment_config: JudgmentConfig | None = None,
|
judgment_config: JudgmentConfig | None = None,
|
||||||
) -> list[EvaluationItemResult]:
|
) -> list[EvaluationItemResult]:
|
||||||
"""Orchestrate target execution + metric evaluation + judgment for all items."""
|
"""Orchestrate target execution + metric evaluation + judgment for all items."""
|
||||||
@ -106,13 +107,15 @@ class BaseEvaluationRunner(ABC):
|
|||||||
|
|
||||||
if successful_items and successful_results:
|
if successful_items and successful_results:
|
||||||
try:
|
try:
|
||||||
if _is_customized_evaluation(metrics_config):
|
if customized_metrics is not None:
|
||||||
|
# Customized workflow evaluation — target-type agnostic
|
||||||
evaluated_results = self._evaluate_customized(
|
evaluated_results = self._evaluate_customized(
|
||||||
successful_items, successful_results, metrics_config, tenant_id,
|
successful_items, successful_results, customized_metrics, tenant_id,
|
||||||
)
|
)
|
||||||
else:
|
else:
|
||||||
|
# Framework-specific evaluation — delegate to subclass
|
||||||
evaluated_results = self.evaluate_metrics(
|
evaluated_results = self.evaluate_metrics(
|
||||||
successful_items, successful_results, metrics_config,
|
successful_items, successful_results, default_metrics,
|
||||||
model_provider, model_name, tenant_id,
|
model_provider, model_name, tenant_id,
|
||||||
)
|
)
|
||||||
# Merge evaluated metrics back into results
|
# Merge evaluated metrics back into results
|
||||||
@ -153,12 +156,18 @@ class BaseEvaluationRunner(ABC):
|
|||||||
self,
|
self,
|
||||||
items: list[EvaluationItemInput],
|
items: list[EvaluationItemInput],
|
||||||
results: list[EvaluationItemResult],
|
results: list[EvaluationItemResult],
|
||||||
metrics_config: dict,
|
customized_metrics: dict[str, Any],
|
||||||
tenant_id: str,
|
tenant_id: str,
|
||||||
) -> list[EvaluationItemResult]:
|
) -> list[EvaluationItemResult]:
|
||||||
"""Delegate to the instance's customized workflow evaluator."""
|
"""Delegate to the instance's customized workflow evaluator.
|
||||||
|
|
||||||
|
Unlike the framework path (which merges ``actual_output`` into
|
||||||
|
``context``), here we pass ``results`` directly — the instance's
|
||||||
|
``evaluate_with_customized_workflow()`` reads ``actual_output``
|
||||||
|
from each ``EvaluationItemResult``.
|
||||||
|
"""
|
||||||
evaluated = self.evaluation_instance.evaluate_with_customized_workflow(
|
evaluated = self.evaluation_instance.evaluate_with_customized_workflow(
|
||||||
items, results, metrics_config, tenant_id,
|
items, results, customized_metrics, tenant_id,
|
||||||
)
|
)
|
||||||
|
|
||||||
# Merge metrics back preserving actual_output and metadata from Phase 1
|
# Merge metrics back preserving actual_output and metadata from Phase 1
|
||||||
@ -180,7 +189,6 @@ class BaseEvaluationRunner(ABC):
|
|||||||
final_results.append(result)
|
final_results.append(result)
|
||||||
return final_results
|
return final_results
|
||||||
|
|
||||||
|
|
||||||
@staticmethod
|
@staticmethod
|
||||||
def _apply_judgment(
|
def _apply_judgment(
|
||||||
results: list[EvaluationItemResult],
|
results: list[EvaluationItemResult],
|
||||||
@ -225,8 +233,3 @@ class BaseEvaluationRunner(ABC):
|
|||||||
result.model_copy(update={"judgment": judgment_result})
|
result.model_copy(update={"judgment": judgment_result})
|
||||||
)
|
)
|
||||||
return judged_results
|
return judged_results
|
||||||
|
|
||||||
|
|
||||||
def _is_customized_evaluation(metrics_config: dict[str, Any]) -> bool:
|
|
||||||
"""Check if metrics_config indicates a customized workflow evaluation."""
|
|
||||||
return bool(metrics_config.get("workflow_id"))
|
|
||||||
|
|||||||
@ -1,5 +1,6 @@
|
|||||||
import logging
|
import logging
|
||||||
from typing import Any, Mapping, Union
|
from collections.abc import Mapping
|
||||||
|
from typing import Any, Union
|
||||||
|
|
||||||
from sqlalchemy.orm import Session
|
from sqlalchemy.orm import Session
|
||||||
|
|
||||||
@ -30,8 +31,8 @@ class LLMEvaluationRunner(BaseEvaluationRunner):
|
|||||||
"""Execute the App/Snippet with the given inputs and collect the response."""
|
"""Execute the App/Snippet with the given inputs and collect the response."""
|
||||||
from core.app.apps.completion.app_generator import CompletionAppGenerator
|
from core.app.apps.completion.app_generator import CompletionAppGenerator
|
||||||
from core.app.apps.workflow.app_generator import WorkflowAppGenerator
|
from core.app.apps.workflow.app_generator import WorkflowAppGenerator
|
||||||
from core.evaluation.runners import get_service_account_for_app
|
|
||||||
from core.app.entities.app_invoke_entities import InvokeFrom
|
from core.app.entities.app_invoke_entities import InvokeFrom
|
||||||
|
from core.evaluation.runners import get_service_account_for_app
|
||||||
from services.workflow_service import WorkflowService
|
from services.workflow_service import WorkflowService
|
||||||
|
|
||||||
app = self.session.query(App).filter_by(id=target_id).first()
|
app = self.session.query(App).filter_by(id=target_id).first()
|
||||||
@ -89,7 +90,7 @@ class LLMEvaluationRunner(BaseEvaluationRunner):
|
|||||||
self,
|
self,
|
||||||
items: list[EvaluationItemInput],
|
items: list[EvaluationItemInput],
|
||||||
results: list[EvaluationItemResult],
|
results: list[EvaluationItemResult],
|
||||||
metrics_config: dict,
|
default_metrics: list[dict[str, Any]],
|
||||||
model_provider: str,
|
model_provider: str,
|
||||||
model_name: str,
|
model_name: str,
|
||||||
tenant_id: str,
|
tenant_id: str,
|
||||||
@ -98,7 +99,7 @@ class LLMEvaluationRunner(BaseEvaluationRunner):
|
|||||||
# Merge actual_output into items for evaluation
|
# Merge actual_output into items for evaluation
|
||||||
merged_items = self._merge_results_into_items(items, results)
|
merged_items = self._merge_results_into_items(items, results)
|
||||||
return self.evaluation_instance.evaluate_llm(
|
return self.evaluation_instance.evaluate_llm(
|
||||||
merged_items, metrics_config, model_provider, model_name, tenant_id
|
merged_items, default_metrics, model_provider, model_name, tenant_id
|
||||||
)
|
)
|
||||||
|
|
||||||
@staticmethod
|
@staticmethod
|
||||||
|
|||||||
@ -58,7 +58,7 @@ class RetrievalEvaluationRunner(BaseEvaluationRunner):
|
|||||||
self,
|
self,
|
||||||
items: list[EvaluationItemInput],
|
items: list[EvaluationItemInput],
|
||||||
results: list[EvaluationItemResult],
|
results: list[EvaluationItemResult],
|
||||||
metrics_config: dict,
|
default_metrics: list[dict[str, Any]],
|
||||||
model_provider: str,
|
model_provider: str,
|
||||||
model_name: str,
|
model_name: str,
|
||||||
tenant_id: str,
|
tenant_id: str,
|
||||||
@ -80,7 +80,7 @@ class RetrievalEvaluationRunner(BaseEvaluationRunner):
|
|||||||
)
|
)
|
||||||
|
|
||||||
evaluated = self.evaluation_instance.evaluate_retrieval(
|
evaluated = self.evaluation_instance.evaluate_retrieval(
|
||||||
merged_items, metrics_config, model_provider, model_name, tenant_id
|
merged_items, default_metrics, model_provider, model_name, tenant_id
|
||||||
)
|
)
|
||||||
|
|
||||||
# Merge metrics back into original results (preserve actual_output and metadata)
|
# Merge metrics back into original results (preserve actual_output and metadata)
|
||||||
|
|||||||
@ -1,5 +1,6 @@
|
|||||||
import logging
|
import logging
|
||||||
from typing import Any, Mapping
|
from collections.abc import Mapping
|
||||||
|
from typing import Any
|
||||||
|
|
||||||
from sqlalchemy.orm import Session
|
from sqlalchemy.orm import Session
|
||||||
|
|
||||||
@ -29,8 +30,8 @@ class WorkflowEvaluationRunner(BaseEvaluationRunner):
|
|||||||
) -> EvaluationItemResult:
|
) -> EvaluationItemResult:
|
||||||
"""Execute workflow and collect outputs."""
|
"""Execute workflow and collect outputs."""
|
||||||
from core.app.apps.workflow.app_generator import WorkflowAppGenerator
|
from core.app.apps.workflow.app_generator import WorkflowAppGenerator
|
||||||
from core.evaluation.runners import get_service_account_for_app
|
|
||||||
from core.app.entities.app_invoke_entities import InvokeFrom
|
from core.app.entities.app_invoke_entities import InvokeFrom
|
||||||
|
from core.evaluation.runners import get_service_account_for_app
|
||||||
from services.workflow_service import WorkflowService
|
from services.workflow_service import WorkflowService
|
||||||
|
|
||||||
app = self.session.query(App).filter_by(id=target_id).first()
|
app = self.session.query(App).filter_by(id=target_id).first()
|
||||||
@ -68,7 +69,7 @@ class WorkflowEvaluationRunner(BaseEvaluationRunner):
|
|||||||
self,
|
self,
|
||||||
items: list[EvaluationItemInput],
|
items: list[EvaluationItemInput],
|
||||||
results: list[EvaluationItemResult],
|
results: list[EvaluationItemResult],
|
||||||
metrics_config: dict,
|
default_metrics: list[dict[str, Any]],
|
||||||
model_provider: str,
|
model_provider: str,
|
||||||
model_name: str,
|
model_name: str,
|
||||||
tenant_id: str,
|
tenant_id: str,
|
||||||
@ -91,7 +92,7 @@ class WorkflowEvaluationRunner(BaseEvaluationRunner):
|
|||||||
)
|
)
|
||||||
|
|
||||||
evaluated = self.evaluation_instance.evaluate_workflow(
|
evaluated = self.evaluation_instance.evaluate_workflow(
|
||||||
merged_items, metrics_config, model_provider, model_name, tenant_id
|
merged_items, default_metrics, model_provider, model_name, tenant_id
|
||||||
)
|
)
|
||||||
|
|
||||||
# Merge metrics back preserving metadata
|
# Merge metrics back preserving metadata
|
||||||
|
|||||||
@ -1,8 +1,6 @@
|
|||||||
import io
|
import io
|
||||||
import json
|
import json
|
||||||
import logging
|
import logging
|
||||||
from configs import dify_config
|
|
||||||
from models.model import UploadFile
|
|
||||||
from typing import Any
|
from typing import Any
|
||||||
|
|
||||||
from celery import shared_task
|
from celery import shared_task
|
||||||
@ -10,6 +8,7 @@ from openpyxl import Workbook
|
|||||||
from openpyxl.styles import Alignment, Border, Font, PatternFill, Side
|
from openpyxl.styles import Alignment, Border, Font, PatternFill, Side
|
||||||
from openpyxl.utils import get_column_letter
|
from openpyxl.utils import get_column_letter
|
||||||
|
|
||||||
|
from configs import dify_config
|
||||||
from core.evaluation.entities.evaluation_entity import (
|
from core.evaluation.entities.evaluation_entity import (
|
||||||
EvaluationCategory,
|
EvaluationCategory,
|
||||||
EvaluationItemResult,
|
EvaluationItemResult,
|
||||||
@ -23,6 +22,7 @@ from core.evaluation.runners.workflow_evaluation_runner import WorkflowEvaluatio
|
|||||||
from extensions.ext_database import db
|
from extensions.ext_database import db
|
||||||
from libs.datetime_utils import naive_utc_now
|
from libs.datetime_utils import naive_utc_now
|
||||||
from models.evaluation import EvaluationRun, EvaluationRunStatus
|
from models.evaluation import EvaluationRun, EvaluationRunStatus
|
||||||
|
from models.model import UploadFile
|
||||||
|
|
||||||
logger = logging.getLogger(__name__)
|
logger = logging.getLogger(__name__)
|
||||||
|
|
||||||
@ -86,6 +86,7 @@ def _execute_evaluation(session: Any, run_data: EvaluationRunData) -> None:
|
|||||||
customized_metrics=run_data.customized_metrics,
|
customized_metrics=run_data.customized_metrics,
|
||||||
model_provider=run_data.evaluation_model_provider,
|
model_provider=run_data.evaluation_model_provider,
|
||||||
model_name=run_data.evaluation_model,
|
model_name=run_data.evaluation_model,
|
||||||
|
judgment_config=run_data.judgment_config,
|
||||||
)
|
)
|
||||||
|
|
||||||
# Compute summary metrics
|
# Compute summary metrics
|
||||||
@ -210,7 +211,13 @@ def _generate_result_xlsx(
|
|||||||
input_keys.append(key)
|
input_keys.append(key)
|
||||||
|
|
||||||
# Build headers
|
# Build headers
|
||||||
headers = ["index"] + input_keys + ["expected_output", "actual_output"] + all_metric_names + ["overall_score", "error"]
|
headers = (
|
||||||
|
["index"]
|
||||||
|
+ input_keys
|
||||||
|
+ ["expected_output", "actual_output"]
|
||||||
|
+ all_metric_names
|
||||||
|
+ ["overall_score", "error"]
|
||||||
|
)
|
||||||
|
|
||||||
# Write header row
|
# Write header row
|
||||||
for col_idx, header in enumerate(headers, start=1):
|
for col_idx, header in enumerate(headers, start=1):
|
||||||
|
|||||||
Loading…
Reference in New Issue
Block a user