mirror of
https://github.com/langgenius/dify.git
synced 2026-05-11 23:18:39 +08:00
evaluation runtime
This commit is contained in:
parent
61e87a4ff4
commit
8ea3729fe9
@ -126,7 +126,7 @@ class RagasEvaluator(BaseEvaluationInstance):
|
||||
samples = []
|
||||
for item in items:
|
||||
sample = SingleTurnSample(
|
||||
user_input=self._inputs_to_query(item.inputs),
|
||||
user_input=self._inputs_format(item.inputs, category),
|
||||
response=item.expected_output or "",
|
||||
retrieved_contexts=item.context or [],
|
||||
)
|
||||
@ -233,14 +233,15 @@ class RagasEvaluator(BaseEvaluationInstance):
|
||||
return 0.0
|
||||
|
||||
@staticmethod
|
||||
def _inputs_to_query(inputs: dict[str, Any]) -> str:
|
||||
"""Convert input dict to a query string."""
|
||||
if "query" in inputs:
|
||||
return str(inputs["query"])
|
||||
if "question" in inputs:
|
||||
return str(inputs["question"])
|
||||
# Fallback: concatenate all input values
|
||||
return " ".join(str(v) for v in inputs.values())
|
||||
def _inputs_format(inputs: dict[str, Any], category: EvaluationCategory) -> str:
|
||||
"""Convert input dict to a prompt string."""
|
||||
match category:
|
||||
case EvaluationCategory.LLM:
|
||||
return str(inputs["prompt"])
|
||||
case EvaluationCategory.RETRIEVAL:
|
||||
return str(inputs["query"])
|
||||
case _:
|
||||
return ""
|
||||
|
||||
@staticmethod
|
||||
def _build_ragas_metrics(requested_metrics: list[str]) -> list[Any]:
|
||||
|
||||
@ -75,17 +75,45 @@ class LLMEvaluationRunner(BaseEvaluationRunner):
|
||||
def _merge_results_into_items(
    items: list[NodeRunResult],
) -> list[EvaluationItemInput]:
    """Create new items from NodeRunResult for ragas evaluation.

    Extracts prompts from process_data and concatenates them into a single
    string with role prefixes (e.g. "system: ...\nuser: ...\nassistant: ...").
    The last assistant message in outputs is used as the actual output.
    """
    return [
        EvaluationItemInput(
            index=position,
            inputs={"prompt": _format_prompts(item.process_data.get("prompts", []))},
            output=_extract_llm_output(item.outputs),
        )
        for position, item in enumerate(items)
    ]
|
||||
|
||||
|
||||
def _format_prompts(prompts: list[dict[str, Any]]) -> str:
|
||||
"""Concatenate a list of prompt messages into a single string for evaluation.
|
||||
|
||||
Each message is formatted as "role: text" and joined with newlines.
|
||||
"""
|
||||
parts: list[str] = []
|
||||
for msg in prompts:
|
||||
role = msg.get("role", "unknown")
|
||||
text = msg.get("text", "")
|
||||
parts.append(f"{role}: {text}")
|
||||
return "\n".join(parts)
|
||||
|
||||
|
||||
def _extract_llm_output(outputs: Mapping[str, Any]) -> str:
|
||||
"""Extract the LLM output text from NodeRunResult.outputs."""
|
||||
if "text" in outputs:
|
||||
return str(outputs["text"])
|
||||
if "answer" in outputs:
|
||||
return str(outputs["answer"])
|
||||
# Fallback: first value
|
||||
values = list(outputs.values())
|
||||
return str(values[0]) if values else ""
|
||||
|
||||
@ -33,48 +33,39 @@ class RetrievalEvaluationRunner(BaseEvaluationRunner):
|
||||
tenant_id: str,
|
||||
) -> list[EvaluationItemResult]:
|
||||
"""Compute retrieval evaluation metrics."""
|
||||
# Merge retrieved contexts into items
|
||||
result_by_index = {r.index: r for r in results}
|
||||
if not node_run_result_list:
|
||||
return []
|
||||
if not default_metric:
|
||||
raise ValueError("Default metric is required for retrieval evaluation")
|
||||
|
||||
merged_items = []
|
||||
for item in items:
|
||||
result = result_by_index.get(item.index)
|
||||
contexts = result.metadata.get("retrieved_contexts", []) if result else []
|
||||
for i, node_result in enumerate(node_run_result_list):
|
||||
# Extract retrieved contexts from outputs
|
||||
outputs = node_result.outputs
|
||||
contexts = list(outputs.get("retrieved_contexts", []))
|
||||
query = self._extract_query(dict(node_result.inputs))
|
||||
# Extract retrieved content from result list
|
||||
result_list = outputs.get("result", [])
|
||||
output = "\n---\n".join(
|
||||
str(item.get("content", "")) for item in result_list if item.get("content")
|
||||
)
|
||||
|
||||
merged_items.append(
|
||||
EvaluationItemInput(
|
||||
index=item.index,
|
||||
inputs=item.inputs,
|
||||
expected_output=item.expected_output,
|
||||
index=i,
|
||||
inputs={"query": query},
|
||||
output=output,
|
||||
context=contexts,
|
||||
)
|
||||
)
|
||||
|
||||
evaluated = self.evaluation_instance.evaluate_retrieval(
|
||||
merged_items, default_metrics, model_provider, model_name, tenant_id
|
||||
return self.evaluation_instance.evaluate_retrieval(
|
||||
merged_items, default_metric.metric, model_provider, model_name, tenant_id
|
||||
)
|
||||
|
||||
# Merge metrics back into original results (preserve actual_output and metadata)
|
||||
eval_by_index = {r.index: r for r in evaluated}
|
||||
final_results = []
|
||||
for result in results:
|
||||
if result.index in eval_by_index:
|
||||
eval_result = eval_by_index[result.index]
|
||||
final_results.append(
|
||||
EvaluationItemResult(
|
||||
index=result.index,
|
||||
actual_output=result.actual_output,
|
||||
metrics=eval_result.metrics,
|
||||
metadata=result.metadata,
|
||||
error=result.error,
|
||||
)
|
||||
)
|
||||
else:
|
||||
final_results.append(result)
|
||||
return final_results
|
||||
|
||||
@staticmethod
|
||||
def _extract_query(inputs: dict[str, Any]) -> str:
|
||||
for key in ("query", "question", "input", "text"):
|
||||
for key in ("query"):
|
||||
if key in inputs:
|
||||
return str(inputs[key])
|
||||
values = list(inputs.values())
|
||||
return str(values[0]) if values else ""
|
||||
return ""
|
||||
|
||||
Loading…
Reference in New Issue
Block a user