evaluation runtime

This commit is contained in:
jyong 2026-03-11 19:57:46 +08:00
parent 61e87a4ff4
commit 8ea3729fe9
3 changed files with 69 additions and 49 deletions

View File

@@ -126,7 +126,7 @@ class RagasEvaluator(BaseEvaluationInstance):
samples = []
for item in items:
sample = SingleTurnSample(
user_input=self._inputs_to_query(item.inputs),
user_input=self._inputs_format(item.inputs, category),
response=item.expected_output or "",
retrieved_contexts=item.context or [],
)
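For context, a minimal sketch of how samples like these are typically scored with ragas (assuming the ragas 0.2-style SingleTurnSample / EvaluationDataset / evaluate API; the metric choice and example texts are illustrative, not taken from this evaluator):

from ragas import evaluate
from ragas.dataset_schema import EvaluationDataset, SingleTurnSample
from ragas.metrics import Faithfulness

# Illustrative sample, mirroring the fields populated in the loop above.
samples = [
    SingleTurnSample(
        user_input="What is the capital of France?",
        response="Paris is the capital of France.",
        retrieved_contexts=["Paris is the capital and largest city of France."],
    )
]

# evaluate() scores every sample against the requested metrics; the real
# evaluator builds its metric list via _build_ragas_metrics().
result = evaluate(dataset=EvaluationDataset(samples=samples), metrics=[Faithfulness()])
print(result.to_pandas())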
@@ -233,14 +233,15 @@ class RagasEvaluator(BaseEvaluationInstance):
return 0.0
@staticmethod
def _inputs_to_query(inputs: dict[str, Any]) -> str:
"""Convert input dict to a query string."""
if "query" in inputs:
return str(inputs["query"])
if "question" in inputs:
return str(inputs["question"])
# Fallback: concatenate all input values
return " ".join(str(v) for v in inputs.values())
def _inputs_format(inputs: dict[str, Any], category: EvaluationCategory) -> str:
"""Convert input dict to a prompt string."""
match category:
case EvaluationCategory.LLM:
return str(inputs["prompt"])
case EvaluationCategory.RETRIEVAL:
return str(inputs["query"])
case _:
return ""
@staticmethod
def _build_ragas_metrics(requested_metrics: list[str]) -> list[Any]:
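A compact illustration of the new dispatch: LLM items are expected to carry a "prompt" key, retrieval items a "query" key, and anything else falls through to an empty string. The EvaluationCategory class below is a hypothetical stand-in for the real enum:

from enum import Enum
from typing import Any


class EvaluationCategory(Enum):  # stand-in for the real enum used above
    LLM = "llm"
    RETRIEVAL = "retrieval"


def _inputs_format(inputs: dict[str, Any], category: EvaluationCategory) -> str:
    match category:
        case EvaluationCategory.LLM:
            return str(inputs["prompt"])
        case EvaluationCategory.RETRIEVAL:
            return str(inputs["query"])
        case _:
            return ""


assert _inputs_format({"prompt": "Summarize X."}, EvaluationCategory.LLM) == "Summarize X."
assert _inputs_format({"query": "What is X?"}, EvaluationCategory.RETRIEVAL) == "What is X?"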

View File

@@ -75,17 +75,45 @@ class LLMEvaluationRunner(BaseEvaluationRunner):
def _merge_results_into_items(
items: list[NodeRunResult],
) -> list[EvaluationItemInput]:
"""Create new items with actual_output set as expected_output context for metrics."""
"""Create new items from NodeRunResult for ragas evaluation.
Extracts prompts from process_data and concatenates them into a single
string with role prefixes (e.g. "system: ...\nuser: ...\nassistant: ...").
The last assistant message in outputs is used as the actual output.
"""
merged = []
for item in items:
for i, item in enumerate(items):
prompt = _format_prompts(item.process_data.get("prompts", []))
output = _extract_llm_output(item.outputs)
merged.append(
EvaluationItemInput(
index=item.index,
inputs={
"prompt": item.prompt,
},
output=item.output,
expected_output=item.expected_output,
index=i,
inputs={"prompt": prompt},
output=output,
)
)
return merged
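A rough usage sketch, using types.SimpleNamespace as a stand-in for NodeRunResult and assuming _merge_results_into_items is importable from this module (import path omitted):

from types import SimpleNamespace

# Stand-in for a NodeRunResult; only the fields the merge step reads are set.
fake_result = SimpleNamespace(
    process_data={
        "prompts": [
            {"role": "system", "text": "You are a helpful assistant."},
            {"role": "user", "text": "Summarize the document."},
        ]
    },
    outputs={"text": "Here is a short summary."},
)

merged = _merge_results_into_items([fake_result])
# merged[0].inputs["prompt"] ==
#     "system: You are a helpful assistant.\nuser: Summarize the document."
# merged[0].output == "Here is a short summary."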
def _format_prompts(prompts: list[dict[str, Any]]) -> str:
"""Concatenate a list of prompt messages into a single string for evaluation.
Each message is formatted as "role: text" and joined with newlines.
"""
parts: list[str] = []
for msg in prompts:
role = msg.get("role", "unknown")
text = msg.get("text", "")
parts.append(f"{role}: {text}")
return "\n".join(parts)
def _extract_llm_output(outputs: Mapping[str, Any]) -> str:
"""Extract the LLM output text from NodeRunResult.outputs."""
if "text" in outputs:
return str(outputs["text"])
if "answer" in outputs:
return str(outputs["answer"])
# Fallback: first value
values = list(outputs.values())
return str(values[0]) if values else ""
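Expected behaviour of the two helpers above, shown as small assertions (assuming both are in scope):

prompts = [
    {"role": "system", "text": "You are a grader."},
    {"role": "user", "text": "Score this answer."},
]
assert _format_prompts(prompts) == "system: You are a grader.\nuser: Score this answer."

# Output extraction prefers "text", then "answer", then falls back to the first value.
assert _extract_llm_output({"text": "A", "answer": "B"}) == "A"
assert _extract_llm_output({"answer": "B"}) == "B"
assert _extract_llm_output({"usage": {"tokens": 12}}) == "{'tokens': 12}"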

View File

@@ -33,48 +33,39 @@ class RetrievalEvaluationRunner(BaseEvaluationRunner):
tenant_id: str,
) -> list[EvaluationItemResult]:
"""Compute retrieval evaluation metrics."""
# Merge retrieved contexts into items
result_by_index = {r.index: r for r in results}
if not node_run_result_list:
return []
if not default_metric:
raise ValueError("Default metric is required for retrieval evaluation")
merged_items = []
for item in items:
result = result_by_index.get(item.index)
contexts = result.metadata.get("retrieved_contexts", []) if result else []
for i, node_result in enumerate(node_run_result_list):
# Extract retrieved contexts from outputs
outputs = node_result.outputs
contexts = list(outputs.get("retrieved_contexts", []))
query = self._extract_query(dict(node_result.inputs))
# Extract retrieved content from result list
result_list = outputs.get("result", [])
output = "\n---\n".join(
str(item.get("content", "")) for item in result_list if item.get("content")
)
merged_items.append(
EvaluationItemInput(
index=item.index,
inputs=item.inputs,
expected_output=item.expected_output,
index=i,
inputs={"query": query},
output=output,
context=contexts,
)
)
evaluated = self.evaluation_instance.evaluate_retrieval(
merged_items, default_metrics, model_provider, model_name, tenant_id
return self.evaluation_instance.evaluate_retrieval(
merged_items, default_metric.metric, model_provider, model_name, tenant_id
)
# Merge metrics back into original results (preserve actual_output and metadata)
eval_by_index = {r.index: r for r in evaluated}
final_results = []
for result in results:
if result.index in eval_by_index:
eval_result = eval_by_index[result.index]
final_results.append(
EvaluationItemResult(
index=result.index,
actual_output=result.actual_output,
metrics=eval_result.metrics,
metadata=result.metadata,
error=result.error,
)
)
else:
final_results.append(result)
return final_results
@staticmethod
def _extract_query(inputs: dict[str, Any]) -> str:
for key in ("query", "question", "input", "text"):
for key in ("query"):
if key in inputs:
return str(inputs[key])
values = list(inputs.values())
return str(values[0]) if values else ""
return ""