diff --git a/api/core/evaluation/frameworks/ragas/ragas_evaluator.py b/api/core/evaluation/frameworks/ragas/ragas_evaluator.py
index c494615d34..70a544a38f 100644
--- a/api/core/evaluation/frameworks/ragas/ragas_evaluator.py
+++ b/api/core/evaluation/frameworks/ragas/ragas_evaluator.py
@@ -126,7 +126,7 @@ class RagasEvaluator(BaseEvaluationInstance):
         samples = []
         for item in items:
             sample = SingleTurnSample(
-                user_input=self._inputs_to_query(item.inputs),
+                user_input=self._inputs_format(item.inputs, category),
                 response=item.expected_output or "",
                 retrieved_contexts=item.context or [],
             )
@@ -233,14 +233,15 @@ class RagasEvaluator(BaseEvaluationInstance):
         return 0.0
 
     @staticmethod
-    def _inputs_to_query(inputs: dict[str, Any]) -> str:
-        """Convert input dict to a query string."""
-        if "query" in inputs:
-            return str(inputs["query"])
-        if "question" in inputs:
-            return str(inputs["question"])
-        # Fallback: concatenate all input values
-        return " ".join(str(v) for v in inputs.values())
+    def _inputs_format(inputs: dict[str, Any], category: EvaluationCategory) -> str:
+        """Convert input dict to a prompt string."""
+        match category:
+            case EvaluationCategory.LLM:
+                return str(inputs["prompt"])
+            case EvaluationCategory.RETRIEVAL:
+                return str(inputs["query"])
+            case _:
+                return ""
 
     @staticmethod
     def _build_ragas_metrics(requested_metrics: list[str]) -> list[Any]:
diff --git a/api/core/evaluation/runners/llm_evaluation_runner.py b/api/core/evaluation/runners/llm_evaluation_runner.py
index 5467f79849..2e412752b1 100644
--- a/api/core/evaluation/runners/llm_evaluation_runner.py
+++ b/api/core/evaluation/runners/llm_evaluation_runner.py
@@ -75,17 +75,45 @@ class LLMEvaluationRunner(BaseEvaluationRunner):
     def _merge_results_into_items(
         items: list[NodeRunResult],
     ) -> list[EvaluationItemInput]:
-        """Create new items with actual_output set as expected_output context for metrics."""
+        """Create new items from NodeRunResult for ragas evaluation.
+
+        Extracts prompts from process_data and concatenates them into a single
+        string with role prefixes (e.g. "system: ...\nuser: ...\nassistant: ...").
+        The output text is taken from outputs (the "text" or "answer" key).
+        """
         merged = []
-        for item in items:
+        for i, item in enumerate(items):
+            prompt = _format_prompts(item.process_data.get("prompts", []))
+            output = _extract_llm_output(item.outputs)
             merged.append(
                 EvaluationItemInput(
-                    index=item.index,
-                    inputs={
-                        "prompt": item.prompt,
-                    },
-                    output=item.output,
-                    expected_output=item.expected_output,
+                    index=i,
+                    inputs={"prompt": prompt},
+                    output=output,
                 )
             )
         return merged
+
+
+def _format_prompts(prompts: list[dict[str, Any]]) -> str:
+    """Concatenate a list of prompt messages into a single string for evaluation.
+
+    Each message is formatted as "role: text" and joined with newlines.
+    """
+    parts: list[str] = []
+    for msg in prompts:
+        role = msg.get("role", "unknown")
+        text = msg.get("text", "")
+        parts.append(f"{role}: {text}")
+    return "\n".join(parts)
+
+
+def _extract_llm_output(outputs: Mapping[str, Any]) -> str:
+    """Extract the LLM output text from NodeRunResult.outputs."""
+    if "text" in outputs:
+        return str(outputs["text"])
+    if "answer" in outputs:
+        return str(outputs["answer"])
+    # Fallback: first value
+    values = list(outputs.values())
+    return str(values[0]) if values else ""
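Note: a minimal standalone sketch of the prompt flattening and output extraction introduced above, run on illustrative sample data. The dict shapes and names below are assumptions inferred from this diff, not the real NodeRunResult type.

    # sketch.py -- hypothetical sample data; mirrors _format_prompts / _extract_llm_output
    from typing import Any

    def format_prompts(prompts: list[dict[str, Any]]) -> str:
        # Format each message as "role: text" and join with newlines.
        return "\n".join(f"{m.get('role', 'unknown')}: {m.get('text', '')}" for m in prompts)

    def extract_llm_output(outputs: dict[str, Any]) -> str:
        # Prefer "text", then "answer", then fall back to the first value.
        for key in ("text", "answer"):
            if key in outputs:
                return str(outputs[key])
        values = list(outputs.values())
        return str(values[0]) if values else ""

    prompts = [
        {"role": "system", "text": "You are a helpful assistant."},
        {"role": "user", "text": "Summarize the release notes."},
    ]
    print(format_prompts(prompts))
    # system: You are a helpful assistant.
    # user: Summarize the release notes.
    print(extract_llm_output({"text": "Here is a summary..."}))
    # Here is a summary...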
+ """ + parts: list[str] = [] + for msg in prompts: + role = msg.get("role", "unknown") + text = msg.get("text", "") + parts.append(f"{role}: {text}") + return "\n".join(parts) + + +def _extract_llm_output(outputs: Mapping[str, Any]) -> str: + """Extract the LLM output text from NodeRunResult.outputs.""" + if "text" in outputs: + return str(outputs["text"]) + if "answer" in outputs: + return str(outputs["answer"]) + # Fallback: first value + values = list(outputs.values()) + return str(values[0]) if values else "" diff --git a/api/core/evaluation/runners/retrieval_evaluation_runner.py b/api/core/evaluation/runners/retrieval_evaluation_runner.py index 311edee2d5..a11032ec63 100644 --- a/api/core/evaluation/runners/retrieval_evaluation_runner.py +++ b/api/core/evaluation/runners/retrieval_evaluation_runner.py @@ -33,48 +33,39 @@ class RetrievalEvaluationRunner(BaseEvaluationRunner): tenant_id: str, ) -> list[EvaluationItemResult]: """Compute retrieval evaluation metrics.""" - # Merge retrieved contexts into items - result_by_index = {r.index: r for r in results} + if not node_run_result_list: + return [] + if not default_metric: + raise ValueError("Default metric is required for retrieval evaluation") + merged_items = [] - for item in items: - result = result_by_index.get(item.index) - contexts = result.metadata.get("retrieved_contexts", []) if result else [] + for i, node_result in enumerate(node_run_result_list): + # Extract retrieved contexts from outputs + outputs = node_result.outputs + contexts = list(outputs.get("retrieved_contexts", [])) + query = self._extract_query(dict(node_result.inputs)) + # Extract retrieved content from result list + result_list = outputs.get("result", []) + output = "\n---\n".join( + str(item.get("content", "")) for item in result_list if item.get("content") + ) + merged_items.append( EvaluationItemInput( - index=item.index, - inputs=item.inputs, - expected_output=item.expected_output, + index=i, + inputs={"query": query}, + output=output, context=contexts, ) ) - evaluated = self.evaluation_instance.evaluate_retrieval( - merged_items, default_metrics, model_provider, model_name, tenant_id + return self.evaluation_instance.evaluate_retrieval( + merged_items, default_metric.metric, model_provider, model_name, tenant_id ) - # Merge metrics back into original results (preserve actual_output and metadata) - eval_by_index = {r.index: r for r in evaluated} - final_results = [] - for result in results: - if result.index in eval_by_index: - eval_result = eval_by_index[result.index] - final_results.append( - EvaluationItemResult( - index=result.index, - actual_output=result.actual_output, - metrics=eval_result.metrics, - metadata=result.metadata, - error=result.error, - ) - ) - else: - final_results.append(result) - return final_results - @staticmethod def _extract_query(inputs: dict[str, Any]) -> str: - for key in ("query", "question", "input", "text"): + for key in ("query"): if key in inputs: return str(inputs[key]) - values = list(inputs.values()) - return str(values[0]) if values else "" + return ""