diff --git a/api/core/app/apps/base_app_runner.py b/api/core/app/apps/base_app_runner.py
index e2e6c11480..21399a200d 100644
--- a/api/core/app/apps/base_app_runner.py
+++ b/api/core/app/apps/base_app_runner.py
@@ -247,6 +247,7 @@ class AppRunner:
         model: str = ""
         prompt_messages: list[PromptMessage] = []
         text = ""
+        reasoning_content = ""
         usage = None
         for result in invoke_result:
             if not agent:
@@ -266,6 +267,10 @@ class AppRunner:
                 else:
                     text += content  # failback to str
 
+            # Handle reasoning content from delta (e.g., Ollama's thinking field)
+            if result.delta.reasoning_content:
+                reasoning_content += result.delta.reasoning_content
+
             if not model:
                 model = result.model
 
@@ -279,7 +284,11 @@ class AppRunner:
             usage = LLMUsage.empty_usage()
 
         llm_result = LLMResult(
-            model=model, prompt_messages=prompt_messages, message=AssistantPromptMessage(content=text), usage=usage
+            model=model,
+            prompt_messages=prompt_messages,
+            message=AssistantPromptMessage(content=text),
+            usage=usage,
+            reasoning_content=reasoning_content or None,
         )
 
         queue_manager.publish(
diff --git a/api/core/model_runtime/entities/llm_entities.py b/api/core/model_runtime/entities/llm_entities.py
index 2c7c421eed..f106e85812 100644
--- a/api/core/model_runtime/entities/llm_entities.py
+++ b/api/core/model_runtime/entities/llm_entities.py
@@ -192,6 +192,7 @@ class LLMResultChunkDelta(BaseModel):
     message: AssistantPromptMessage
     usage: LLMUsage | None = None
     finish_reason: str | None = None
+    reasoning_content: str | None = None
 
 
 class LLMResultChunk(BaseModel):
diff --git a/api/core/plugin/impl/base.py b/api/core/plugin/impl/base.py
index 7bb2749afa..3c2f19f809 100644
--- a/api/core/plugin/impl/base.py
+++ b/api/core/plugin/impl/base.py
@@ -9,6 +9,7 @@ from pydantic import BaseModel
 from yarl import URL
 
 from configs import dify_config
+from core.model_runtime.entities.llm_entities import LLMResultChunk
 from core.model_runtime.errors.invoke import (
     InvokeAuthorizationError,
     InvokeBadRequestError,
@@ -255,7 +256,17 @@ class BasePluginClient:
         """
         for line in self._stream_request(method, path, params, headers, data, files):
             try:
-                rep = PluginDaemonBasicResponse[type_].model_validate_json(line)  # type: ignore
+                line_data = json.loads(line)
+
+                if isinstance(line_data, dict) and type_ is LLMResultChunk:
+                    if "data" in line_data and isinstance(line_data["data"], dict):
+                        data_dict = line_data["data"]
+                        if "delta" in data_dict and isinstance(data_dict["delta"], dict):
+                            delta_dict = data_dict["delta"]
+                            if "thinking" in delta_dict and "reasoning_content" not in delta_dict:
+                                delta_dict["reasoning_content"] = delta_dict.pop("thinking")
+
+                rep = PluginDaemonBasicResponse[type_].model_validate(line_data)  # type: ignore
             except (ValueError, TypeError):
                 # TODO modify this when line_data has code and message
                 try:
diff --git a/api/core/workflow/nodes/llm/node.py b/api/core/workflow/nodes/llm/node.py
index 04e2802191..0da34f0e81 100644
--- a/api/core/workflow/nodes/llm/node.py
+++ b/api/core/workflow/nodes/llm/node.py
@@ -444,6 +444,7 @@ class LLMNode(Node[LLMNodeData]):
         usage = LLMUsage.empty_usage()
         finish_reason = None
         full_text_buffer = io.StringIO()
+        reasoning_content_buffer = io.StringIO()
 
         # Initialize streaming metrics tracking
         start_time = request_start_time if request_start_time is not None else time.perf_counter()
@@ -478,6 +479,15 @@ class LLMNode(Node[LLMNodeData]):
                     is_final=False,
                 )
 
+            # Handle reasoning content from delta (e.g., Ollama's thinking field)
+            if result.delta.reasoning_content:
+                reasoning_content_buffer.write(result.delta.reasoning_content)
+                yield StreamChunkEvent(
+                    selector=[node_id, "reasoning_content"],
+                    chunk=result.delta.reasoning_content,
+                    is_final=False,
+                )
+
             # Update the whole metadata
             if not model and result.model:
                 model = result.model
@@ -494,6 +504,7 @@ class LLMNode(Node[LLMNodeData]):
 
         # Extract reasoning content from tags in the main text
         full_text = full_text_buffer.getvalue()
+        streamed_reasoning_content = reasoning_content_buffer.getvalue()
 
         if reasoning_format == "tagged":
             # Keep tags in text for backward compatibility
@@ -501,7 +512,8 @@
             reasoning_content = ""
         else:
            # Extract clean text and reasoning from tags
-            clean_text, reasoning_content = LLMNode._split_reasoning(full_text, reasoning_format)
+            clean_text, extracted_reasoning_content = LLMNode._split_reasoning(full_text, reasoning_format)
+            reasoning_content = streamed_reasoning_content or extracted_reasoning_content
 
         # Calculate streaming metrics
         end_time = time.perf_counter()
@@ -1158,7 +1170,8 @@ class LLMNode(Node[LLMNodeData]):
             reasoning_content = ""
         else:
             # Extract clean text and reasoning from tags
-            clean_text, reasoning_content = LLMNode._split_reasoning(full_text, reasoning_format)
+            clean_text, extracted_reasoning_content = LLMNode._split_reasoning(full_text, reasoning_format)
+            reasoning_content = invoke_result.reasoning_content or extracted_reasoning_content
 
         event = ModelInvokeCompletedEvent(
             # Use clean_text for separated mode, full_text for tagged mode