From 24876bb05d8eaddc9e6537e3899a0066406e7e6b Mon Sep 17 00:00:00 2001
From: bymle
Date: Fri, 5 Jun 2026 15:02:57 +0800
Subject: [PATCH] fix: avoid duplicating lines when merging text for summarization (#37093)

Co-authored-by: bymle <229636660+bymle@users.noreply.github.com>
---
Review notes (free text between the three-dash line and the diffstat; not
part of the commit message):

- model.py and tool.py receive the same one-keyword change: the token-budget
  check (`get_prompt_tokens(...) > max_tokens * 0.7`) becomes an `elif` of
  the character-budget check (`len(...) < max_tokens * 0.5`). A line that
  was already concatenated onto the current chunk by the first branch is no
  longer evaluated by the second branch, so it is placed exactly once.
- The new unit test patches `get_max_tokens` to 100 and `get_prompt_tokens`
  to the length of the last prompt message, feeds three 20-character lines,
  records the last prompt message of every `invoke_model` call, and asserts
  that each line occurs exactly once in the concatenated recordings.
- NOTE(review): the leading whitespace of the context lines below was
  reconstructed from the Python block structure, and the blank first context
  line of the test hunk was inferred from the `-129,3` line count. Confirm
  against the target files, or apply with `git apply --ignore-whitespace`
  -- TODO confirm.

 api/core/plugin/backwards_invocation/model.py |  2 +-
 api/core/tools/builtin_tool/tool.py           |  2 +-
 .../core/tools/test_builtin_tool_base.py      | 31 +++++++++++++++++++
 3 files changed, 33 insertions(+), 2 deletions(-)

diff --git a/api/core/plugin/backwards_invocation/model.py b/api/core/plugin/backwards_invocation/model.py
index c92438960a..c03665272b 100644
--- a/api/core/plugin/backwards_invocation/model.py
+++ b/api/core/plugin/backwards_invocation/model.py
@@ -392,7 +392,7 @@ Here is the extra instruction you need to follow:
             else:
                 if len(messages[-1]) + len(line) < max_tokens * 0.5:
                     messages[-1] += line
-                if get_prompt_tokens(messages[-1] + line) > max_tokens * 0.7:
+                elif get_prompt_tokens(messages[-1] + line) > max_tokens * 0.7:
                     messages.append(line)
                 else:
                     messages[-1] += line
diff --git a/api/core/tools/builtin_tool/tool.py b/api/core/tools/builtin_tool/tool.py
index d41503e1e6..1872cb46a9 100644
--- a/api/core/tools/builtin_tool/tool.py
+++ b/api/core/tools/builtin_tool/tool.py
@@ -135,7 +135,7 @@ class BuiltinTool(Tool):
             else:
                 if len(messages[-1]) + len(j) < max_tokens * 0.5:
                     messages[-1] += j
-                if get_prompt_tokens(messages[-1] + j) > max_tokens * 0.7:
+                elif get_prompt_tokens(messages[-1] + j) > max_tokens * 0.7:
                     messages.append(j)
                 else:
                     messages[-1] += j
diff --git a/api/tests/unit_tests/core/tools/test_builtin_tool_base.py b/api/tests/unit_tests/core/tools/test_builtin_tool_base.py
index 1ff81f6120..e53468d9c5 100644
--- a/api/tests/unit_tests/core/tools/test_builtin_tool_base.py
+++ b/api/tests/unit_tests/core/tools/test_builtin_tool_base.py
@@ -129,3 +129,34 @@ def test_builtin_tool_summary_short_and_long_content_paths():
 
     assert result
     assert "S" in result
+
+
+def test_builtin_tool_summary_does_not_duplicate_lines_when_merging_chunks():
+    """Each line must be placed into exactly one summarization chunk.
+
+    The chunk-merge loop used two adjacent, non-mutually-exclusive ``if`` blocks, so a
+    line that fit the character budget was concatenated onto the current chunk AND then
+    appended a second time. That duplicated content in the text sent to the model.
+    """
+    tool = _build_tool()
+
+    captured_chunks: list[str] = []
+
+    def _record_invoke(user_id, prompt_messages, stop):
+        captured_chunks.append(prompt_messages[-1].content)
+        return SimpleNamespace(message=SimpleNamespace(content="S"))
+
+    content = "\n".join(["a" * 20, "b" * 20, "c" * 20])
+
+    with patch.object(_BuiltinDummyTool, "get_max_tokens", return_value=100):
+        with patch.object(
+            _BuiltinDummyTool,
+            "get_prompt_tokens",
+            side_effect=lambda prompt_messages: len(prompt_messages[-1].content),
+        ):
+            with patch.object(_BuiltinDummyTool, "invoke_model", side_effect=_record_invoke):
+                tool.summary(user_id="u1", content=content)
+
+    combined = "".join(captured_chunks)
+    for line in ("a" * 20, "b" * 20, "c" * 20):
+        assert combined.count(line) == 1, f"line was sent to the model {combined.count(line)} times, expected 1"