From 24876bb05d8eaddc9e6537e3899a0066406e7e6b Mon Sep 17 00:00:00 2001
From: bymle <by327@cornell.edu>
Date: Fri, 5 Jun 2026 15:02:57 +0800
Subject: [PATCH] fix: avoid duplicating lines when merging text for
 summarization (#37093)

Co-authored-by: bymle <229636660+bymle@users.noreply.github.com>
---
 api/core/plugin/backwards_invocation/model.py |  2 +-
 api/core/tools/builtin_tool/tool.py           |  2 +-
 .../core/tools/test_builtin_tool_base.py      | 31 +++++++++++++++++++
 3 files changed, 33 insertions(+), 2 deletions(-)

diff --git a/api/core/plugin/backwards_invocation/model.py b/api/core/plugin/backwards_invocation/model.py
index c92438960a..c03665272b 100644
--- a/api/core/plugin/backwards_invocation/model.py
+++ b/api/core/plugin/backwards_invocation/model.py
@@ -392,7 +392,7 @@ Here is the extra instruction you need to follow:
             else:
                 if len(messages[-1]) + len(line) < max_tokens * 0.5:
                     messages[-1] += line
-                if get_prompt_tokens(messages[-1] + line) > max_tokens * 0.7:
+                elif get_prompt_tokens(messages[-1] + line) > max_tokens * 0.7:
                     messages.append(line)
                 else:
                     messages[-1] += line
diff --git a/api/core/tools/builtin_tool/tool.py b/api/core/tools/builtin_tool/tool.py
index d41503e1e6..1872cb46a9 100644
--- a/api/core/tools/builtin_tool/tool.py
+++ b/api/core/tools/builtin_tool/tool.py
@@ -135,7 +135,7 @@ class BuiltinTool(Tool):
             else:
                 if len(messages[-1]) + len(j) < max_tokens * 0.5:
                     messages[-1] += j
-                if get_prompt_tokens(messages[-1] + j) > max_tokens * 0.7:
+                elif get_prompt_tokens(messages[-1] + j) > max_tokens * 0.7:
                     messages.append(j)
                 else:
                     messages[-1] += j
diff --git a/api/tests/unit_tests/core/tools/test_builtin_tool_base.py b/api/tests/unit_tests/core/tools/test_builtin_tool_base.py
index 1ff81f6120..e53468d9c5 100644
--- a/api/tests/unit_tests/core/tools/test_builtin_tool_base.py
+++ b/api/tests/unit_tests/core/tools/test_builtin_tool_base.py
@@ -129,3 +129,34 @@ def test_builtin_tool_summary_short_and_long_content_paths():
 
     assert result
     assert "S" in result
+
+
+def test_builtin_tool_summary_does_not_duplicate_lines_when_merging_chunks():
+    """Each line must be placed into exactly one summarization chunk.
+
+    The chunk-merge loop used two adjacent, non-mutually-exclusive ``if`` blocks, so a
+    line that fit the character budget was concatenated onto the current chunk AND then
+    added again by the second ``if``/``else``, duplicating text sent to the model.
+    """
+    tool = _build_tool()
+
+    captured_chunks: list[str] = []
+
+    def _record_invoke(user_id, prompt_messages, stop):
+        captured_chunks.append(prompt_messages[-1].content)
+        return SimpleNamespace(message=SimpleNamespace(content="S"))
+
+    content = "\n".join(["a" * 20, "b" * 20, "c" * 20])
+
+    with patch.object(_BuiltinDummyTool, "get_max_tokens", return_value=100):
+        with patch.object(
+            _BuiltinDummyTool,
+            "get_prompt_tokens",
+            side_effect=lambda prompt_messages: len(prompt_messages[-1].content),
+        ):
+            with patch.object(_BuiltinDummyTool, "invoke_model", side_effect=_record_invoke):
+                tool.summary(user_id="u1", content=content)
+
+    combined = "".join(captured_chunks)
+    for line in ("a" * 20, "b" * 20, "c" * 20):
+        assert combined.count(line) == 1, f"line was sent to the model {combined.count(line)} times, expected 1"