fix: avoid duplicating lines when merging text for summarization (#37093)

Co-authored-by: bymle <229636660+bymle@users.noreply.github.com>
This commit is contained in:
bymle 2026-06-05 15:02:57 +08:00 committed by GitHub
parent 0cdd478f25
commit 24876bb05d
No known key found for this signature in database
GPG Key ID: B5690EEEBB952194
3 changed files with 33 additions and 2 deletions

View File

@@ -392,7 +392,7 @@ Here is the extra instruction you need to follow:
else:
if len(messages[-1]) + len(line) < max_tokens * 0.5:
messages[-1] += line
if get_prompt_tokens(messages[-1] + line) > max_tokens * 0.7:
elif get_prompt_tokens(messages[-1] + line) > max_tokens * 0.7:
messages.append(line)
else:
messages[-1] += line

View File

@@ -135,7 +135,7 @@ class BuiltinTool(Tool):
else:
if len(messages[-1]) + len(j) < max_tokens * 0.5:
messages[-1] += j
if get_prompt_tokens(messages[-1] + j) > max_tokens * 0.7:
elif get_prompt_tokens(messages[-1] + j) > max_tokens * 0.7:
messages.append(j)
else:
messages[-1] += j

View File

@@ -129,3 +129,34 @@ def test_builtin_tool_summary_short_and_long_content_paths():
assert result
assert "S" in result
def test_builtin_tool_summary_does_not_duplicate_lines_when_merging_chunks():
    """Verify that ``summary`` hands every input line to the model exactly once.

    Regression guard for the chunk-merge loop: it previously ran two adjacent
    ``if`` statements that were not mutually exclusive, so a line that fit the
    character budget was concatenated onto the current chunk by the first
    ``if`` and then placed again by the second ``if``/``else``. The duplicated
    text ended up in the prompt sent for summarization.

    The tool is stubbed so that the token limit is 100, the token count of a
    prompt equals the character length of its last message, and each model
    invocation records the content of the last prompt message it was given.
    """
    expected_lines = ("a" * 20, "b" * 20, "c" * 20)
    tool = _build_tool()
    recorded_payloads: list[str] = []

    def _count_chars_as_tokens(prompt_messages):
        # Parameter name is kept as ``prompt_messages`` so keyword calls from
        # the code under test still bind correctly.
        return len(prompt_messages[-1].content)

    def _capture_invoke(user_id, prompt_messages, stop):
        # Remember what was sent to the model, then answer with a stub summary.
        recorded_payloads.append(prompt_messages[-1].content)
        return SimpleNamespace(message=SimpleNamespace(content="S"))

    # Patchers are entered in the same order as the original nested blocks:
    # token limit first, then token counting, then model invocation.
    limit_patch = patch.object(_BuiltinDummyTool, "get_max_tokens", return_value=100)
    counter_patch = patch.object(_BuiltinDummyTool, "get_prompt_tokens", side_effect=_count_chars_as_tokens)
    invoke_patch = patch.object(_BuiltinDummyTool, "invoke_model", side_effect=_capture_invoke)

    with limit_patch, counter_patch, invoke_patch:
        tool.summary(user_id="u1", content="\n".join(expected_lines))

    everything_sent = "".join(recorded_payloads)
    for expected in expected_lines:
        occurrences = everything_sent.count(expected)
        assert occurrences == 1, f"line was sent to the model {occurrences} times, expected 1"