From 2b6882bd978255852cf0af2588199fe3645bafe8 Mon Sep 17 00:00:00 2001 From: znn Date: Fri, 10 Oct 2025 13:31:33 +0530 Subject: [PATCH] fix chunks 2 (#26623) Co-authored-by: autofix-ci[bot] <114827586+autofix-ci[bot]@users.noreply.github.com> --- api/core/rag/splitter/fixed_text_splitter.py | 12 ++++++++---- 1 file changed, 8 insertions(+), 4 deletions(-) diff --git a/api/core/rag/splitter/fixed_text_splitter.py b/api/core/rag/splitter/fixed_text_splitter.py index 8356861242..801d2a2a52 100644 --- a/api/core/rag/splitter/fixed_text_splitter.py +++ b/api/core/rag/splitter/fixed_text_splitter.py @@ -2,6 +2,7 @@ from __future__ import annotations +import re from typing import Any from core.model_manager import ModelInstance @@ -52,7 +53,7 @@ class FixedRecursiveCharacterTextSplitter(EnhanceRecursiveCharacterTextSplitter) """Create a new TextSplitter.""" super().__init__(**kwargs) self._fixed_separator = fixed_separator - self._separators = separators or ["\n\n", "\n", " ", ""] + self._separators = separators or ["\n\n", "\n", "。", ". ", " ", ""] def split_text(self, text: str) -> list[str]: """Split incoming text and return chunks.""" @@ -90,16 +91,19 @@ class FixedRecursiveCharacterTextSplitter(EnhanceRecursiveCharacterTextSplitter) # Now that we have the separator, split the text if separator: if separator == " ": - splits = text.split() + splits = re.split(r" +", text) else: splits = text.split(separator) splits = [item + separator if i < len(splits) else item for i, item in enumerate(splits)] else: splits = list(text) - splits = [s for s in splits if (s not in {"", "\n"})] + if separator == "\n": + splits = [s for s in splits if s != ""] + else: + splits = [s for s in splits if (s not in {"", "\n"})] _good_splits = [] _good_splits_lengths = [] # cache the lengths of the splits - _separator = "" if self._keep_separator else separator + _separator = separator if self._keep_separator else "" s_lens = self._length_function(splits) if separator != "": for s, s_len in zip(splits, s_lens):