From d299e75e1bee8b67dd804a0f2afb13cd861a2921 Mon Sep 17 00:00:00 2001 From: Guangdong Liu Date: Mon, 13 Oct 2025 10:22:59 +0800 Subject: [PATCH] refactor: use dynamic max characters for chunking in extractors (#26782) --- .../rag/extractor/unstructured/unstructured_doc_extractor.py | 4 +++- .../rag/extractor/unstructured/unstructured_eml_extractor.py | 4 +++- .../rag/extractor/unstructured/unstructured_epub_extractor.py | 4 +++- .../extractor/unstructured/unstructured_markdown_extractor.py | 4 +++- .../rag/extractor/unstructured/unstructured_msg_extractor.py | 4 +++- .../rag/extractor/unstructured/unstructured_xml_extractor.py | 4 +++- 6 files changed, 18 insertions(+), 6 deletions(-) diff --git a/api/core/rag/extractor/unstructured/unstructured_doc_extractor.py b/api/core/rag/extractor/unstructured/unstructured_doc_extractor.py index 5199208f70..7dd8beaa46 100644 --- a/api/core/rag/extractor/unstructured/unstructured_doc_extractor.py +++ b/api/core/rag/extractor/unstructured/unstructured_doc_extractor.py @@ -1,6 +1,7 @@ import logging import os +from configs import dify_config from core.rag.extractor.extractor_base import BaseExtractor from core.rag.models.document import Document @@ -49,7 +50,8 @@ class UnstructuredWordExtractor(BaseExtractor): from unstructured.chunking.title import chunk_by_title - chunks = chunk_by_title(elements, max_characters=2000, combine_text_under_n_chars=2000) + max_characters = dify_config.INDEXING_MAX_SEGMENTATION_TOKENS_LENGTH + chunks = chunk_by_title(elements, max_characters=max_characters, combine_text_under_n_chars=max_characters) documents = [] for chunk in chunks: text = chunk.text.strip() diff --git a/api/core/rag/extractor/unstructured/unstructured_eml_extractor.py b/api/core/rag/extractor/unstructured/unstructured_eml_extractor.py index ad04bd0bd1..d97d4c3a48 100644 --- a/api/core/rag/extractor/unstructured/unstructured_eml_extractor.py +++ b/api/core/rag/extractor/unstructured/unstructured_eml_extractor.py @@ -4,6 +4,7 @@ import logging from bs4 import BeautifulSoup +from configs import dify_config from core.rag.extractor.extractor_base import BaseExtractor from core.rag.models.document import Document @@ -46,7 +47,8 @@ class UnstructuredEmailExtractor(BaseExtractor): from unstructured.chunking.title import chunk_by_title - chunks = chunk_by_title(elements, max_characters=2000, combine_text_under_n_chars=2000) + max_characters = dify_config.INDEXING_MAX_SEGMENTATION_TOKENS_LENGTH + chunks = chunk_by_title(elements, max_characters=max_characters, combine_text_under_n_chars=max_characters) documents = [] for chunk in chunks: text = chunk.text.strip() diff --git a/api/core/rag/extractor/unstructured/unstructured_epub_extractor.py b/api/core/rag/extractor/unstructured/unstructured_epub_extractor.py index fc14ee6275..3061d957ac 100644 --- a/api/core/rag/extractor/unstructured/unstructured_epub_extractor.py +++ b/api/core/rag/extractor/unstructured/unstructured_epub_extractor.py @@ -2,6 +2,7 @@ import logging import pypandoc # type: ignore +from configs import dify_config from core.rag.extractor.extractor_base import BaseExtractor from core.rag.models.document import Document @@ -40,7 +41,8 @@ class UnstructuredEpubExtractor(BaseExtractor): from unstructured.chunking.title import chunk_by_title - chunks = chunk_by_title(elements, max_characters=2000, combine_text_under_n_chars=2000) + max_characters = dify_config.INDEXING_MAX_SEGMENTATION_TOKENS_LENGTH + chunks = chunk_by_title(elements, max_characters=max_characters, combine_text_under_n_chars=max_characters) documents = [] for chunk in chunks: text = chunk.text.strip() diff --git a/api/core/rag/extractor/unstructured/unstructured_markdown_extractor.py b/api/core/rag/extractor/unstructured/unstructured_markdown_extractor.py index 23030d7739..b6d8c47111 100644 --- a/api/core/rag/extractor/unstructured/unstructured_markdown_extractor.py +++ b/api/core/rag/extractor/unstructured/unstructured_markdown_extractor.py @@ -1,5 +1,6 @@ import logging +from configs import dify_config from core.rag.extractor.extractor_base import BaseExtractor from core.rag.models.document import Document @@ -32,7 +33,8 @@ class UnstructuredMarkdownExtractor(BaseExtractor): elements = partition_md(filename=self._file_path) from unstructured.chunking.title import chunk_by_title - chunks = chunk_by_title(elements, max_characters=2000, combine_text_under_n_chars=2000) + max_characters = dify_config.INDEXING_MAX_SEGMENTATION_TOKENS_LENGTH + chunks = chunk_by_title(elements, max_characters=max_characters, combine_text_under_n_chars=max_characters) documents = [] for chunk in chunks: text = chunk.text.strip() diff --git a/api/core/rag/extractor/unstructured/unstructured_msg_extractor.py b/api/core/rag/extractor/unstructured/unstructured_msg_extractor.py index f29e639d1b..ae60fc7981 100644 --- a/api/core/rag/extractor/unstructured/unstructured_msg_extractor.py +++ b/api/core/rag/extractor/unstructured/unstructured_msg_extractor.py @@ -1,5 +1,6 @@ import logging +from configs import dify_config from core.rag.extractor.extractor_base import BaseExtractor from core.rag.models.document import Document @@ -31,7 +32,8 @@ class UnstructuredMsgExtractor(BaseExtractor): elements = partition_msg(filename=self._file_path) from unstructured.chunking.title import chunk_by_title - chunks = chunk_by_title(elements, max_characters=2000, combine_text_under_n_chars=2000) + max_characters = dify_config.INDEXING_MAX_SEGMENTATION_TOKENS_LENGTH + chunks = chunk_by_title(elements, max_characters=max_characters, combine_text_under_n_chars=max_characters) documents = [] for chunk in chunks: text = chunk.text.strip() diff --git a/api/core/rag/extractor/unstructured/unstructured_xml_extractor.py b/api/core/rag/extractor/unstructured/unstructured_xml_extractor.py index d75e166f1b..2d4846d85e 100644 --- a/api/core/rag/extractor/unstructured/unstructured_xml_extractor.py +++ b/api/core/rag/extractor/unstructured/unstructured_xml_extractor.py @@ -1,5 +1,6 @@ import logging +from configs import dify_config from core.rag.extractor.extractor_base import BaseExtractor from core.rag.models.document import Document @@ -32,7 +33,8 @@ class UnstructuredXmlExtractor(BaseExtractor): from unstructured.chunking.title import chunk_by_title - chunks = chunk_by_title(elements, max_characters=2000, combine_text_under_n_chars=2000) + max_characters = dify_config.INDEXING_MAX_SEGMENTATION_TOKENS_LENGTH + chunks = chunk_by_title(elements, max_characters=max_characters, combine_text_under_n_chars=max_characters) documents = [] for chunk in chunks: text = chunk.text.strip()