mirror of https://github.com/langgenius/dify.git
refactor: use dynamic max characters for chunking in extractors (#26782)
This commit is contained in:
parent
f86b6658c9
commit
d299e75e1b
|
|
@ -1,6 +1,7 @@
|
|||
import logging
|
||||
import os
|
||||
|
||||
from configs import dify_config
|
||||
from core.rag.extractor.extractor_base import BaseExtractor
|
||||
from core.rag.models.document import Document
|
||||
|
||||
|
|
@ -49,7 +50,8 @@ class UnstructuredWordExtractor(BaseExtractor):
|
|||
|
||||
from unstructured.chunking.title import chunk_by_title
|
||||
|
||||
chunks = chunk_by_title(elements, max_characters=2000, combine_text_under_n_chars=2000)
|
||||
max_characters = dify_config.INDEXING_MAX_SEGMENTATION_TOKENS_LENGTH
|
||||
chunks = chunk_by_title(elements, max_characters=max_characters, combine_text_under_n_chars=max_characters)
|
||||
documents = []
|
||||
for chunk in chunks:
|
||||
text = chunk.text.strip()
|
||||
|
|
|
|||
|
|
@ -4,6 +4,7 @@ import logging
|
|||
|
||||
from bs4 import BeautifulSoup
|
||||
|
||||
from configs import dify_config
|
||||
from core.rag.extractor.extractor_base import BaseExtractor
|
||||
from core.rag.models.document import Document
|
||||
|
||||
|
|
@ -46,7 +47,8 @@ class UnstructuredEmailExtractor(BaseExtractor):
|
|||
|
||||
from unstructured.chunking.title import chunk_by_title
|
||||
|
||||
chunks = chunk_by_title(elements, max_characters=2000, combine_text_under_n_chars=2000)
|
||||
max_characters = dify_config.INDEXING_MAX_SEGMENTATION_TOKENS_LENGTH
|
||||
chunks = chunk_by_title(elements, max_characters=max_characters, combine_text_under_n_chars=max_characters)
|
||||
documents = []
|
||||
for chunk in chunks:
|
||||
text = chunk.text.strip()
|
||||
|
|
|
|||
|
|
@ -2,6 +2,7 @@ import logging
|
|||
|
||||
import pypandoc # type: ignore
|
||||
|
||||
from configs import dify_config
|
||||
from core.rag.extractor.extractor_base import BaseExtractor
|
||||
from core.rag.models.document import Document
|
||||
|
||||
|
|
@ -40,7 +41,8 @@ class UnstructuredEpubExtractor(BaseExtractor):
|
|||
|
||||
from unstructured.chunking.title import chunk_by_title
|
||||
|
||||
chunks = chunk_by_title(elements, max_characters=2000, combine_text_under_n_chars=2000)
|
||||
max_characters = dify_config.INDEXING_MAX_SEGMENTATION_TOKENS_LENGTH
|
||||
chunks = chunk_by_title(elements, max_characters=max_characters, combine_text_under_n_chars=max_characters)
|
||||
documents = []
|
||||
for chunk in chunks:
|
||||
text = chunk.text.strip()
|
||||
|
|
|
|||
|
|
@ -1,5 +1,6 @@
|
|||
import logging
|
||||
|
||||
from configs import dify_config
|
||||
from core.rag.extractor.extractor_base import BaseExtractor
|
||||
from core.rag.models.document import Document
|
||||
|
||||
|
|
@ -32,7 +33,8 @@ class UnstructuredMarkdownExtractor(BaseExtractor):
|
|||
elements = partition_md(filename=self._file_path)
|
||||
from unstructured.chunking.title import chunk_by_title
|
||||
|
||||
chunks = chunk_by_title(elements, max_characters=2000, combine_text_under_n_chars=2000)
|
||||
max_characters = dify_config.INDEXING_MAX_SEGMENTATION_TOKENS_LENGTH
|
||||
chunks = chunk_by_title(elements, max_characters=max_characters, combine_text_under_n_chars=max_characters)
|
||||
documents = []
|
||||
for chunk in chunks:
|
||||
text = chunk.text.strip()
|
||||
|
|
|
|||
|
|
@ -1,5 +1,6 @@
|
|||
import logging
|
||||
|
||||
from configs import dify_config
|
||||
from core.rag.extractor.extractor_base import BaseExtractor
|
||||
from core.rag.models.document import Document
|
||||
|
||||
|
|
@ -31,7 +32,8 @@ class UnstructuredMsgExtractor(BaseExtractor):
|
|||
elements = partition_msg(filename=self._file_path)
|
||||
from unstructured.chunking.title import chunk_by_title
|
||||
|
||||
chunks = chunk_by_title(elements, max_characters=2000, combine_text_under_n_chars=2000)
|
||||
max_characters = dify_config.INDEXING_MAX_SEGMENTATION_TOKENS_LENGTH
|
||||
chunks = chunk_by_title(elements, max_characters=max_characters, combine_text_under_n_chars=max_characters)
|
||||
documents = []
|
||||
for chunk in chunks:
|
||||
text = chunk.text.strip()
|
||||
|
|
|
|||
|
|
@ -1,5 +1,6 @@
|
|||
import logging
|
||||
|
||||
from configs import dify_config
|
||||
from core.rag.extractor.extractor_base import BaseExtractor
|
||||
from core.rag.models.document import Document
|
||||
|
||||
|
|
@ -32,7 +33,8 @@ class UnstructuredXmlExtractor(BaseExtractor):
|
|||
|
||||
from unstructured.chunking.title import chunk_by_title
|
||||
|
||||
chunks = chunk_by_title(elements, max_characters=2000, combine_text_under_n_chars=2000)
|
||||
max_characters = dify_config.INDEXING_MAX_SEGMENTATION_TOKENS_LENGTH
|
||||
chunks = chunk_by_title(elements, max_characters=max_characters, combine_text_under_n_chars=max_characters)
|
||||
documents = []
|
||||
for chunk in chunks:
|
||||
text = chunk.text.strip()
|
||||
|
|
|
|||
Loading…
Reference in New Issue