mirror of
https://github.com/langgenius/dify.git
synced 2026-05-06 10:06:51 +08:00
fix: skip empty documents before vector embedding (#35763)
Co-authored-by: Asuka Minato <i@asukaminato.eu.org> Co-authored-by: autofix-ci[bot] <114827586+autofix-ci[bot]@users.noreply.github.com>
This commit is contained in:
parent
81090effe2
commit
4b7dc17546
@ -144,8 +144,20 @@ class Vector:
|
||||
def get_vector_factory(vector_type: str) -> type[AbstractVectorFactory]:
|
||||
return get_vector_factory_class(vector_type)
|
||||
|
||||
@staticmethod
def _filter_empty_text_documents(documents: list[Document]) -> list[Document]:
    """Return the documents whose ``page_content`` is not blank.

    A document is dropped when its ``page_content`` is empty or consists only
    of whitespace. The input list is left untouched; a new list is returned.
    When at least one document is dropped, a single warning carrying the
    number of dropped documents is logged.
    """
    kept = []
    for doc in documents:
        # ``strip()`` yields "" (falsy) for empty and whitespace-only content.
        if doc.page_content.strip():
            kept.append(doc)

    dropped = len(documents) - len(kept)
    if dropped > 0:
        logger.warning("skip %d empty documents before vector embedding", dropped)
    return kept
|
||||
|
||||
def create(self, texts: list | None = None, **kwargs):
|
||||
if texts:
|
||||
texts = self._filter_empty_text_documents(texts)
|
||||
if not texts:
|
||||
return
|
||||
|
||||
start = time.time()
|
||||
logger.info("start embedding %s texts %s", len(texts), start)
|
||||
batch_size = 1000
|
||||
@ -203,8 +215,14 @@ class Vector:
|
||||
logger.info("Embedding %s files took %s s", len(file_documents), time.time() - start)
|
||||
|
||||
def add_texts(self, documents: list[Document], **kwargs):
    """Embed ``documents`` and hand them to the vector processor.

    Blank documents are removed first. If the ``duplicate_check`` keyword is
    truthy, the remaining documents are additionally passed through
    ``_filter_duplicate_texts``. Whenever either step leaves nothing, the
    method returns without embedding or storing anything.

    All keyword arguments (including ``duplicate_check`` itself) are forwarded
    unchanged to ``self._vector_processor.create``.
    """
    pending = self._filter_empty_text_documents(documents)
    if not pending:
        return

    wants_dedup = kwargs.get("duplicate_check", False)
    if wants_dedup:
        pending = self._filter_duplicate_texts(pending)
        if not pending:
            return

    contents = [doc.page_content for doc in pending]
    vectors = self._embeddings.embed_documents(contents)
    self._vector_processor.create(texts=pending, embeddings=vectors, **kwargs)
|
||||
|
||||
@ -316,6 +316,33 @@ def test_create_batches_texts_and_skips_empty_input(vector_factory_module):
|
||||
vector._vector_processor.create.assert_not_called()
|
||||
|
||||
|
||||
def test_create_skips_empty_text_documents_before_embedding(vector_factory_module):
    """``Vector.create`` must drop empty/whitespace-only documents before embedding."""
    vector_cls = vector_factory_module.Vector
    vector = vector_cls.__new__(vector_cls)  # bypass __init__

    embedder = MagicMock()
    embedder.embed_documents.return_value = [[0.1], [0.2]]
    processor = MagicMock()
    vector._embeddings = embedder
    vector._vector_processor = processor

    foo_doc = Document(page_content="foo", metadata={"doc_id": "id-1"})
    empty_doc = Document(page_content="", metadata={"doc_id": "id-empty"})
    blank_doc = Document(page_content=" \n", metadata={"doc_id": "id-blank"})
    bar_doc = Document(page_content="bar", metadata={"doc_id": "id-2"})

    # Mixed input: only the two documents with real content are embedded and stored.
    vector.create(texts=[foo_doc, empty_doc, blank_doc, bar_doc], request_id="r-1")

    embedder.embed_documents.assert_called_once_with(["foo", "bar"])
    processor.create.assert_called_once_with(
        texts=[foo_doc, bar_doc], embeddings=[[0.1], [0.2]], request_id="r-1"
    )

    # Input made up solely of blank documents: nothing is embedded or stored.
    embedder.embed_documents.reset_mock()
    processor.create.reset_mock()

    vector.create(texts=[empty_doc, blank_doc])

    embedder.embed_documents.assert_not_called()
    processor.create.assert_not_called()
|
||||
|
||||
|
||||
def test_create_multimodal_filters_missing_uploads(vector_factory_module, monkeypatch):
|
||||
class _Field:
|
||||
def in_(self, value):
|
||||
@ -396,6 +423,48 @@ def test_add_texts_with_optional_duplicate_check(vector_factory_module):
|
||||
vector._vector_processor.create.assert_called_once()
|
||||
|
||||
|
||||
def test_add_texts_skips_empty_text_documents(vector_factory_module):
    """``Vector.add_texts`` must ignore documents with empty content."""
    vector_cls = vector_factory_module.Vector
    vector = vector_cls.__new__(vector_cls)  # bypass __init__

    embedder = MagicMock()
    embedder.embed_documents.return_value = [[0.1]]
    processor = MagicMock()
    vector._embeddings = embedder
    vector._vector_processor = processor

    keep_doc = Document(page_content="keep", metadata={"doc_id": "id-1"})
    empty_doc = Document(page_content="", metadata={"doc_id": "id-empty"})

    # One real document plus one empty one: only the real one goes through.
    vector.add_texts([keep_doc, empty_doc], source="api")

    embedder.embed_documents.assert_called_once_with(["keep"])
    processor.create.assert_called_once_with(texts=[keep_doc], embeddings=[[0.1]], source="api")

    # Only the empty document: no embedding and no storage call at all.
    embedder.embed_documents.reset_mock()
    processor.create.reset_mock()

    vector.add_texts([empty_doc])

    embedder.embed_documents.assert_not_called()
    processor.create.assert_not_called()
|
||||
|
||||
|
||||
def test_add_texts_filters_empty_documents_before_duplicate_check(vector_factory_module):
    """The duplicate check must only see documents that survived the blank filter."""
    vector_cls = vector_factory_module.Vector
    vector = vector_cls.__new__(vector_cls)  # bypass __init__

    embedder = MagicMock()
    embedder.embed_documents.return_value = [[0.1]]
    processor = MagicMock()
    # Duplicate filter reports everything as already present.
    dedup_filter = MagicMock(return_value=[])
    vector._embeddings = embedder
    vector._vector_processor = processor
    vector._filter_duplicate_texts = dedup_filter

    keep_doc = Document(page_content="keep", metadata={"doc_id": "id-1"})
    blank_doc = Document(page_content=" ", metadata={"doc_id": "id-empty"})

    vector.add_texts([keep_doc, blank_doc], duplicate_check=True)

    # The blank document was removed before the duplicate filter ran ...
    dedup_filter.assert_called_once_with([keep_doc])
    # ... and since that filter returned nothing, no embedding/storage happened.
    embedder.embed_documents.assert_not_called()
    processor.create.assert_not_called()
|
||||
|
||||
|
||||
def test_vector_delegation_methods(vector_factory_module):
|
||||
vector = vector_factory_module.Vector.__new__(vector_factory_module.Vector)
|
||||
vector._embeddings = MagicMock()
|
||||
|
||||
Loading…
Reference in New Issue
Block a user