From b48a10d7ec4bbb1a9f9c59a9f5a072df17393e9e Mon Sep 17 00:00:00 2001 From: eux Date: Thu, 29 Jan 2026 11:12:18 +0800 Subject: [PATCH] feat(qdrant): implement full-text search with multi-keyword support (#31658) --- .../datasource/vdb/qdrant/qdrant_vector.py | 98 ++++++++++++------- .../vdb/qdrant/test_qdrant.py | 77 +++++++++++++++ 2 files changed, 142 insertions(+), 33 deletions(-) diff --git a/api/core/rag/datasource/vdb/qdrant/qdrant_vector.py b/api/core/rag/datasource/vdb/qdrant/qdrant_vector.py index f8c62b908a..4a4a458f2e 100644 --- a/api/core/rag/datasource/vdb/qdrant/qdrant_vector.py +++ b/api/core/rag/datasource/vdb/qdrant/qdrant_vector.py @@ -391,46 +391,78 @@ class QdrantVector(BaseVector): return docs def search_by_full_text(self, query: str, **kwargs: Any) -> list[Document]: - """Return docs most similar by bm25. + """Return docs most similar by full-text search. + + Searches each keyword separately and merges results to ensure documents + matching ANY keyword are returned (OR logic). Results are capped at top_k, filled in keyword order. + + Args: + query: Search query text. Multi-word queries are split into keywords, + with each keyword searched separately. Limited to 10 keywords. + **kwargs: Additional search parameters (top_k, document_ids_filter) + Returns: - List of documents most similar to the query text and distance for each. + List of up to top_k unique documents matching any query keyword. 
""" from qdrant_client.http import models - scroll_filter = models.Filter( - must=[ - models.FieldCondition( - key="group_id", - match=models.MatchValue(value=self._group_id), - ), - models.FieldCondition( - key="page_content", - match=models.MatchText(text=query), - ), - ] - ) + # Build base must conditions (AND logic) for metadata filters + base_must_conditions: list = [ + models.FieldCondition( + key="group_id", + match=models.MatchValue(value=self._group_id), + ), + ] + document_ids_filter = kwargs.get("document_ids_filter") if document_ids_filter: - if scroll_filter.must: - scroll_filter.must.append( - models.FieldCondition( - key="metadata.document_id", - match=models.MatchAny(any=document_ids_filter), - ) + base_must_conditions.append( + models.FieldCondition( + key="metadata.document_id", + match=models.MatchAny(any=document_ids_filter), ) - response = self._client.scroll( - collection_name=self._collection_name, - scroll_filter=scroll_filter, - limit=kwargs.get("top_k", 2), - with_payload=True, - with_vectors=True, - ) - results = response[0] - documents = [] - for result in results: - if result: - document = self._document_from_scored_point(result, Field.CONTENT_KEY, Field.METADATA_KEY) - documents.append(document) + ) + + # Split query into keywords, deduplicate and limit to prevent DoS + keywords = list(dict.fromkeys(kw.strip() for kw in query.strip().split() if kw.strip()))[:10] + + if not keywords: + return [] + + top_k = kwargs.get("top_k", 2) + seen_ids: set[str | int] = set() + documents: list[Document] = [] + + # Search each keyword separately and merge unique results in keyword order. + # Earlier keywords take precedence: the loop returns as soon as top_k unique + # documents are collected, so later keywords may add nothing when top_k is small. 
+ for keyword in keywords: + scroll_filter = models.Filter( + must=[ + *base_must_conditions, + models.FieldCondition( + key="page_content", + match=models.MatchText(text=keyword), + ), + ] + ) + + response = self._client.scroll( + collection_name=self._collection_name, + scroll_filter=scroll_filter, + limit=top_k, + with_payload=True, + with_vectors=True, + ) + results = response[0] + + for result in results: + if result and result.id not in seen_ids: + seen_ids.add(result.id) + document = self._document_from_scored_point(result, Field.CONTENT_KEY, Field.METADATA_KEY) + documents.append(document) + if len(documents) >= top_k: + return documents return documents diff --git a/api/tests/integration_tests/vdb/qdrant/test_qdrant.py b/api/tests/integration_tests/vdb/qdrant/test_qdrant.py index fe0e03f7b8..a2bf10001a 100644 --- a/api/tests/integration_tests/vdb/qdrant/test_qdrant.py +++ b/api/tests/integration_tests/vdb/qdrant/test_qdrant.py @@ -1,3 +1,5 @@ +import uuid + from core.rag.datasource.vdb.qdrant.qdrant_vector import QdrantConfig, QdrantVector from core.rag.models.document import Document from tests.integration_tests.vdb.test_vector_store import ( @@ -18,6 +20,10 @@ class QdrantVectorTest(AbstractVectorTest): api_key="difyai123456", ), ) + # Additional doc IDs for multi-keyword search tests + self.doc_apple_id = "" + self.doc_banana_id = "" + self.doc_both_id = "" def search_by_vector(self): super().search_by_vector() @@ -27,6 +33,77 @@ class QdrantVectorTest(AbstractVectorTest): ) assert len(hits_by_vector) == 0 + def _create_document(self, content: str, doc_id: str) -> Document: + """Create a document with the given content and doc_id.""" + return Document( + page_content=content, + metadata={ + "doc_id": doc_id, + "doc_hash": doc_id, + "document_id": doc_id, + "dataset_id": self.dataset_id, + }, + ) + + def setup_multi_keyword_documents(self): + """Create test documents with different keyword combinations for multi-keyword search tests.""" + 
self.doc_apple_id = str(uuid.uuid4()) + self.doc_banana_id = str(uuid.uuid4()) + self.doc_both_id = str(uuid.uuid4()) + + documents = [ + self._create_document("This document contains apple only", self.doc_apple_id), + self._create_document("This document contains banana only", self.doc_banana_id), + self._create_document("This document contains both apple and banana", self.doc_both_id), + ] + embeddings = [self.example_embedding] * len(documents) + + self.vector.add_texts(documents=documents, embeddings=embeddings) + + def search_by_full_text_multi_keyword(self): + """Test multi-keyword search returns docs matching ANY keyword (OR logic).""" + # First verify single keyword searches work correctly + hits_apple = self.vector.search_by_full_text(query="apple", top_k=10) + apple_ids = {doc.metadata["doc_id"] for doc in hits_apple} + assert self.doc_apple_id in apple_ids, "Document with 'apple' should be found" + assert self.doc_both_id in apple_ids, "Document with 'apple and banana' should be found" + + hits_banana = self.vector.search_by_full_text(query="banana", top_k=10) + banana_ids = {doc.metadata["doc_id"] for doc in hits_banana} + assert self.doc_banana_id in banana_ids, "Document with 'banana' should be found" + assert self.doc_both_id in banana_ids, "Document with 'apple and banana' should be found" + + # Test multi-keyword search returns all matching documents + hits = self.vector.search_by_full_text(query="apple banana", top_k=10) + doc_ids = {doc.metadata["doc_id"] for doc in hits} + + assert self.doc_apple_id in doc_ids, "Document with 'apple' should be found in multi-keyword search" + assert self.doc_banana_id in doc_ids, "Document with 'banana' should be found in multi-keyword search" + assert self.doc_both_id in doc_ids, "Document with both keywords should be found" + # Expect 3 results: doc_apple (apple only), doc_banana (banana only), doc_both (contains both) + assert len(hits) == 3, f"Expected 3 documents, got {len(hits)}" + + # Test keyword order 
independence + hits_ba = self.vector.search_by_full_text(query="banana apple", top_k=10) + ids_ba = {doc.metadata["doc_id"] for doc in hits_ba} + assert doc_ids == ids_ba, "Keyword order should not affect search results" + + # Test no duplicates in results + doc_id_list = [doc.metadata["doc_id"] for doc in hits] + assert len(doc_id_list) == len(set(doc_id_list)), "Search results should not contain duplicates" + + def run_all_tests(self): + self.create_vector() + self.search_by_vector() + self.search_by_full_text() + self.text_exists() + self.get_ids_by_metadata_field() + # Multi-keyword search tests + self.setup_multi_keyword_documents() + self.search_by_full_text_multi_keyword() + # Cleanup - delete_vector() removes the entire collection + self.delete_vector() + def test_qdrant_vector(setup_mock_redis): QdrantVectorTest().run_all_tests()