mirror of https://github.com/langgenius/dify.git
feat(qdrant): implement full-text search with multi-keyword support (#31658)
This commit is contained in:
parent
91532ef429
commit
b48a10d7ec
|
|
@ -391,46 +391,78 @@ class QdrantVector(BaseVector):
|
|||
return docs
|
||||
|
||||
def search_by_full_text(self, query: str, **kwargs: Any) -> list[Document]:
    """Return documents whose content matches any keyword of the query.

    The query is split on whitespace into keywords; duplicates are dropped
    (first occurrence wins) and only the first 10 distinct keywords are
    used. Each keyword is searched with its own scroll request and the hits
    are merged, so a document matching ANY keyword can be returned (OR
    logic). Keywords are processed in query order and the search stops as
    soon as top_k unique documents have been collected, so hits for earlier
    keywords take precedence over hits for later ones.

    Args:
        query: Search query text.
        **kwargs: Additional search parameters:
            top_k: Maximum number of documents to return (default 2).
            document_ids_filter: Optional collection of document ids; when
                truthy, a ``metadata.document_id`` MatchAny condition is
                added to every keyword search.

    Returns:
        List of up to top_k unique documents matching any query keyword.
        Empty when the query contains no keywords or top_k is not positive.
    """
    max_keywords = 10  # bounds the number of scroll requests one query can trigger

    # str.split() without a separator already trims surrounding whitespace and
    # never yields empty tokens; dict.fromkeys() de-duplicates while keeping
    # first-seen order.
    keywords = list(dict.fromkeys(query.split()))[:max_keywords]
    top_k = kwargs.get("top_k", 2)

    # Nothing to search for, or nothing may be returned: skip the import, the
    # filter construction and every network round trip. The top_k guard also
    # prevents returning one document when top_k <= 0, since the cap below is
    # only checked after an append.
    if not keywords or top_k <= 0:
        return []

    from qdrant_client.http import models

    # Conditions shared by every keyword search (AND logic).
    base_must_conditions: list = [
        models.FieldCondition(
            key="group_id",
            match=models.MatchValue(value=self._group_id),
        ),
    ]

    document_ids_filter = kwargs.get("document_ids_filter")
    if document_ids_filter:
        base_must_conditions.append(
            models.FieldCondition(
                key="metadata.document_id",
                match=models.MatchAny(any=document_ids_filter),
            )
        )

    seen_ids: set[str | int] = set()
    documents: list[Document] = []

    # One scroll per keyword; merging the per-keyword hits gives OR semantics.
    # Points are de-duplicated by id so a document matching several keywords
    # is returned only once.
    for keyword in keywords:
        scroll_filter = models.Filter(
            must=[
                *base_must_conditions,
                models.FieldCondition(
                    key="page_content",
                    match=models.MatchText(text=keyword),
                ),
            ]
        )

        response = self._client.scroll(
            collection_name=self._collection_name,
            scroll_filter=scroll_filter,
            limit=top_k,
            with_payload=True,
            with_vectors=True,
        )
        results = response[0]  # first element of the scroll response holds the points

        for result in results:
            if result and result.id not in seen_ids:
                seen_ids.add(result.id)
                document = self._document_from_scored_point(result, Field.CONTENT_KEY, Field.METADATA_KEY)
                documents.append(document)
                # Stop as soon as the cap is reached; remaining keywords are
                # not queried.
                if len(documents) >= top_k:
                    return documents

    return documents
|
||||
|
||||
|
|
|
|||
|
|
@ -1,3 +1,5 @@
|
|||
import uuid
|
||||
|
||||
from core.rag.datasource.vdb.qdrant.qdrant_vector import QdrantConfig, QdrantVector
|
||||
from core.rag.models.document import Document
|
||||
from tests.integration_tests.vdb.test_vector_store import (
|
||||
|
|
@ -18,6 +20,10 @@ class QdrantVectorTest(AbstractVectorTest):
|
|||
api_key="difyai123456",
|
||||
),
|
||||
)
|
||||
# Additional doc IDs for multi-keyword search tests
|
||||
self.doc_apple_id = ""
|
||||
self.doc_banana_id = ""
|
||||
self.doc_both_id = ""
|
||||
|
||||
def search_by_vector(self):
|
||||
super().search_by_vector()
|
||||
|
|
@ -27,6 +33,77 @@ class QdrantVectorTest(AbstractVectorTest):
|
|||
)
|
||||
assert len(hits_by_vector) == 0
|
||||
|
||||
def _create_document(self, content: str, doc_id: str) -> Document:
    """Build a Document for ``content`` whose identifiers all equal ``doc_id``.

    ``doc_id`` is stored under the ``doc_id``, ``doc_hash`` and
    ``document_id`` metadata keys; ``dataset_id`` comes from
    ``self.dataset_id``.
    """
    # The same identifier is reused for all three id-like metadata fields.
    metadata = dict.fromkeys(("doc_id", "doc_hash", "document_id"), doc_id)
    metadata["dataset_id"] = self.dataset_id
    return Document(page_content=content, metadata=metadata)
|
||||
|
||||
def setup_multi_keyword_documents(self):
    """Insert three documents covering the keyword combinations under test.

    Fresh UUID strings are generated and stored on ``self.doc_apple_id``,
    ``self.doc_banana_id`` and ``self.doc_both_id``; one document is then
    created per id (apple only, banana only, both) and all three are added
    to ``self.vector`` using ``self.example_embedding`` for each.
    """
    self.doc_apple_id, self.doc_banana_id, self.doc_both_id = (str(uuid.uuid4()) for _ in range(3))

    fixtures = (
        ("This document contains apple only", self.doc_apple_id),
        ("This document contains banana only", self.doc_banana_id),
        ("This document contains both apple and banana", self.doc_both_id),
    )
    docs = [self._create_document(text, ident) for text, ident in fixtures]

    # Every document shares the same example embedding.
    vectors = [self.example_embedding] * len(docs)
    self.vector.add_texts(documents=docs, embeddings=vectors)
|
||||
|
||||
def search_by_full_text_multi_keyword(self):
    """Check that a multi-keyword query returns docs matching ANY keyword (OR logic).

    Runs single-keyword searches first as a sanity check, then verifies the
    combined query returns all three fixture documents, that keyword order
    does not change the result set, and that no document is returned twice.
    """

    def run(text):
        # All searches in this test use the same generous top_k.
        return self.vector.search_by_full_text(query=text, top_k=10)

    def found_ids(results):
        return {item.metadata["doc_id"] for item in results}

    # Single-keyword sanity checks.
    apple_ids = found_ids(run("apple"))
    assert self.doc_apple_id in apple_ids, "Document with 'apple' should be found"
    assert self.doc_both_id in apple_ids, "Document with 'apple and banana' should be found"

    banana_ids = found_ids(run("banana"))
    assert self.doc_banana_id in banana_ids, "Document with 'banana' should be found"
    assert self.doc_both_id in banana_ids, "Document with 'apple and banana' should be found"

    # Combined query must surface every matching document.
    hits = run("apple banana")
    doc_ids = found_ids(hits)

    assert self.doc_apple_id in doc_ids, "Document with 'apple' should be found in multi-keyword search"
    assert self.doc_banana_id in doc_ids, "Document with 'banana' should be found in multi-keyword search"
    assert self.doc_both_id in doc_ids, "Document with both keywords should be found"
    # Three fixtures match: apple only, banana only, and the one with both.
    assert len(hits) == 3, f"Expected 3 documents, got {len(hits)}"

    # Swapping the keywords must yield the same set of documents.
    reversed_ids = found_ids(run("banana apple"))
    assert doc_ids == reversed_ids, "Keyword order should not affect search results"

    # As many hits as distinct ids means nothing was returned twice.
    assert len(hits) == len(doc_ids), "Search results should not contain duplicates"
|
||||
|
||||
def run_all_tests(self):
    """Run every test step in order, ending by deleting the vector collection.

    The multi-keyword fixtures are inserted and exercised after the generic
    checks; the final ``delete_vector`` step is the cleanup.
    """
    step_names = (
        "create_vector",
        "search_by_vector",
        "search_by_full_text",
        "text_exists",
        "get_ids_by_metadata_field",
        # Multi-keyword search tests
        "setup_multi_keyword_documents",
        "search_by_full_text_multi_keyword",
        # Cleanup - delete_vector() removes the entire collection
        "delete_vector",
    )
    for step_name in step_names:
        # Resolve each step right before calling it, as a direct call would.
        getattr(self, step_name)()
|
||||
|
||||
|
||||
def test_qdrant_vector(setup_mock_redis):
    """Run the full Qdrant vector-store test sequence.

    ``setup_mock_redis`` is requested by name only and is not used in the
    body; presumably it is a fixture that stubs Redis -- confirm against the
    test suite's conftest.
    """
    suite = QdrantVectorTest()
    suite.run_all_tests()
|
||||
|
|
|
|||
Loading…
Reference in New Issue