mirror of
https://github.com/langgenius/dify.git
synced 2026-04-25 17:47:30 +08:00
fix: apply score threshold after reranking in hybrid search (#35263)
This commit is contained in:
parent
0fea760143
commit
54e51be665
@ -195,6 +195,23 @@ class RetrievalService:
|
|||||||
)
|
)
|
||||||
return all_documents
|
return all_documents
|
||||||
|
|
||||||
|
@classmethod
|
||||||
|
def _filter_documents_by_vector_score_threshold(
|
||||||
|
cls, documents: list[Document], score_threshold: float | None
|
||||||
|
) -> list[Document]:
|
||||||
|
"""Keep documents whose stored retrieval score meets the threshold.
|
||||||
|
|
||||||
|
Used when hybrid search skips early vector thresholding but no rerank
|
||||||
|
runner applies a threshold afterward (same rule as ``calculate_vector_score``).
|
||||||
|
"""
|
||||||
|
if score_threshold is None:
|
||||||
|
return documents
|
||||||
|
return [
|
||||||
|
document
|
||||||
|
for document in documents
|
||||||
|
if document.metadata and document.metadata.get("score", 0) >= score_threshold
|
||||||
|
]
|
||||||
|
|
||||||
@classmethod
|
@classmethod
|
||||||
def _deduplicate_documents(cls, documents: list[Document]) -> list[Document]:
|
def _deduplicate_documents(cls, documents: list[Document]) -> list[Document]:
|
||||||
"""Deduplicate documents in O(n) while preserving first-seen order.
|
"""Deduplicate documents in O(n) while preserving first-seen order.
|
||||||
@ -294,13 +311,20 @@ class RetrievalService:
|
|||||||
|
|
||||||
vector = Vector(dataset=dataset)
|
vector = Vector(dataset=dataset)
|
||||||
documents = []
|
documents = []
|
||||||
|
# Hybrid search merges keyword / full-text / vector hits and then reranks
|
||||||
|
# (weighted fusion or reranking model). Applying the user score threshold at
|
||||||
|
# vector retrieval time uses embedding similarity, which is not comparable to
|
||||||
|
# reranked or fused scores and incorrectly drops high-quality chunks (#35233).
|
||||||
|
embedding_score_threshold = (
|
||||||
|
0.0 if retrieval_method == RetrievalMethod.HYBRID_SEARCH else score_threshold
|
||||||
|
)
|
||||||
if query_type == QueryType.TEXT_QUERY:
|
if query_type == QueryType.TEXT_QUERY:
|
||||||
documents.extend(
|
documents.extend(
|
||||||
vector.search_by_vector(
|
vector.search_by_vector(
|
||||||
query,
|
query,
|
||||||
search_type="similarity_score_threshold",
|
search_type="similarity_score_threshold",
|
||||||
top_k=top_k,
|
top_k=top_k,
|
||||||
score_threshold=score_threshold,
|
score_threshold=embedding_score_threshold,
|
||||||
filter={"group_id": [dataset.id]},
|
filter={"group_id": [dataset.id]},
|
||||||
document_ids_filter=document_ids_filter,
|
document_ids_filter=document_ids_filter,
|
||||||
)
|
)
|
||||||
@ -312,7 +336,7 @@ class RetrievalService:
|
|||||||
vector.search_by_file(
|
vector.search_by_file(
|
||||||
file_id=query,
|
file_id=query,
|
||||||
top_k=top_k,
|
top_k=top_k,
|
||||||
score_threshold=score_threshold,
|
score_threshold=embedding_score_threshold,
|
||||||
filter={"group_id": [dataset.id]},
|
filter={"group_id": [dataset.id]},
|
||||||
document_ids_filter=document_ids_filter,
|
document_ids_filter=document_ids_filter,
|
||||||
)
|
)
|
||||||
@ -844,6 +868,10 @@ class RetrievalService:
|
|||||||
top_n=top_k,
|
top_n=top_k,
|
||||||
query_type=QueryType.TEXT_QUERY if query else QueryType.IMAGE_QUERY,
|
query_type=QueryType.TEXT_QUERY if query else QueryType.IMAGE_QUERY,
|
||||||
)
|
)
|
||||||
|
if not data_post_processor.rerank_runner and score_threshold:
|
||||||
|
all_documents_item = self._filter_documents_by_vector_score_threshold(
|
||||||
|
all_documents_item, score_threshold
|
||||||
|
)
|
||||||
|
|
||||||
all_documents.extend(all_documents_item)
|
all_documents.extend(all_documents_item)
|
||||||
|
|
||||||
|
|||||||
Loading…
Reference in New Issue
Block a user