fix: apply score threshold after reranking in hybrid search (#35263)

This commit is contained in:
Ayush Baluni 2026-04-16 11:51:11 +05:30 committed by GitHub
parent 0fea760143
commit 54e51be665
No known key found for this signature in database
GPG Key ID: B5690EEEBB952194

View File

@ -195,6 +195,23 @@ class RetrievalService:
)
return all_documents
@classmethod
def _filter_documents_by_vector_score_threshold(
cls, documents: list[Document], score_threshold: float | None
) -> list[Document]:
"""Keep documents whose stored retrieval score meets the threshold.
Used when hybrid search skips early vector thresholding but no rerank
runner applies a threshold afterward (same rule as ``calculate_vector_score``).
"""
if score_threshold is None:
return documents
return [
document
for document in documents
if document.metadata and document.metadata.get("score", 0) >= score_threshold
]
@classmethod
def _deduplicate_documents(cls, documents: list[Document]) -> list[Document]:
"""Deduplicate documents in O(n) while preserving first-seen order.
@ -294,13 +311,20 @@ class RetrievalService:
vector = Vector(dataset=dataset)
documents = []
# Hybrid search merges keyword / full-text / vector hits and then reranks
# (weighted fusion or reranking model). Applying the user score threshold at
# vector retrieval time uses embedding similarity, which is not comparable to
# reranked or fused scores and incorrectly drops high-quality chunks (#35233).
embedding_score_threshold = (
0.0 if retrieval_method == RetrievalMethod.HYBRID_SEARCH else score_threshold
)
if query_type == QueryType.TEXT_QUERY:
documents.extend(
vector.search_by_vector(
query,
search_type="similarity_score_threshold",
top_k=top_k,
score_threshold=score_threshold,
score_threshold=embedding_score_threshold,
filter={"group_id": [dataset.id]},
document_ids_filter=document_ids_filter,
)
@ -312,7 +336,7 @@ class RetrievalService:
vector.search_by_file(
file_id=query,
top_k=top_k,
score_threshold=score_threshold,
score_threshold=embedding_score_threshold,
filter={"group_id": [dataset.id]},
document_ids_filter=document_ids_filter,
)
@ -844,6 +868,10 @@ class RetrievalService:
top_n=top_k,
query_type=QueryType.TEXT_QUERY if query else QueryType.IMAGE_QUERY,
)
if not data_post_processor.rerank_runner and score_threshold:
all_documents_item = self._filter_documents_by_vector_score_threshold(
all_documents_item, score_threshold
)
all_documents.extend(all_documents_item)