diff --git a/api/core/rag/datasource/retrieval_service.py b/api/core/rag/datasource/retrieval_service.py index f978e072f3..fa242a730e 100644 --- a/api/core/rag/datasource/retrieval_service.py +++ b/api/core/rag/datasource/retrieval_service.py @@ -195,6 +195,23 @@ class RetrievalService: ) return all_documents + @classmethod + def _filter_documents_by_vector_score_threshold( + cls, documents: list[Document], score_threshold: float | None + ) -> list[Document]: + """Keep documents whose stored retrieval score meets the threshold. + + Used when hybrid search skips early vector thresholding but no rerank + runner applies a threshold afterward (same rule as ``calculate_vector_score``). + """ + if score_threshold is None: + return documents + return [ + document + for document in documents + if document.metadata and document.metadata.get("score", 0) >= score_threshold + ] + @classmethod def _deduplicate_documents(cls, documents: list[Document]) -> list[Document]: """Deduplicate documents in O(n) while preserving first-seen order. @@ -294,13 +311,20 @@ class RetrievalService: vector = Vector(dataset=dataset) documents = [] + # Hybrid search merges keyword / full-text / vector hits and then reranks + # (weighted fusion or reranking model). Applying the user score threshold at + # vector retrieval time uses embedding similarity, which is not comparable to + # reranked or fused scores and incorrectly drops high-quality chunks (#35233). + embedding_score_threshold = ( + 0.0 if retrieval_method == RetrievalMethod.HYBRID_SEARCH else score_threshold + ) if query_type == QueryType.TEXT_QUERY: documents.extend( vector.search_by_vector( query, search_type="similarity_score_threshold", top_k=top_k, - score_threshold=score_threshold, + score_threshold=embedding_score_threshold, filter={"group_id": [dataset.id]}, document_ids_filter=document_ids_filter, ) @@ -312,7 +336,7 @@ class RetrievalService: vector.search_by_file( file_id=query, top_k=top_k, - score_threshold=score_threshold, + score_threshold=embedding_score_threshold, filter={"group_id": [dataset.id]}, document_ids_filter=document_ids_filter, ) @@ -844,6 +868,10 @@ class RetrievalService: top_n=top_k, query_type=QueryType.TEXT_QUERY if query else QueryType.IMAGE_QUERY, ) + if not data_post_processor.rerank_runner and score_threshold: + all_documents_item = self._filter_documents_by_vector_score_threshold( + all_documents_item, score_threshold + ) all_documents.extend(all_documents_item)