fix: apply score threshold after reranking in hybrid search (#35263)

2026-05-13 08:57:28 +08:00 · 2026-04-16 11:51:11 +05:30 · 2026-04-16 11:51:11 +05:30 · 54e51be665
commit 54e51be665
parent 0fea760143
1 changed files with 30 additions and 2 deletions
--- a/api/core/rag/datasource/retrieval_service.py
+++ b/api/core/rag/datasource/retrieval_service.py
@ -195,6 +195,23 @@ class RetrievalService:
        )
        return all_documents

+    @classmethod
+    def _filter_documents_by_vector_score_threshold(
+        cls, documents: list[Document], score_threshold: float | None
+    ) -> list[Document]:
+        """Keep documents whose stored retrieval score meets the threshold.
+
+        Used when hybrid search skips early vector thresholding but no rerank
+        runner applies a threshold afterward (same rule as ``calculate_vector_score``).
+        """
+        if score_threshold is None:
+            return documents
+        return [
+            document
+            for document in documents
+            if document.metadata and document.metadata.get("score", 0) >= score_threshold
+        ]
+
    @classmethod
    def _deduplicate_documents(cls, documents: list[Document]) -> list[Document]:
        """Deduplicate documents in O(n) while preserving first-seen order.
@ -294,13 +311,20 @@ class RetrievalService:

                vector = Vector(dataset=dataset)
                documents = []
+                # Hybrid search merges keyword / full-text / vector hits and then reranks
+                # (weighted fusion or reranking model). Applying the user score threshold at
+                # vector retrieval time uses embedding similarity, which is not comparable to
+                # reranked or fused scores and incorrectly drops high-quality chunks (#35233).
+                embedding_score_threshold = (
+                    0.0 if retrieval_method == RetrievalMethod.HYBRID_SEARCH else score_threshold
+                )
                if query_type == QueryType.TEXT_QUERY:
                    documents.extend(
                        vector.search_by_vector(
                            query,
                            search_type="similarity_score_threshold",
                            top_k=top_k,
-                            score_threshold=score_threshold,
+                            score_threshold=embedding_score_threshold,
                            filter={"group_id": [dataset.id]},
                            document_ids_filter=document_ids_filter,
                        )
@ -312,7 +336,7 @@ class RetrievalService:
                        vector.search_by_file(
                            file_id=query,
                            top_k=top_k,
-                            score_threshold=score_threshold,
+                            score_threshold=embedding_score_threshold,
                            filter={"group_id": [dataset.id]},
                            document_ids_filter=document_ids_filter,
                        )
@ -844,6 +868,10 @@ class RetrievalService:
                    top_n=top_k,
                    query_type=QueryType.TEXT_QUERY if query else QueryType.IMAGE_QUERY,
                )
+                if not data_post_processor.rerank_runner and score_threshold:
+                    all_documents_item = self._filter_documents_by_vector_score_threshold(
+                        all_documents_item, score_threshold
+                    )

            all_documents.extend(all_documents_item)