commit 526f36e8c7
wangxiaolei, 2025-12-29 15:03:22 +08:00, committed by GitHub
3 changed files with 102 additions and 37 deletions

File: core/rag/datasource/retrieval_service.py

@@ -1,4 +1,5 @@
+import concurrent.futures
 import logging
 from concurrent.futures import ThreadPoolExecutor
 from typing import Any
@@ -36,6 +37,8 @@ default_retrieval_model = {
     "score_threshold_enabled": False,
 }
 
+logger = logging.getLogger(__name__)
+
 
 class RetrievalService:
     # Cache precompiled regular expressions to avoid repeated compilation
@@ -106,7 +109,12 @@ class RetrievalService:
                     )
                 )
-            concurrent.futures.wait(futures, timeout=3600, return_when=concurrent.futures.ALL_COMPLETED)
+            if futures:
+                for future in concurrent.futures.as_completed(futures, timeout=3600):
+                    if exceptions:
+                        for f in futures:
+                            f.cancel()
+                        break
 
         if exceptions:
             raise ValueError(";\n".join(exceptions))
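
A note on the semantics this swap changes: concurrent.futures.wait reports an expired timeout by returning a non-empty not_done set, whereas as_completed raises TimeoutError from its iterator, so the 3600-second budget now surfaces as an exception instead of a silently partial result. A minimal sketch of the difference (the deliberately slow dummy task is hypothetical, not code from this commit):

```python
import concurrent.futures
import time
from concurrent.futures import ThreadPoolExecutor

with ThreadPoolExecutor(max_workers=1) as executor:
    futures = [executor.submit(time.sleep, 2)]

    # wait(): an expired timeout is reported via the not_done set, no exception.
    done, not_done = concurrent.futures.wait(futures, timeout=0.1)
    print(len(done), len(not_done))  # 0 1

    # as_completed(): an expired timeout raises TimeoutError from the iterator.
    try:
        for future in concurrent.futures.as_completed(futures, timeout=0.1):
            pass
    except concurrent.futures.TimeoutError:
        print("as_completed raised TimeoutError")
```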
@@ -210,6 +218,7 @@ class RetrievalService:
             )
             all_documents.extend(documents)
         except Exception as e:
+            logger.error(e, exc_info=True)
             exceptions.append(str(e))
 
     @classmethod
@@ -303,6 +312,7 @@ class RetrievalService:
             else:
                 all_documents.extend(documents)
         except Exception as e:
+            logger.error(e, exc_info=True)
             exceptions.append(str(e))
 
     @classmethod
@@ -351,6 +361,7 @@ class RetrievalService:
             else:
                 all_documents.extend(documents)
         except Exception as e:
+            logger.error(e, exc_info=True)
             exceptions.append(str(e))
 
     @staticmethod
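
The three logging hunks above apply the same fix: the exception is logged with its traceback before being flattened to a string, so the stack trace is preserved even though only str(e) reaches the aggregated ValueError. A small sketch of the pattern (run_step is a hypothetical stand-in for the retrieval helpers):

```python
import logging

logger = logging.getLogger(__name__)

def run_step(step, exceptions: list[str]) -> None:
    """Hypothetical helper mirroring the pattern in the hunks above."""
    try:
        step()
    except Exception as e:
        # exc_info=True attaches the full traceback to the log record;
        # inside an except block this is equivalent to logger.exception(e).
        logger.error(e, exc_info=True)
        exceptions.append(str(e))

exceptions: list[str] = []
run_step(lambda: 1 / 0, exceptions)
if exceptions:
    raise ValueError(";\n".join(exceptions))
```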
@@ -663,7 +674,14 @@ class RetrievalService:
                         document_ids_filter=document_ids_filter,
                     )
                 )
-            concurrent.futures.wait(futures, timeout=300, return_when=concurrent.futures.ALL_COMPLETED)
+            # Use as_completed for early error propagation - cancel remaining futures on first error
+            if futures:
+                for future in concurrent.futures.as_completed(futures, timeout=300):
+                    if future.exception():
+                        # Cancel remaining futures to avoid unnecessary waiting
+                        for f in futures:
+                            f.cancel()
+                        break
 
         if exceptions:
             raise ValueError(";\n".join(exceptions))
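
Both executor loops now follow the same fail-fast shape: iterate as_completed, detect the first failure (via the shared exceptions list in the first hunk, via future.exception() here), cancel what has not started, and break. A self-contained sketch of the pattern (the task function is hypothetical); note that cancel() only prevents queued futures from starting, it cannot interrupt tasks already running:

```python
import concurrent.futures
import time
from concurrent.futures import ThreadPoolExecutor

def task(i: int) -> int:
    """Hypothetical task: task 1 fails almost immediately."""
    if i == 1:
        raise RuntimeError("search backend unavailable")
    time.sleep(5)
    return i

exceptions: list[str] = []
with ThreadPoolExecutor(max_workers=2) as executor:
    futures = [executor.submit(task, i) for i in range(4)]
    if futures:
        # as_completed yields each future as soon as it finishes, so the
        # failure surfaces now instead of after the slowest task.
        for future in concurrent.futures.as_completed(futures, timeout=30):
            if future.exception():
                exceptions.append(str(future.exception()))
                # cancel() stops the still-queued futures (tasks 2 and 3 here);
                # the running task 0 continues to completion.
                for f in futures:
                    f.cancel()
                break

if exceptions:
    raise ValueError(";\n".join(exceptions))
```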

File: core/rag/retrieval/dataset_retrieval.py

@@ -516,6 +516,9 @@ class DatasetRetrieval:
             ].embedding_model_provider
             weights["vector_setting"]["embedding_model_name"] = available_datasets[0].embedding_model
 
         with measure_time() as timer:
+            cancel_event = threading.Event()
+            thread_exceptions: list[Exception] = []
+
             if query:
                 query_thread = threading.Thread(
                     target=self._multiple_retrieve_thread,
@@ -534,6 +537,8 @@ class DatasetRetrieval:
                         "score_threshold": score_threshold,
                         "query": query,
                         "attachment_id": None,
+                        "cancel_event": cancel_event,
+                        "thread_exceptions": thread_exceptions,
                     },
                 )
                 all_threads.append(query_thread)
@@ -557,12 +562,25 @@ class DatasetRetrieval:
                         "score_threshold": score_threshold,
                         "query": None,
                         "attachment_id": attachment_id,
+                        "cancel_event": cancel_event,
+                        "thread_exceptions": thread_exceptions,
                     },
                 )
                 all_threads.append(attachment_thread)
                 attachment_thread.start()
-            for thread in all_threads:
-                thread.join()
+            # Poll threads with short timeout to detect errors quickly (fail-fast)
+            while any(t.is_alive() for t in all_threads):
+                for thread in all_threads:
+                    thread.join(timeout=0.1)
+                    if thread_exceptions:
+                        cancel_event.set()
+                        break
+                if thread_exceptions:
+                    break
+
+            if thread_exceptions:
+                raise thread_exceptions[0]
 
         self._on_query(query, attachment_ids, dataset_ids, app_id, user_from, user_id)
 
         if all_documents:
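
This is the coordinator side of the new fail-fast behavior: instead of blocking in thread.join() until every retrieval thread finishes, the loop joins with a 0.1-second timeout, so an error recorded in thread_exceptions is noticed almost immediately, cancel_event is set, and the first exception is re-raised. A minimal standalone sketch (worker bodies are hypothetical; real workers would also poll cancel_event, as the later hunks show):

```python
import threading
import time

thread_exceptions: list[Exception] = []
cancel_event = threading.Event()

def worker(name: str, delay: float) -> None:
    """Hypothetical worker; the fastest one records a failure."""
    time.sleep(delay)
    if delay < 0.2:
        thread_exceptions.append(RuntimeError(f"{name} failed"))

threads = [
    threading.Thread(target=worker, args=(f"t{i}", 0.1 * (i + 1))) for i in range(3)
]
for t in threads:
    t.start()

# Poll with a short join timeout instead of blocking on each thread in turn;
# a recorded exception is noticed within roughly 0.1 s.
while any(t.is_alive() for t in threads):
    for t in threads:
        t.join(timeout=0.1)
        if thread_exceptions:
            cancel_event.set()  # ask surviving workers to stop early
            break
    if thread_exceptions:
        break

if thread_exceptions:
    raise thread_exceptions[0]
```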
@@ -1404,12 +1422,18 @@ class DatasetRetrieval:
         score_threshold: float,
         query: str | None,
         attachment_id: str | None,
+        cancel_event: threading.Event | None = None,
+        thread_exceptions: list[Exception] | None = None,
     ):
+        try:
             with flask_app.app_context():
                 threads = []
                 all_documents_item: list[Document] = []
                 index_type = None
                 for dataset in available_datasets:
+                    # Check for cancellation signal
+                    if cancel_event and cancel_event.is_set():
+                        break
                     index_type = dataset.indexing_technique
                     document_ids_filter = None
                     if dataset.provider != "external":
@@ -1436,8 +1460,15 @@ class DatasetRetrieval:
                     )
                     threads.append(retrieval_thread)
                     retrieval_thread.start()
-                for thread in threads:
-                    thread.join()
+                # Poll threads with short timeout to respond quickly to cancellation
+                while any(t.is_alive() for t in threads):
+                    for thread in threads:
+                        thread.join(timeout=0.1)
+                        if cancel_event and cancel_event.is_set():
+                            break
+                    if cancel_event and cancel_event.is_set():
+                        break
 
                 if reranking_enable:
                     # do rerank for searched documents
@@ -1470,3 +1501,8 @@ class DatasetRetrieval:
                     all_documents_item = all_documents_item[:top_k] if top_k else all_documents_item
                 if all_documents_item:
                     all_documents.extend(all_documents_item)
+        except Exception as e:
+            if cancel_event:
+                cancel_event.set()
+            if thread_exceptions is not None:
+                thread_exceptions.append(e)
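
The worker side completes the picture: the body is wrapped in try/except so a failure both sets the shared cancel_event and is reported back through thread_exceptions, while healthy workers check the event at loop boundaries and bail out early. A condensed sketch of both halves wired together (retrieve_worker is a hypothetical stand-in for _multiple_retrieve_thread):

```python
import threading

def retrieve_worker(
    datasets: list[str],
    results: list[str],
    cancel_event: threading.Event | None = None,
    thread_exceptions: list[Exception] | None = None,
) -> None:
    """Hypothetical stand-in for _multiple_retrieve_thread."""
    try:
        for dataset in datasets:
            # Cooperative cancellation: bail out between datasets.
            if cancel_event and cancel_event.is_set():
                break
            if dataset == "broken":
                raise ValueError(f"retrieval failed for {dataset}")
            results.append(dataset)
    except Exception as e:
        # Signal the peers, then report the error to the coordinator.
        if cancel_event:
            cancel_event.set()
        if thread_exceptions is not None:
            thread_exceptions.append(e)

cancel_event = threading.Event()
errors: list[Exception] = []
results: list[str] = []
t = threading.Thread(
    target=retrieve_worker,
    kwargs={
        "datasets": ["a", "broken", "b"],
        "results": results,
        "cancel_event": cancel_event,
        "thread_exceptions": errors,
    },
)
t.start()
t.join()
assert errors and cancel_event.is_set()  # "b" was never retrieved
```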

File: unit tests for RetrievalService

@@ -421,6 +421,17 @@ class TestRetrievalService:
         # In real code, this waits for all futures to complete
         # In tests, futures complete immediately, so wait is a no-op
         with patch("core.rag.datasource.retrieval_service.concurrent.futures.wait"):
-            yield mock_executor
+            # Mock concurrent.futures.as_completed for early error propagation
+            # In real code, this yields futures as they complete
+            # In tests, we yield all futures immediately since they're already done
+            def mock_as_completed(futures_list, timeout=None):
+                """Mock as_completed that yields futures immediately."""
+                yield from futures_list
+
+            with patch(
+                "core.rag.datasource.retrieval_service.concurrent.futures.as_completed",
+                side_effect=mock_as_completed,
+            ):
+                yield mock_executor
 
     # ==================== Vector Search Tests ====================
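
Since the fixture's executor runs submissions synchronously, every future is already resolved by the time the production code reaches the as_completed loop; replacing as_completed with a generator that simply re-yields the list keeps that loop executing unchanged under test. A sketch of how the same idea can be exercised directly (the test and mock futures are hypothetical, not from this commit):

```python
import concurrent.futures
from unittest.mock import MagicMock, patch

def mock_as_completed(futures_list, timeout=None):
    """Yield the futures untouched; in tests they are already resolved."""
    yield from futures_list

def test_fail_fast_loop_cancels_on_error():
    # Two fake futures: the first reports an exception, so the loop under
    # test should cancel everything and stop iterating.
    failed = MagicMock()
    failed.exception.return_value = RuntimeError("boom")
    pending = MagicMock()
    pending.exception.return_value = None
    futures = [failed, pending]

    with patch("concurrent.futures.as_completed", side_effect=mock_as_completed):
        for future in concurrent.futures.as_completed(futures, timeout=300):
            if future.exception():
                for f in futures:
                    f.cancel()
                break

    pending.cancel.assert_called_once()
```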