From 7bc5c89e3c5e496a60dd42ce5a54fc3e8a93a1f3 Mon Sep 17 00:00:00 2001
From: EvanYao <2869018789@qq.com>
Date: Wed, 20 May 2026 11:56:49 +0800
Subject: [PATCH] fix: prevent recursion error when SharePoint folder is empty (#36372)

---
Reviewer note: the visited-folder guard now exempts pagination continuations
(same prefix, non-None next_page_parameters). As originally written, the guard
returned early on every page after the first, silently dropping those files.
A pagination regression test is added. Hunk offsets and the diffstat are
updated to match; the blob ids on the "index" lines are carried over from the
original patch and no longer describe the post-image.

 .../app/apps/pipeline/pipeline_generator.py   |  41 ++++-
 .../apps/pipeline/test_pipeline_generator.py  | 176 ++++++++++++++++++
 2 files changed, 215 insertions(+), 2 deletions(-)

diff --git a/api/core/app/apps/pipeline/pipeline_generator.py b/api/core/app/apps/pipeline/pipeline_generator.py
index 4a76d0809e..47b950ca08 100644
--- a/api/core/app/apps/pipeline/pipeline_generator.py
+++ b/api/core/app/apps/pipeline/pipeline_generator.py
@@ -791,10 +791,32 @@ class PipelineGenerator(BaseAppGenerator):
         all_files: list,
         datasource_info: Mapping[str, Any],
         next_page_parameters: dict[str, Any] | None = None,
+        _visited_folder_ids: set[str] | None = None,
     ):
         """
         Get files in a folder.
+
+        Recursively lists all files inside the given folder prefix.
+        ``_visited_folder_ids`` tracks folders already expanded so that a
+        self-referencing folder (where the API returns the folder as its own
+        child) cannot cause infinite recursion.
+
+        A call with a non-None ``next_page_parameters`` is a pagination
+        continuation of a folder that is already being expanded, so it is
+        exempt from the visited-folder guard.
         """
+        if _visited_folder_ids is None:
+            _visited_folder_ids = set()
+
+        # Guard: skip folders we have already expanded to prevent infinite
+        # recursion from self-referencing folder entries in the API response.
+        # Pagination re-enters with the same prefix, so only first-page
+        # requests are rejected; otherwise every page after the first would
+        # be silently dropped.
+        if next_page_parameters is None and prefix in _visited_folder_ids:
+            return
+        _visited_folder_ids.add(prefix)
+
         result_generator = datasource_runtime.online_drive_browse_files(
             user_id=user_id,
             request=OnlineDriveBrowseFilesRequest(
@@ -806,10 +828,14 @@ class PipelineGenerator(BaseAppGenerator):
             provider_type=datasource_runtime.datasource_provider_type(),
         )
         is_truncated = False
+        has_files = False
         for result in result_generator:
             for files in result.result:
                 for file in files.files:
+                    has_files = True
                     if file.type == "folder":
+                        if file.id in _visited_folder_ids:
+                            continue
                         self._get_files_in_folder(
                             datasource_runtime,
                             file.id,
@@ -818,6 +844,7 @@ class PipelineGenerator(BaseAppGenerator):
                             all_files,
                             datasource_info,
                             None,
+                            _visited_folder_ids,
                         )
                     else:
                         all_files.append(
@@ -830,7 +857,17 @@ class PipelineGenerator(BaseAppGenerator):
                 is_truncated = files.is_truncated
                 next_page_parameters = files.next_page_parameters
 
-        if is_truncated:
+        # Guard: only follow pagination when the API actually returned files.
+        # An empty folder that incorrectly reports ``is_truncated=True`` would
+        # otherwise recurse forever on the same empty page.
+        if is_truncated and has_files:
             self._get_files_in_folder(
-                datasource_runtime, prefix, bucket, user_id, all_files, datasource_info, next_page_parameters
+                datasource_runtime,
+                prefix,
+                bucket,
+                user_id,
+                all_files,
+                datasource_info,
+                next_page_parameters,
+                _visited_folder_ids,
             )
diff --git a/api/tests/unit_tests/core/app/apps/pipeline/test_pipeline_generator.py b/api/tests/unit_tests/core/app/apps/pipeline/test_pipeline_generator.py
index dd91243a37..06fd9e4806 100644
--- a/api/tests/unit_tests/core/app/apps/pipeline/test_pipeline_generator.py
+++ b/api/tests/unit_tests/core/app/apps/pipeline/test_pipeline_generator.py
@@ -717,3 +717,179 @@ def test_get_files_in_folder_recurses_and_collects(generator):
     )
 
     assert {f["id"] for f in all_files} == {"f1", "f2"}
+
+
+def test_get_files_in_folder_handles_empty_folder(generator):
+    """An empty folder must return an empty file list without recursion errors."""
+
+    class FilesPage:
+        def __init__(self, files, is_truncated=False, next_page_parameters=None):
+            self.files = files
+            self.is_truncated = is_truncated
+            self.next_page_parameters = next_page_parameters
+
+    class Result:
+        def __init__(self, result):
+            self.result = result
+
+    class Runtime:
+        def datasource_provider_type(self):
+            return DatasourceProviderType.ONLINE_DRIVE
+
+        def online_drive_browse_files(self, user_id, request, provider_type):
+            # Empty folder: returns a page with no files, not truncated
+            return iter([Result([FilesPage([], False, None)])])
+
+    runtime = Runtime()
+    all_files: list = []
+
+    generator._get_files_in_folder(
+        datasource_runtime=runtime,
+        prefix="empty-folder",
+        bucket="b",
+        user_id="user",
+        all_files=all_files,
+        datasource_info={},
+    )
+
+    assert all_files == []
+
+
+def test_get_files_in_folder_handles_empty_folder_with_false_truncation(generator):
+    """An empty folder that incorrectly reports is_truncated=True must not recurse forever."""
+
+    call_count = 0
+
+    class FilesPage:
+        def __init__(self, files, is_truncated=False, next_page_parameters=None):
+            self.files = files
+            self.is_truncated = is_truncated
+            self.next_page_parameters = next_page_parameters
+
+    class Result:
+        def __init__(self, result):
+            self.result = result
+
+    class Runtime:
+        def datasource_provider_type(self):
+            return DatasourceProviderType.ONLINE_DRIVE
+
+        def online_drive_browse_files(self, user_id, request, provider_type):
+            nonlocal call_count
+            call_count += 1
+            # Empty folder that incorrectly claims truncation
+            return iter([Result([FilesPage([], True, {"page": 2})])])
+
+    runtime = Runtime()
+    all_files: list = []
+
+    generator._get_files_in_folder(
+        datasource_runtime=runtime,
+        prefix="buggy-folder",
+        bucket="b",
+        user_id="user",
+        all_files=all_files,
+        datasource_info={},
+    )
+
+    assert all_files == []
+    # Should only be called once -- the empty-page guard prevents further recursion
+    assert call_count == 1
+
+
+def test_get_files_in_folder_handles_self_referencing_folder(generator):
+    """A folder that lists itself as a child must not recurse infinitely."""
+
+    class File:
+        def __init__(self, id, name, type):
+            self.id = id
+            self.name = name
+            self.type = type
+
+    class FilesPage:
+        def __init__(self, files, is_truncated=False, next_page_parameters=None):
+            self.files = files
+            self.is_truncated = is_truncated
+            self.next_page_parameters = next_page_parameters
+
+    class Result:
+        def __init__(self, result):
+            self.result = result
+
+    call_count = 0
+
+    class Runtime:
+        def datasource_provider_type(self):
+            return DatasourceProviderType.ONLINE_DRIVE
+
+        def online_drive_browse_files(self, user_id, request, provider_type):
+            nonlocal call_count
+            call_count += 1
+            # The folder returns itself as a child (self-reference)
+            return iter([Result([FilesPage([File("self-ref", "myfolder", "folder")], False, None)])])
+
+    runtime = Runtime()
+    all_files: list = []
+
+    generator._get_files_in_folder(
+        datasource_runtime=runtime,
+        prefix="self-ref",
+        bucket="b",
+        user_id="user",
+        all_files=all_files,
+        datasource_info={},
+    )
+
+    assert all_files == []
+    # Should only be called once -- the visited-set guard prevents re-entry
+    assert call_count == 1
+
+
+def test_get_files_in_folder_follows_pagination(generator):
+    """A truncated first page must be followed; the visited guard must not block the next page."""
+
+    class File:
+        def __init__(self, id, name, type):
+            self.id = id
+            self.name = name
+            self.type = type
+
+    class FilesPage:
+        def __init__(self, files, is_truncated=False, next_page_parameters=None):
+            self.files = files
+            self.is_truncated = is_truncated
+            self.next_page_parameters = next_page_parameters
+
+    class Result:
+        def __init__(self, result):
+            self.result = result
+
+    call_count = 0
+
+    class Runtime:
+        def datasource_provider_type(self):
+            return DatasourceProviderType.ONLINE_DRIVE
+
+        def online_drive_browse_files(self, user_id, request, provider_type):
+            nonlocal call_count
+            call_count += 1
+            # First call: truncated page with a continuation token; second call: final page
+            if call_count == 1:
+                return iter([Result([FilesPage([File("f1", "a.txt", "file")], True, {"page": 2})])])
+            return iter([Result([FilesPage([File("f2", "b.txt", "file")], False, None)])])
+
+    runtime = Runtime()
+    all_files: list = []
+
+    generator._get_files_in_folder(
+        datasource_runtime=runtime,
+        prefix="paged-folder",
+        bucket="b",
+        user_id="user",
+        all_files=all_files,
+        datasource_info={},
+    )
+
+    assert {f["id"] for f in all_files} == {"f1", "f2"}
+    # One request per page -- the visited-set guard must not swallow the second page
+    assert call_count == 2