From 70eb98d6c5f405d57e5f2fdec20f70e59f928bf5 Mon Sep 17 00:00:00 2001 From: Brandon Date: Tue, 5 May 2026 23:03:22 -0400 Subject: [PATCH] fix(file_factory): drop doubled dot when standardizing datasource file extension (#35808) Co-authored-by: Beandon13 --- api/factories/file_factory/builders.py | 2 +- .../unit_tests/factories/test_file_factory.py | 92 +++++++++++++++++++ 2 files changed, 93 insertions(+), 1 deletion(-) diff --git a/api/factories/file_factory/builders.py b/api/factories/file_factory/builders.py index 1d2ad4d445..4fb976f0e7 100644 --- a/api/factories/file_factory/builders.py +++ b/api/factories/file_factory/builders.py @@ -298,7 +298,7 @@ def _build_from_datasource_file( raise ValueError(f"DatasourceFile {mapping.get('datasource_file_id')} not found") extension = "." + datasource_file.key.split(".")[-1] if "." in datasource_file.key else ".bin" - detected_file_type = standardize_file_type(extension="." + extension, mime_type=datasource_file.mime_type) + detected_file_type = standardize_file_type(extension=extension, mime_type=datasource_file.mime_type) file_type = _resolve_file_type( detected_file_type=detected_file_type, specified_type=mapping.get("type"), diff --git a/api/tests/unit_tests/factories/test_file_factory.py b/api/tests/unit_tests/factories/test_file_factory.py index c2835c4124..293be925ae 100644 --- a/api/tests/unit_tests/factories/test_file_factory.py +++ b/api/tests/unit_tests/factories/test_file_factory.py @@ -1,8 +1,11 @@ import re +from unittest.mock import MagicMock import pytest +from factories.file_factory import builders from factories.file_factory.remote import extract_filename, get_remote_file_info +from graphon.file import FileTransferMethod class _FakeResponse: @@ -291,3 +294,92 @@ class TestExtractFilename: """Test bare path (not full URL) with query string.""" result = extract_filename("/path/to/file.txt?extra=params", None) assert result == "file.txt" + + +class TestBuildFromDatasourceFile: + """Tests for _build_from_datasource_file extension handling.""" + + @staticmethod + def _patch_session(monkeypatch: pytest.MonkeyPatch, datasource_file): + """Stub session_factory.create_session() so it returns the given UploadFile-shaped record.""" + session = MagicMock() + session.scalar.return_value = datasource_file + ctx = MagicMock() + ctx.__enter__ = MagicMock(return_value=session) + ctx.__exit__ = MagicMock(return_value=False) + monkeypatch.setattr(builders.session_factory, "create_session", lambda: ctx) + + def _make_datasource_file(self, *, key: str, mime_type: str = "text/csv"): + f = MagicMock() + f.id = "file-id" + f.key = key + f.name = key.split("/")[-1] + f.mime_type = mime_type + f.size = 123 + f.source_url = f"https://example.com/{key}" + return f + + def test_extension_passed_without_doubled_dot(self, monkeypatch: pytest.MonkeyPatch): + """Regression: standardize_file_type must receive the extension exactly once-prefixed. + + Previously the call was ``standardize_file_type(extension="." + extension, ...)`` while + ``extension`` already had a leading dot, producing ``"..csv"``. The mitigating + ``lstrip(".")`` inside ``standardize_file_type`` masked the bug from end users, but the + argument shape itself was wrong and showed up in any caller that didn't strip dots. + """ + captured: dict = {} + + def fake_standardize(*, extension: str = "", mime_type: str = ""): + from graphon.file import FileType + + captured["extension"] = extension + captured["mime_type"] = mime_type + return FileType.DOCUMENT + + monkeypatch.setattr(builders, "standardize_file_type", fake_standardize) + + datasource_file = self._make_datasource_file(key="folder/data.csv", mime_type="text/csv") + self._patch_session(monkeypatch, datasource_file) + + access_controller = MagicMock() + access_controller.apply_upload_file_filters = lambda stmt: stmt + + file = builders._build_from_datasource_file( + mapping={"datasource_file_id": "file-id", "transfer_method": "datasource_file"}, + tenant_id="tenant-id", + transfer_method=FileTransferMethod.DATASOURCE_FILE, + access_controller=access_controller, + ) + + assert captured["extension"] == ".csv", ( + f"standardize_file_type received {captured['extension']!r}; expected single-dot '.csv'" + ) + assert captured["mime_type"] == "text/csv" + assert file.extension == ".csv" + + def test_extension_falls_back_to_bin_when_key_has_no_dot(self, monkeypatch: pytest.MonkeyPatch): + captured: dict = {} + + def fake_standardize(*, extension: str = "", mime_type: str = ""): + from graphon.file import FileType + + captured["extension"] = extension + return FileType.CUSTOM + + monkeypatch.setattr(builders, "standardize_file_type", fake_standardize) + + datasource_file = self._make_datasource_file(key="dotless-key", mime_type="application/octet-stream") + self._patch_session(monkeypatch, datasource_file) + + access_controller = MagicMock() + access_controller.apply_upload_file_filters = lambda stmt: stmt + + file = builders._build_from_datasource_file( + mapping={"datasource_file_id": "file-id", "transfer_method": "datasource_file"}, + tenant_id="tenant-id", + transfer_method=FileTransferMethod.DATASOURCE_FILE, + access_controller=access_controller, + ) + + assert captured["extension"] == ".bin" + assert file.extension == ".bin"