From 6841a09667d87dc1ad4c4bbfbc6bcc79dd3aafe7 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=E3=83=AA=E3=82=A4=E3=83=8E=20Lin?= Date: Thu, 25 Sep 2025 10:39:44 +0800 Subject: [PATCH] fix: remote filename will be 'inline' if Content-Disposition: inline (#25877) Fixed the issue that filename will be 'inline' if response header contains `Content-Disposition: inline` while retrieving file by url. Co-authored-by: crazywoola <100913391+crazywoola@users.noreply.github.com> Co-authored-by: autofix-ci[bot] <114827586+autofix-ci[bot]@users.noreply.github.com> --- api/factories/file_factory.py | 41 +++++-- .../unit_tests/factories/test_file_factory.py | 115 ++++++++++++++++++ 2 files changed, 147 insertions(+), 9 deletions(-) create mode 100644 api/tests/unit_tests/factories/test_file_factory.py diff --git a/api/factories/file_factory.py b/api/factories/file_factory.py index 588168bd39..d66c757249 100644 --- a/api/factories/file_factory.py +++ b/api/factories/file_factory.py @@ -8,6 +8,7 @@ from typing import Any import httpx from sqlalchemy import select from sqlalchemy.orm import Session +from werkzeug.http import parse_options_header from constants import AUDIO_EXTENSIONS, DOCUMENT_EXTENSIONS, IMAGE_EXTENSIONS, VIDEO_EXTENSIONS from core.file import File, FileBelongsTo, FileTransferMethod, FileType, FileUploadConfig, helpers @@ -247,6 +248,25 @@ def _build_from_remote_url( ) +def _extract_filename(url_path: str, content_disposition: str | None) -> str | None: + filename = None + # Try to extract from Content-Disposition header first + if content_disposition: + _, params = parse_options_header(content_disposition) + # RFC 5987 https://datatracker.ietf.org/doc/html/rfc5987: filename* takes precedence over filename + filename = params.get("filename*") or params.get("filename") + # Fallback to URL path if no filename from header + if not filename: + filename = os.path.basename(url_path) + return filename or None + + +def _guess_mime_type(filename: str) -> str: + """Guess MIME type from filename, returning empty string if None.""" + guessed_mime, _ = mimetypes.guess_type(filename) + return guessed_mime or "" + + def _get_remote_file_info(url: str): file_size = -1 parsed_url = urllib.parse.urlparse(url) @@ -254,23 +274,26 @@ def _get_remote_file_info(url: str): filename = os.path.basename(url_path) # Initialize mime_type from filename as fallback - mime_type, _ = mimetypes.guess_type(filename) - if mime_type is None: - mime_type = "" + mime_type = _guess_mime_type(filename) resp = ssrf_proxy.head(url, follow_redirects=True) if resp.status_code == httpx.codes.OK: - if content_disposition := resp.headers.get("Content-Disposition"): - filename = str(content_disposition.split("filename=")[-1].strip('"')) - # Re-guess mime_type from updated filename - mime_type, _ = mimetypes.guess_type(filename) - if mime_type is None: - mime_type = "" + content_disposition = resp.headers.get("Content-Disposition") + extracted_filename = _extract_filename(url_path, content_disposition) + if extracted_filename: + filename = extracted_filename + mime_type = _guess_mime_type(filename) file_size = int(resp.headers.get("Content-Length", file_size)) # Fallback to Content-Type header if mime_type is still empty if not mime_type: mime_type = resp.headers.get("Content-Type", "").split(";")[0].strip() + if not filename: + extension = mimetypes.guess_extension(mime_type) or ".bin" + filename = f"{uuid.uuid4().hex}{extension}" + if not mime_type: + mime_type = _guess_mime_type(filename) + return mime_type, filename, file_size diff --git a/api/tests/unit_tests/factories/test_file_factory.py b/api/tests/unit_tests/factories/test_file_factory.py new file mode 100644 index 0000000000..777fe5a6e7 --- /dev/null +++ b/api/tests/unit_tests/factories/test_file_factory.py @@ -0,0 +1,115 @@ +import re + +import pytest + +from factories.file_factory import _get_remote_file_info + + +class _FakeResponse: + def __init__(self, status_code: int, headers: dict[str, str]): + self.status_code = status_code + self.headers = headers + + +def _mock_head(monkeypatch: pytest.MonkeyPatch, headers: dict[str, str], status_code: int = 200): + def _fake_head(url: str, follow_redirects: bool = True): + return _FakeResponse(status_code=status_code, headers=headers) + + monkeypatch.setattr("factories.file_factory.ssrf_proxy.head", _fake_head) + + +class TestGetRemoteFileInfo: + """Tests for _get_remote_file_info focusing on filename extraction rules.""" + + def test_inline_no_filename(self, monkeypatch: pytest.MonkeyPatch): + _mock_head( + monkeypatch, + { + "Content-Disposition": "inline", + "Content-Type": "application/pdf", + "Content-Length": "123", + }, + ) + mime_type, filename, size = _get_remote_file_info("http://example.com/some/path/file.pdf") + assert filename == "file.pdf" + assert mime_type == "application/pdf" + assert size == 123 + + def test_attachment_no_filename(self, monkeypatch: pytest.MonkeyPatch): + _mock_head( + monkeypatch, + { + "Content-Disposition": "attachment", + "Content-Type": "application/octet-stream", + "Content-Length": "456", + }, + ) + mime_type, filename, size = _get_remote_file_info("http://example.com/downloads/data.bin") + assert filename == "data.bin" + assert mime_type == "application/octet-stream" + assert size == 456 + + def test_attachment_quoted_space_filename(self, monkeypatch: pytest.MonkeyPatch): + _mock_head( + monkeypatch, + { + "Content-Disposition": 'attachment; filename="file name.jpg"', + "Content-Type": "image/jpeg", + "Content-Length": "789", + }, + ) + mime_type, filename, size = _get_remote_file_info("http://example.com/ignored") + assert filename == "file name.jpg" + assert mime_type == "image/jpeg" + assert size == 789 + + def test_attachment_filename_star_percent20(self, monkeypatch: pytest.MonkeyPatch): + _mock_head( + monkeypatch, + { + "Content-Disposition": "attachment; filename*=UTF-8''file%20name.jpg", + "Content-Type": "image/jpeg", + }, + ) + mime_type, filename, _ = _get_remote_file_info("http://example.com/ignored") + assert filename == "file name.jpg" + assert mime_type == "image/jpeg" + + def test_attachment_filename_star_chinese(self, monkeypatch: pytest.MonkeyPatch): + _mock_head( + monkeypatch, + { + "Content-Disposition": "attachment; filename*=UTF-8''%E6%B5%8B%E8%AF%95%E6%96%87%E4%BB%B6.jpg", + "Content-Type": "image/jpeg", + }, + ) + mime_type, filename, _ = _get_remote_file_info("http://example.com/ignored") + assert filename == "测试文件.jpg" + assert mime_type == "image/jpeg" + + def test_filename_from_url_when_no_header(self, monkeypatch: pytest.MonkeyPatch): + _mock_head( + monkeypatch, + { + # No Content-Disposition + "Content-Type": "text/plain", + "Content-Length": "12", + }, + ) + mime_type, filename, size = _get_remote_file_info("http://example.com/static/file.txt") + assert filename == "file.txt" + assert mime_type == "text/plain" + assert size == 12 + + def test_no_filename_in_url_or_header_generates_uuid_bin(self, monkeypatch: pytest.MonkeyPatch): + _mock_head( + monkeypatch, + { + "Content-Disposition": "inline", + "Content-Type": "application/octet-stream", + }, + ) + mime_type, filename, _ = _get_remote_file_info("http://example.com/test/") + # Should generate a random hex filename with .bin extension + assert re.match(r"^[0-9a-f]{32}\.bin$", filename) is not None + assert mime_type == "application/octet-stream"