fix: Clean upload filenames parsed from URLs (#35706)

Co-authored-by: autofix-ci[bot] <114827586+autofix-ci[bot]@users.noreply.github.com>
2026-06-19 08:31:07 +08:00 · 2026-05-02 08:45:31 -06:00 · 2026-05-02 08:45:31 -06:00 · ff5c2c57a1
commit ff5c2c57a1
parent 955c25589d
3 changed files with 77 additions and 5 deletions
--- a/api/controllers/common/helpers.py
+++ b/api/controllers/common/helpers.py
@ -41,7 +41,8 @@ def guess_file_info_from_response(response: httpx.Response):
    # Try to extract filename from URL
    parsed_url = urllib.parse.urlparse(url)
    url_path = parsed_url.path
-    filename = os.path.basename(url_path)
+    # Decode percent-encoded characters in the path segment
+    filename = urllib.parse.unquote(os.path.basename(url_path))

    # If filename couldn't be extracted, use Content-Disposition header
    if not filename:
--- a/api/factories/file_factory/remote.py
+++ b/api/factories/file_factory/remote.py
@ -19,8 +19,13 @@ from werkzeug.http import parse_options_header
 from core.helper import ssrf_proxy


-def extract_filename(url_path: str, content_disposition: str | None) -> str | None:
-    """Extract a safe filename from Content-Disposition or the request URL path."""
+def extract_filename(url_or_path: str, content_disposition: str | None) -> str | None:
+    """Extract a safe filename from Content-Disposition or the request URL path.
+
+    Handles full URLs, paths with query strings, hash fragments, and percent-encoded segments.
+    Query strings and hash fragments are stripped from the URL before extracting the basename.
+    Percent-encoded characters in the path are decoded safely.
+    """
    filename: str | None = None
    if content_disposition:
        filename_star_match = re.search(r"filename\*=([^;]+)", content_disposition)
@ -47,8 +52,13 @@ def extract_filename(url_path: str, content_disposition: str | None) -> str | No
                filename = urllib.parse.unquote(raw)

    if not filename:
-        candidate = os.path.basename(url_path)
-        filename = urllib.parse.unquote(candidate) if candidate else None
+        # Parse the URL to extract just the path, stripping query strings and fragments
+        # This handles both full URLs and bare paths
+        parsed = urllib.parse.urlparse(url_or_path)
+        path = parsed.path
+        candidate = os.path.basename(path)
+        # Decode percent-encoded characters, with safe fallback for malformed input
+        filename = urllib.parse.unquote(candidate, errors="replace") if candidate else None

    if filename:
        filename = os.path.basename(filename)
--- a/api/tests/unit_tests/factories/test_file_factory.py
+++ b/api/tests/unit_tests/factories/test_file_factory.py
@ -230,3 +230,64 @@ class TestExtractFilename:
            "http://example.com/", 'attachment; filename="file%20with%20quotes%20%26%20encoding.txt"'
        )
        assert result == "file with quotes & encoding.txt"
+
+    def test_url_with_query_string(self):
+        """Test that query strings are stripped from URL basename."""
+        result = extract_filename("http://example.com/path/file.txt?signature=abc123&expires=12345", None)
+        assert result == "file.txt"
+
+    def test_url_with_hash_fragment(self):
+        """Test that hash fragments are stripped from URL basename."""
+        result = extract_filename("http://example.com/path/file.txt#section", None)
+        assert result == "file.txt"
+
+    def test_url_with_query_and_fragment(self):
+        """Test that both query strings and hash fragments are stripped."""
+        result = extract_filename("http://example.com/path/file.txt?token=xyz#section", None)
+        assert result == "file.txt"
+
+    def test_signed_url_preserves_filename(self):
+        """Test that signed URL parameters don't affect filename extraction."""
+        result = extract_filename(
+            "http://storage.example.com/bucket/documents/report.pdf?AWSAccessKeyId=xxx&Signature=yyy&Expires=12345",
+            None,
+        )
+        assert result == "report.pdf"
+
+    def test_percent_encoded_filename_with_query_string(self):
+        """Test percent-encoded filename with query string is decoded correctly."""
+        result = extract_filename("http://example.com/path/my%20file.txt?download=true", None)
+        assert result == "my file.txt"
+
+    def test_percent_encoded_filename_with_fragment(self):
+        """Test percent-encoded filename with fragment is decoded correctly."""
+        result = extract_filename("http://example.com/path/my%20file.txt#page=1", None)
+        assert result == "my file.txt"
+
+    def test_complex_percent_encoding_with_query(self):
+        """Test complex percent-encoded filename with query parameters."""
+        result = extract_filename("http://example.com/docs/%E4%B8%AD%E6%96%87%E6%96%87%E4%BB%B6.pdf?v=1", None)
+        assert result == "中文文件.pdf"
+
+    def test_url_with_special_chars_in_query(self):
+        """Test that special characters in query string don't affect filename."""
+        result = extract_filename("http://example.com/file.bin?name=test&path=/some/path", None)
+        assert result == "file.bin"
+
+    def test_malformed_percent_encoding_safe_fallback(self):
+        """Test that malformed percent-encoding is handled safely."""
+        result = extract_filename("http://example.com/path/file%20name%GG.txt?x=1", None)
+        # %GG is invalid, should be replaced with replacement character
+
+        assert "file" in result
+        assert ".txt" in result
+
+    def test_empty_path_with_query_returns_none(self):
+        """Test that empty path with query string returns None."""
+        result = extract_filename("http://example.com/?query=value", None)
+        assert result is None
+
+    def test_path_only_with_query_string(self):
+        """Test bare path (not full URL) with query string."""
+        result = extract_filename("/path/to/file.txt?extra=params", None)
+        assert result == "file.txt"