fix: Clean upload filenames parsed from URLs (#35706)

Co-authored-by: autofix-ci[bot] <114827586+autofix-ci[bot]@users.noreply.github.com>
This commit is contained in:
Jonathan Chang 2026-05-02 08:45:31 -06:00 committed by GitHub
parent 955c25589d
commit ff5c2c57a1
No known key found for this signature in database
GPG Key ID: B5690EEEBB952194
3 changed files with 77 additions and 5 deletions

View File

@ -41,7 +41,8 @@ def guess_file_info_from_response(response: httpx.Response):
# Try to extract filename from URL
parsed_url = urllib.parse.urlparse(url)
url_path = parsed_url.path
filename = os.path.basename(url_path)
# Decode percent-encoded characters in the path segment
filename = urllib.parse.unquote(os.path.basename(url_path))
# If filename couldn't be extracted, use Content-Disposition header
if not filename:

View File

@ -19,8 +19,13 @@ from werkzeug.http import parse_options_header
from core.helper import ssrf_proxy
def extract_filename(url_path: str, content_disposition: str | None) -> str | None:
"""Extract a safe filename from Content-Disposition or the request URL path."""
def extract_filename(url_or_path: str, content_disposition: str | None) -> str | None:
"""Extract a safe filename from Content-Disposition or the request URL path.
Handles full URLs, paths with query strings, hash fragments, and percent-encoded segments.
Query strings and hash fragments are stripped from the URL before extracting the basename.
Percent-encoded characters in the path are decoded safely.
"""
filename: str | None = None
if content_disposition:
filename_star_match = re.search(r"filename\*=([^;]+)", content_disposition)
@ -47,8 +52,13 @@ def extract_filename(url_path: str, content_disposition: str | None) -> str | No
filename = urllib.parse.unquote(raw)
if not filename:
candidate = os.path.basename(url_path)
filename = urllib.parse.unquote(candidate) if candidate else None
# Parse the URL to extract just the path, stripping query strings and fragments
# This handles both full URLs and bare paths
parsed = urllib.parse.urlparse(url_or_path)
path = parsed.path
candidate = os.path.basename(path)
# Decode percent-encoded characters; undecodable byte sequences become U+FFFD, while malformed escapes such as %GG are left unchanged
filename = urllib.parse.unquote(candidate, errors="replace") if candidate else None
if filename:
filename = os.path.basename(filename)

View File

@ -230,3 +230,64 @@ class TestExtractFilename:
"http://example.com/", 'attachment; filename="file%20with%20quotes%20%26%20encoding.txt"'
)
assert result == "file with quotes & encoding.txt"
def test_url_with_query_string(self):
    """A trailing query string must not leak into the extracted basename."""
    url = "http://example.com/path/file.txt?signature=abc123&expires=12345"
    # No Content-Disposition header, so the URL path is the only filename source.
    assert extract_filename(url, None) == "file.txt"
def test_url_with_hash_fragment(self):
    """A ``#fragment`` suffix must not leak into the extracted basename."""
    url = "http://example.com/path/file.txt#section"
    extracted = extract_filename(url, None)
    assert extracted == "file.txt"
def test_url_with_query_and_fragment(self):
    """Query string and fragment together are both dropped from the basename."""
    url = "http://example.com/path/file.txt?token=xyz#section"
    assert extract_filename(url, None) == "file.txt"
def test_signed_url_preserves_filename(self):
    """Signing parameters appended to a storage URL leave the filename intact."""
    signed_url = "http://storage.example.com/bucket/documents/report.pdf?AWSAccessKeyId=xxx&Signature=yyy&Expires=12345"
    extracted = extract_filename(signed_url, None)
    assert extracted == "report.pdf"
def test_percent_encoded_filename_with_query_string(self):
    """``%20`` in the path is decoded to a space and the query string is ignored."""
    url = "http://example.com/path/my%20file.txt?download=true"
    assert extract_filename(url, None) == "my file.txt"
def test_percent_encoded_filename_with_fragment(self):
    """``%20`` in the path is decoded to a space and the fragment is ignored."""
    url = "http://example.com/path/my%20file.txt#page=1"
    extracted = extract_filename(url, None)
    assert extracted == "my file.txt"
def test_complex_percent_encoding_with_query(self):
    """Multi-byte UTF-8 escapes in the path decode to the original characters."""
    url = "http://example.com/docs/%E4%B8%AD%E6%96%87%E6%96%87%E4%BB%B6.pdf?v=1"
    expected = "中文文件.pdf"
    assert extract_filename(url, None) == expected
def test_url_with_special_chars_in_query(self):
    """Slashes inside query values must not be mistaken for path separators."""
    url = "http://example.com/file.bin?name=test&path=/some/path"
    extracted = extract_filename(url, None)
    assert extracted == "file.bin"
def test_malformed_percent_encoding_safe_fallback(self):
    """Test that a malformed percent-escape does not break filename extraction.

    ``urllib.parse.unquote`` leaves escapes that are not valid hex (such as
    ``%GG``) untouched; ``errors="replace"`` only affects byte sequences that
    cannot be decoded. The assertions therefore check that a usable filename
    still comes back, not that a replacement character appears.
    """
    result = extract_filename("http://example.com/path/file%20name%GG.txt?x=1", None)
    # Guard first so a None result fails with a clear assertion instead of a
    # TypeError raised by the ``in`` checks below.
    assert result is not None
    assert "file" in result
    assert ".txt" in result
    # The query string must still be stripped even when the path is malformed.
    assert "?" not in result
def test_empty_path_with_query_returns_none(self):
    """A URL whose path has no basename yields ``None`` even with a query string."""
    url = "http://example.com/?query=value"
    extracted = extract_filename(url, None)
    assert extracted is None
def test_path_only_with_query_string(self):
    """A bare path (no scheme or host) also has its query string stripped."""
    bare_path = "/path/to/file.txt?extra=params"
    assert extract_filename(bare_path, None) == "file.txt"