diff --git a/api/controllers/common/helpers.py b/api/controllers/common/helpers.py index ef89e66980..84903733b5 100644 --- a/api/controllers/common/helpers.py +++ b/api/controllers/common/helpers.py @@ -41,7 +41,8 @@ def guess_file_info_from_response(response: httpx.Response): # Try to extract filename from URL parsed_url = urllib.parse.urlparse(url) url_path = parsed_url.path - filename = os.path.basename(url_path) + # Decode percent-encoded characters in the path segment + filename = urllib.parse.unquote(os.path.basename(url_path)) # If filename couldn't be extracted, use Content-Disposition header if not filename: diff --git a/api/factories/file_factory/remote.py b/api/factories/file_factory/remote.py index e5a7186007..9b8f94b1f3 100644 --- a/api/factories/file_factory/remote.py +++ b/api/factories/file_factory/remote.py @@ -19,8 +19,13 @@ from werkzeug.http import parse_options_header from core.helper import ssrf_proxy -def extract_filename(url_path: str, content_disposition: str | None) -> str | None: - """Extract a safe filename from Content-Disposition or the request URL path.""" +def extract_filename(url_or_path: str, content_disposition: str | None) -> str | None: + """Extract a safe filename from Content-Disposition or the request URL path. + + Handles full URLs, paths with query strings, hash fragments, and percent-encoded segments. + Query strings and hash fragments are stripped from the URL before extracting the basename. + Percent-encoded characters in the path are decoded safely. + """ filename: str | None = None if content_disposition: filename_star_match = re.search(r"filename\*=([^;]+)", content_disposition) @@ -47,8 +52,13 @@ def extract_filename(url_path: str, content_disposition: str | None) -> str | No filename = urllib.parse.unquote(raw) if not filename: - candidate = os.path.basename(url_path) - filename = urllib.parse.unquote(candidate) if candidate else None + # Parse the URL to extract just the path, stripping query strings and fragments + # This handles both full URLs and bare paths + parsed = urllib.parse.urlparse(url_or_path) + path = parsed.path + candidate = os.path.basename(path) + # Decode percent-encoded characters, with safe fallback for malformed input + filename = urllib.parse.unquote(candidate, errors="replace") if candidate else None if filename: filename = os.path.basename(filename) diff --git a/api/tests/unit_tests/factories/test_file_factory.py b/api/tests/unit_tests/factories/test_file_factory.py index 5b105d6084..c2835c4124 100644 --- a/api/tests/unit_tests/factories/test_file_factory.py +++ b/api/tests/unit_tests/factories/test_file_factory.py @@ -230,3 +230,64 @@ class TestExtractFilename: "http://example.com/", 'attachment; filename="file%20with%20quotes%20%26%20encoding.txt"' ) assert result == "file with quotes & encoding.txt" + + def test_url_with_query_string(self): + """Test that query strings are stripped from URL basename.""" + result = extract_filename("http://example.com/path/file.txt?signature=abc123&expires=12345", None) + assert result == "file.txt" + + def test_url_with_hash_fragment(self): + """Test that hash fragments are stripped from URL basename.""" + result = extract_filename("http://example.com/path/file.txt#section", None) + assert result == "file.txt" + + def test_url_with_query_and_fragment(self): + """Test that both query strings and hash fragments are stripped.""" + result = extract_filename("http://example.com/path/file.txt?token=xyz#section", None) + assert result == "file.txt" + + def test_signed_url_preserves_filename(self): + """Test that signed URL parameters don't affect filename extraction.""" + result = extract_filename( + "http://storage.example.com/bucket/documents/report.pdf?AWSAccessKeyId=xxx&Signature=yyy&Expires=12345", + None, + ) + assert result == "report.pdf" + + def test_percent_encoded_filename_with_query_string(self): + """Test percent-encoded filename with query string is decoded correctly.""" + result = extract_filename("http://example.com/path/my%20file.txt?download=true", None) + assert result == "my file.txt" + + def test_percent_encoded_filename_with_fragment(self): + """Test percent-encoded filename with fragment is decoded correctly.""" + result = extract_filename("http://example.com/path/my%20file.txt#page=1", None) + assert result == "my file.txt" + + def test_complex_percent_encoding_with_query(self): + """Test complex percent-encoded filename with query parameters.""" + result = extract_filename("http://example.com/docs/%E4%B8%AD%E6%96%87%E6%96%87%E4%BB%B6.pdf?v=1", None) + assert result == "中文文件.pdf" + + def test_url_with_special_chars_in_query(self): + """Test that special characters in query string don't affect filename.""" + result = extract_filename("http://example.com/file.bin?name=test&path=/some/path", None) + assert result == "file.bin" + + def test_malformed_percent_encoding_safe_fallback(self): + """Test that malformed percent-encoding is handled safely.""" + result = extract_filename("http://example.com/path/file%20name%GG.txt?x=1", None) + # %GG is invalid, should be replaced with replacement character + + assert "file" in result + assert ".txt" in result + + def test_empty_path_with_query_returns_none(self): + """Test that empty path with query string returns None.""" + result = extract_filename("http://example.com/?query=value", None) + assert result is None + + def test_path_only_with_query_string(self): + """Test bare path (not full URL) with query string.""" + result = extract_filename("/path/to/file.txt?extra=params", None) + assert result == "file.txt"