mirror of
https://github.com/langgenius/dify.git
synced 2026-06-19 08:31:07 +08:00
fix: Clean upload filenames parsed from URLs (#35706)
Co-authored-by: autofix-ci[bot] <114827586+autofix-ci[bot]@users.noreply.github.com>
This commit is contained in:
parent
955c25589d
commit
ff5c2c57a1
@ -41,7 +41,8 @@ def guess_file_info_from_response(response: httpx.Response):
|
||||
# Try to extract filename from URL
|
||||
parsed_url = urllib.parse.urlparse(url)
|
||||
url_path = parsed_url.path
|
||||
filename = os.path.basename(url_path)
|
||||
# Decode percent-encoded characters in the path segment
|
||||
filename = urllib.parse.unquote(os.path.basename(url_path))
|
||||
|
||||
# If filename couldn't be extracted, use Content-Disposition header
|
||||
if not filename:
|
||||
|
||||
@ -19,8 +19,13 @@ from werkzeug.http import parse_options_header
|
||||
from core.helper import ssrf_proxy
|
||||
|
||||
|
||||
def extract_filename(url_path: str, content_disposition: str | None) -> str | None:
|
||||
"""Extract a safe filename from Content-Disposition or the request URL path."""
|
||||
def extract_filename(url_or_path: str, content_disposition: str | None) -> str | None:
|
||||
"""Extract a safe filename from Content-Disposition or the request URL path.
|
||||
|
||||
Handles full URLs, paths with query strings, hash fragments, and percent-encoded segments.
|
||||
Query strings and hash fragments are stripped from the URL before extracting the basename.
|
||||
Percent-encoded characters in the path are decoded safely.
|
||||
"""
|
||||
filename: str | None = None
|
||||
if content_disposition:
|
||||
filename_star_match = re.search(r"filename\*=([^;]+)", content_disposition)
|
||||
@ -47,8 +52,13 @@ def extract_filename(url_path: str, content_disposition: str | None) -> str | No
|
||||
filename = urllib.parse.unquote(raw)
|
||||
|
||||
if not filename:
|
||||
candidate = os.path.basename(url_path)
|
||||
filename = urllib.parse.unquote(candidate) if candidate else None
|
||||
# Parse the URL to extract just the path, stripping query strings and fragments
|
||||
# This handles both full URLs and bare paths
|
||||
parsed = urllib.parse.urlparse(url_or_path)
|
||||
path = parsed.path
|
||||
candidate = os.path.basename(path)
|
||||
# Decode percent-encoded characters, with safe fallback for malformed input
|
||||
filename = urllib.parse.unquote(candidate, errors="replace") if candidate else None
|
||||
|
||||
if filename:
|
||||
filename = os.path.basename(filename)
|
||||
|
||||
@ -230,3 +230,64 @@ class TestExtractFilename:
|
||||
"http://example.com/", 'attachment; filename="file%20with%20quotes%20%26%20encoding.txt"'
|
||||
)
|
||||
assert result == "file with quotes & encoding.txt"
|
||||
|
||||
def test_url_with_query_string(self):
|
||||
"""Test that query strings are stripped from URL basename."""
|
||||
result = extract_filename("http://example.com/path/file.txt?signature=abc123&expires=12345", None)
|
||||
assert result == "file.txt"
|
||||
|
||||
def test_url_with_hash_fragment(self):
|
||||
"""Test that hash fragments are stripped from URL basename."""
|
||||
result = extract_filename("http://example.com/path/file.txt#section", None)
|
||||
assert result == "file.txt"
|
||||
|
||||
def test_url_with_query_and_fragment(self):
|
||||
"""Test that both query strings and hash fragments are stripped."""
|
||||
result = extract_filename("http://example.com/path/file.txt?token=xyz#section", None)
|
||||
assert result == "file.txt"
|
||||
|
||||
def test_signed_url_preserves_filename(self):
|
||||
"""Test that signed URL parameters don't affect filename extraction."""
|
||||
result = extract_filename(
|
||||
"http://storage.example.com/bucket/documents/report.pdf?AWSAccessKeyId=xxx&Signature=yyy&Expires=12345",
|
||||
None,
|
||||
)
|
||||
assert result == "report.pdf"
|
||||
|
||||
def test_percent_encoded_filename_with_query_string(self):
|
||||
"""Test percent-encoded filename with query string is decoded correctly."""
|
||||
result = extract_filename("http://example.com/path/my%20file.txt?download=true", None)
|
||||
assert result == "my file.txt"
|
||||
|
||||
def test_percent_encoded_filename_with_fragment(self):
|
||||
"""Test percent-encoded filename with fragment is decoded correctly."""
|
||||
result = extract_filename("http://example.com/path/my%20file.txt#page=1", None)
|
||||
assert result == "my file.txt"
|
||||
|
||||
def test_complex_percent_encoding_with_query(self):
|
||||
"""Test complex percent-encoded filename with query parameters."""
|
||||
result = extract_filename("http://example.com/docs/%E4%B8%AD%E6%96%87%E6%96%87%E4%BB%B6.pdf?v=1", None)
|
||||
assert result == "中文文件.pdf"
|
||||
|
||||
def test_url_with_special_chars_in_query(self):
|
||||
"""Test that special characters in query string don't affect filename."""
|
||||
result = extract_filename("http://example.com/file.bin?name=test&path=/some/path", None)
|
||||
assert result == "file.bin"
|
||||
|
||||
def test_malformed_percent_encoding_safe_fallback(self):
|
||||
"""Test that malformed percent-encoding is handled safely."""
|
||||
result = extract_filename("http://example.com/path/file%20name%GG.txt?x=1", None)
|
||||
# %GG is invalid, should be replaced with replacement character
|
||||
|
||||
assert "file" in result
|
||||
assert ".txt" in result
|
||||
|
||||
def test_empty_path_with_query_returns_none(self):
|
||||
"""Test that empty path with query string returns None."""
|
||||
result = extract_filename("http://example.com/?query=value", None)
|
||||
assert result is None
|
||||
|
||||
def test_path_only_with_query_string(self):
|
||||
"""Test bare path (not full URL) with query string."""
|
||||
result = extract_filename("/path/to/file.txt?extra=params", None)
|
||||
assert result == "file.txt"
|
||||
|
||||
Loading…
Reference in New Issue
Block a user