mirror of https://github.com/langgenius/dify.git
fix: remote filename will be 'inline' if Content-Disposition: inline (#25877)
Fixed the issue that filename will be 'inline' if response header contains `Content-Disposition: inline` while retrieving file by url. Co-authored-by: crazywoola <100913391+crazywoola@users.noreply.github.com> Co-authored-by: autofix-ci[bot] <114827586+autofix-ci[bot]@users.noreply.github.com>
This commit is contained in:
parent
e937c8c72e
commit
6841a09667
|
|
@ -8,6 +8,7 @@ from typing import Any
|
|||
import httpx
|
||||
from sqlalchemy import select
|
||||
from sqlalchemy.orm import Session
|
||||
from werkzeug.http import parse_options_header
|
||||
|
||||
from constants import AUDIO_EXTENSIONS, DOCUMENT_EXTENSIONS, IMAGE_EXTENSIONS, VIDEO_EXTENSIONS
|
||||
from core.file import File, FileBelongsTo, FileTransferMethod, FileType, FileUploadConfig, helpers
|
||||
|
|
@ -247,6 +248,25 @@ def _build_from_remote_url(
|
|||
)
|
||||
|
||||
|
||||
def _extract_filename(url_path: str, content_disposition: str | None) -> str | None:
|
||||
filename = None
|
||||
# Try to extract from Content-Disposition header first
|
||||
if content_disposition:
|
||||
_, params = parse_options_header(content_disposition)
|
||||
# RFC 5987 https://datatracker.ietf.org/doc/html/rfc5987: filename* takes precedence over filename
|
||||
filename = params.get("filename*") or params.get("filename")
|
||||
# Fallback to URL path if no filename from header
|
||||
if not filename:
|
||||
filename = os.path.basename(url_path)
|
||||
return filename or None
|
||||
|
||||
|
||||
def _guess_mime_type(filename: str) -> str:
|
||||
"""Guess MIME type from filename, returning empty string if None."""
|
||||
guessed_mime, _ = mimetypes.guess_type(filename)
|
||||
return guessed_mime or ""
|
||||
|
||||
|
||||
def _get_remote_file_info(url: str):
|
||||
file_size = -1
|
||||
parsed_url = urllib.parse.urlparse(url)
|
||||
|
|
@ -254,23 +274,26 @@ def _get_remote_file_info(url: str):
|
|||
filename = os.path.basename(url_path)
|
||||
|
||||
# Initialize mime_type from filename as fallback
|
||||
mime_type, _ = mimetypes.guess_type(filename)
|
||||
if mime_type is None:
|
||||
mime_type = ""
|
||||
mime_type = _guess_mime_type(filename)
|
||||
|
||||
resp = ssrf_proxy.head(url, follow_redirects=True)
|
||||
if resp.status_code == httpx.codes.OK:
|
||||
if content_disposition := resp.headers.get("Content-Disposition"):
|
||||
filename = str(content_disposition.split("filename=")[-1].strip('"'))
|
||||
# Re-guess mime_type from updated filename
|
||||
mime_type, _ = mimetypes.guess_type(filename)
|
||||
if mime_type is None:
|
||||
mime_type = ""
|
||||
content_disposition = resp.headers.get("Content-Disposition")
|
||||
extracted_filename = _extract_filename(url_path, content_disposition)
|
||||
if extracted_filename:
|
||||
filename = extracted_filename
|
||||
mime_type = _guess_mime_type(filename)
|
||||
file_size = int(resp.headers.get("Content-Length", file_size))
|
||||
# Fallback to Content-Type header if mime_type is still empty
|
||||
if not mime_type:
|
||||
mime_type = resp.headers.get("Content-Type", "").split(";")[0].strip()
|
||||
|
||||
if not filename:
|
||||
extension = mimetypes.guess_extension(mime_type) or ".bin"
|
||||
filename = f"{uuid.uuid4().hex}{extension}"
|
||||
if not mime_type:
|
||||
mime_type = _guess_mime_type(filename)
|
||||
|
||||
return mime_type, filename, file_size
|
||||
|
||||
|
||||
|
|
|
|||
|
|
@ -0,0 +1,115 @@
|
|||
import re
|
||||
|
||||
import pytest
|
||||
|
||||
from factories.file_factory import _get_remote_file_info
|
||||
|
||||
|
||||
class _FakeResponse:
|
||||
def __init__(self, status_code: int, headers: dict[str, str]):
|
||||
self.status_code = status_code
|
||||
self.headers = headers
|
||||
|
||||
|
||||
def _mock_head(monkeypatch: pytest.MonkeyPatch, headers: dict[str, str], status_code: int = 200):
|
||||
def _fake_head(url: str, follow_redirects: bool = True):
|
||||
return _FakeResponse(status_code=status_code, headers=headers)
|
||||
|
||||
monkeypatch.setattr("factories.file_factory.ssrf_proxy.head", _fake_head)
|
||||
|
||||
|
||||
class TestGetRemoteFileInfo:
|
||||
"""Tests for _get_remote_file_info focusing on filename extraction rules."""
|
||||
|
||||
def test_inline_no_filename(self, monkeypatch: pytest.MonkeyPatch):
|
||||
_mock_head(
|
||||
monkeypatch,
|
||||
{
|
||||
"Content-Disposition": "inline",
|
||||
"Content-Type": "application/pdf",
|
||||
"Content-Length": "123",
|
||||
},
|
||||
)
|
||||
mime_type, filename, size = _get_remote_file_info("http://example.com/some/path/file.pdf")
|
||||
assert filename == "file.pdf"
|
||||
assert mime_type == "application/pdf"
|
||||
assert size == 123
|
||||
|
||||
def test_attachment_no_filename(self, monkeypatch: pytest.MonkeyPatch):
|
||||
_mock_head(
|
||||
monkeypatch,
|
||||
{
|
||||
"Content-Disposition": "attachment",
|
||||
"Content-Type": "application/octet-stream",
|
||||
"Content-Length": "456",
|
||||
},
|
||||
)
|
||||
mime_type, filename, size = _get_remote_file_info("http://example.com/downloads/data.bin")
|
||||
assert filename == "data.bin"
|
||||
assert mime_type == "application/octet-stream"
|
||||
assert size == 456
|
||||
|
||||
def test_attachment_quoted_space_filename(self, monkeypatch: pytest.MonkeyPatch):
|
||||
_mock_head(
|
||||
monkeypatch,
|
||||
{
|
||||
"Content-Disposition": 'attachment; filename="file name.jpg"',
|
||||
"Content-Type": "image/jpeg",
|
||||
"Content-Length": "789",
|
||||
},
|
||||
)
|
||||
mime_type, filename, size = _get_remote_file_info("http://example.com/ignored")
|
||||
assert filename == "file name.jpg"
|
||||
assert mime_type == "image/jpeg"
|
||||
assert size == 789
|
||||
|
||||
def test_attachment_filename_star_percent20(self, monkeypatch: pytest.MonkeyPatch):
|
||||
_mock_head(
|
||||
monkeypatch,
|
||||
{
|
||||
"Content-Disposition": "attachment; filename*=UTF-8''file%20name.jpg",
|
||||
"Content-Type": "image/jpeg",
|
||||
},
|
||||
)
|
||||
mime_type, filename, _ = _get_remote_file_info("http://example.com/ignored")
|
||||
assert filename == "file name.jpg"
|
||||
assert mime_type == "image/jpeg"
|
||||
|
||||
def test_attachment_filename_star_chinese(self, monkeypatch: pytest.MonkeyPatch):
|
||||
_mock_head(
|
||||
monkeypatch,
|
||||
{
|
||||
"Content-Disposition": "attachment; filename*=UTF-8''%E6%B5%8B%E8%AF%95%E6%96%87%E4%BB%B6.jpg",
|
||||
"Content-Type": "image/jpeg",
|
||||
},
|
||||
)
|
||||
mime_type, filename, _ = _get_remote_file_info("http://example.com/ignored")
|
||||
assert filename == "测试文件.jpg"
|
||||
assert mime_type == "image/jpeg"
|
||||
|
||||
def test_filename_from_url_when_no_header(self, monkeypatch: pytest.MonkeyPatch):
|
||||
_mock_head(
|
||||
monkeypatch,
|
||||
{
|
||||
# No Content-Disposition
|
||||
"Content-Type": "text/plain",
|
||||
"Content-Length": "12",
|
||||
},
|
||||
)
|
||||
mime_type, filename, size = _get_remote_file_info("http://example.com/static/file.txt")
|
||||
assert filename == "file.txt"
|
||||
assert mime_type == "text/plain"
|
||||
assert size == 12
|
||||
|
||||
def test_no_filename_in_url_or_header_generates_uuid_bin(self, monkeypatch: pytest.MonkeyPatch):
|
||||
_mock_head(
|
||||
monkeypatch,
|
||||
{
|
||||
"Content-Disposition": "inline",
|
||||
"Content-Type": "application/octet-stream",
|
||||
},
|
||||
)
|
||||
mime_type, filename, _ = _get_remote_file_info("http://example.com/test/")
|
||||
# Should generate a random hex filename with .bin extension
|
||||
assert re.match(r"^[0-9a-f]{32}\.bin$", filename) is not None
|
||||
assert mime_type == "application/octet-stream"
|
||||
Loading…
Reference in New Issue