diff --git a/api/agent-notes/controllers/console/datasets/datasets_document.py.md b/api/agent-notes/controllers/console/datasets/datasets_document.py.md
new file mode 100644
index 0000000000..b100249981
--- /dev/null
+++ b/api/agent-notes/controllers/console/datasets/datasets_document.py.md
@@ -0,0 +1,52 @@
+## Purpose
+
+`api/controllers/console/datasets/datasets_document.py` contains the console (authenticated) APIs for managing dataset documents (list/create/update/delete, processing controls, estimates, etc.).
+
+## Storage model (uploaded files)
+
+- For local file uploads into a knowledge base, the binary is stored via `extensions.ext_storage.storage` under the key:
+  - `upload_files/<tenant_id>/<uuid>.<extension>`
+- File metadata is stored in the `upload_files` table (`UploadFile` model), keyed by `UploadFile.id`.
+- Dataset `Document` records reference the uploaded file via:
+  - `Document.data_source_info.upload_file_id`
+
+## Download endpoints
+
+- `GET /datasets/<dataset_id>/documents/<document_id>/download`
+
+  - Only supported when `Document.data_source_type == "upload_file"`.
+  - Performs dataset permission + tenant checks via `DocumentResource.get_document(...)`.
+  - Delegates `Document -> UploadFile` validation and signed URL generation to `DocumentService.get_document_download_url(...)`.
+  - Applies `cloud_edition_billing_rate_limit_check("knowledge")` to match other KB operations.
+  - Response body is **only**: `{ "url": "<signed-url>" }`.
+
+- `POST /datasets/<dataset_id>/documents/download-zip`
+
+  - Accepts `{ "document_ids": ["..."] }` (upload-file documents only).
+  - Returns `application/zip` as a single attachment download.
+  - Rationale: browsers often block multiple automatic downloads; a ZIP avoids that limitation.
+  - Applies `cloud_edition_billing_rate_limit_check("knowledge")`.
+  - Delegates dataset permission checks, document/upload-file validation, and download-name generation to
+    `DocumentService.prepare_document_batch_download_zip(...)` before streaming the ZIP.
+
+## Verification plan
+
+- Upload a document from a local file into a dataset.
+- Call the download endpoint and confirm it returns a signed URL.
+- Open the URL and confirm:
+  - Response headers force a download (`Content-Disposition`), and
+  - Downloaded bytes match the uploaded file.
+- Select multiple uploaded-file documents and download them as a ZIP; confirm all selected files exist in the archive.
+
+## Shared helpers
+
+- `DocumentService.get_document_download_url(document)` resolves the `UploadFile` and signs a download URL.
+- `DocumentService.prepare_document_batch_download_zip(...)` performs dataset permission checks, batches
+  document + upload file lookups, preserves request order, and generates the client-visible ZIP filename.
+- Internal helpers now live in `DocumentService` (`_get_upload_file_id_for_upload_file_document(...)`,
+  `_get_upload_file_for_upload_file_document(...)`, `_get_upload_files_by_document_id_for_zip_download(...)`).
+- ZIP packing is handled by `FileService.build_upload_files_zip_tempfile(...)`, which also:
+  - sanitizes entry names to avoid path traversal, and
+  - deduplicates names while preserving extensions (e.g., `doc.txt` → `doc (1).txt`).
+  Streaming the response and deferring cleanup are handled by the route via `send_file(path, ...)` + `ExitStack` +
+  `response.call_on_close(...)` (the file is deleted when the response is closed).
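As a quick, non-normative illustration of the two endpoints documented above, the sketch below requests a signed URL for a single document and then a ZIP for a batch. The base URL, bearer-token header, and IDs are placeholder assumptions rather than values from this patch; only the route shapes and response bodies come from the notes.

```python
# Illustrative client sketch, assuming a console API base URL and access token.
import requests

BASE = "https://cloud.example.com/console/api"          # assumption: console API prefix
HEADERS = {"Authorization": "Bearer <console-access-token>"}  # assumption: auth scheme

dataset_id = "<dataset-id>"
document_id = "<document-id>"

# Single document: ask for a signed URL, then fetch the bytes it points to.
resp = requests.get(
    f"{BASE}/datasets/{dataset_id}/documents/{document_id}/download",
    headers=HEADERS,
    timeout=30,
)
resp.raise_for_status()
signed_url = resp.json()["url"]
file_bytes = requests.get(signed_url, timeout=30).content

# Batch: request a ZIP for several upload-file documents and save the archive.
zip_resp = requests.post(
    f"{BASE}/datasets/{dataset_id}/documents/download-zip",
    headers=HEADERS,
    json={"document_ids": [document_id]},
    timeout=60,
)
zip_resp.raise_for_status()
with open("documents.zip", "wb") as f:
    f.write(zip_resp.content)
```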
diff --git a/api/agent-notes/services/dataset_service.py.md b/api/agent-notes/services/dataset_service.py.md
new file mode 100644
index 0000000000..b68ef345f5
--- /dev/null
+++ b/api/agent-notes/services/dataset_service.py.md
@@ -0,0 +1,18 @@
+## Purpose
+
+`api/services/dataset_service.py` hosts dataset/document service logic used by console and API controllers.
+
+## Batch document operations
+
+- Batch document workflows should avoid N+1 database queries by using set-based lookups.
+- Tenant checks must be enforced consistently across dataset/document operations.
+- `DocumentService.get_documents_by_ids(...)` fetches documents for a dataset using `id.in_(...)`.
+- `FileService.get_upload_files_by_ids(...)` performs a tenant-scoped batch lookup for `UploadFile` (dedupes ids with `set(...)`).
+- `DocumentService.get_document_download_url(...)` and `prepare_document_batch_download_zip(...)` handle
+  dataset/document permission checks plus `Document -> UploadFile` validation for download endpoints.
+
+## Verification plan
+
+- Exercise the document list and download endpoints that use the service helpers.
+- Confirm batch download uses a constant query count for documents + upload files.
+- Request a ZIP with a missing document id and confirm a 404 is returned.
diff --git a/api/agent-notes/services/file_service.py.md b/api/agent-notes/services/file_service.py.md
new file mode 100644
index 0000000000..cf394a1c05
--- /dev/null
+++ b/api/agent-notes/services/file_service.py.md
@@ -0,0 +1,35 @@
+## Purpose
+
+`api/services/file_service.py` owns business logic around `UploadFile` objects: upload validation, storage persistence,
+previews/generators, and deletion.
+
+## Key invariants
+
+- All storage I/O goes through `extensions.ext_storage.storage`.
+- Uploaded file keys follow: `upload_files/<tenant_id>/<uuid>.<extension>`.
+- Upload validation is enforced in `FileService.upload_file(...)` (blocked extensions, size limits, dataset-only types).
+
+## Batch lookup helpers
+
+- `FileService.get_upload_files_by_ids(tenant_id, upload_file_ids)` is the canonical tenant-scoped batch loader for
+  `UploadFile`.
+
+## Dataset document download helpers
+
+The dataset document download/ZIP endpoints now delegate “Document → UploadFile” validation and permission checks to
+`DocumentService` (`api/services/dataset_service.py`). `FileService` stays focused on generic `UploadFile` operations
+(uploading, previews, deletion), plus generic ZIP serving.
+
+### ZIP serving
+
+- `FileService.build_upload_files_zip_tempfile(...)` builds a ZIP from `UploadFile` objects and yields the flushed
+  tempfile **path** so callers can stream it (e.g., `send_file(path, ...)`) without hitting "read of closed file"
+  errors caused by file-handle lifecycle issues during streamed responses.
+- Flask `send_file(...)` and the `ExitStack`/`call_on_close(...)` cleanup pattern are handled in the route layer.
+
+## Verification plan
+
+- Unit: `api/tests/unit_tests/controllers/console/datasets/test_datasets_document_download.py`
+  - Verify signed URL generation for upload-file documents and ZIP download behavior for multiple documents.
+- Unit: `api/tests/unit_tests/services/test_file_service_zip_and_lookup.py`
+  - Verify ZIP packing produces a valid, openable archive and preserves file content.
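The stream-then-clean-up lifecycle these notes describe (yield a tempfile path, stream it with `send_file`, delete it via `call_on_close`) is easier to see in a minimal, self-contained sketch. The helper and route names below are illustrative only; the real implementation is `FileService.build_upload_files_zip_tempfile(...)` plus the dataset document controller shown later in this diff.

```python
# Minimal sketch of the tempfile-path + send_file + ExitStack cleanup pattern.
# `make_zip_tempfile` and the route are placeholders, not the real Dify helpers.
import os
import tempfile
from contextlib import ExitStack, contextmanager
from zipfile import ZIP_DEFLATED, ZipFile

from flask import Flask, send_file

app = Flask(__name__)


@contextmanager
def make_zip_tempfile(entries: dict[str, bytes]):
    # Write the ZIP to disk and hand back only the path; delete it when the context closes.
    tmp = tempfile.NamedTemporaryFile(suffix=".zip", delete=False)
    try:
        with ZipFile(tmp, mode="w", compression=ZIP_DEFLATED) as zf:
            for name, payload in entries.items():
                zf.writestr(name, payload)
        tmp.flush()
        tmp.close()
        yield tmp.name
    finally:
        tmp.close()
        if os.path.exists(tmp.name):
            os.remove(tmp.name)


@app.route("/demo-zip")
def demo_zip():
    with ExitStack() as stack:
        zip_path = stack.enter_context(make_zip_tempfile({"a.txt": b"one", "b.txt": b"two"}))
        response = send_file(zip_path, mimetype="application/zip", as_attachment=True, download_name="demo.zip")
        # Transfer cleanup out of the route so the tempfile outlives this function
        # and is removed only once the response has been fully sent.
        cleanup = stack.pop_all()
        response.call_on_close(cleanup.close)
        return response
```

The key move is `ExitStack.pop_all()`: it hands ownership of the tempfile cleanup to the response object, so deletion happens after Flask finishes streaming rather than when the `with` block exits.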
diff --git a/api/agent-notes/tests/unit_tests/controllers/console/datasets/test_datasets_document_download.py.md b/api/agent-notes/tests/unit_tests/controllers/console/datasets/test_datasets_document_download.py.md
new file mode 100644
index 0000000000..8f78dacde8
--- /dev/null
+++ b/api/agent-notes/tests/unit_tests/controllers/console/datasets/test_datasets_document_download.py.md
@@ -0,0 +1,28 @@
+## Purpose
+
+Unit tests for the console dataset document download endpoint:
+
+- `GET /datasets/<dataset_id>/documents/<document_id>/download`
+
+## Testing approach
+
+- Uses `Flask.test_request_context()` and calls the `Resource.get(...)` method directly.
+- Monkeypatches console decorators (`login_required`, `setup_required`, rate limit) to no-ops to keep the tests focused.
+- Mocks:
+  - `DatasetService.get_dataset` / `check_dataset_permission`
+  - `DocumentService.get_document` for single-file download tests
+  - `DocumentService.get_documents_by_ids` + `FileService.get_upload_files_by_ids` for ZIP download tests
+  - `FileService.get_upload_files_by_ids` for `UploadFile` lookups in single-file tests
+  - `services.dataset_service.file_helpers.get_signed_file_url` to return a deterministic URL
+- Document mocks include `id` fields so batch lookups can map documents by id.
+
+## Covered cases
+
+- Success returns `{ "url": "<signed-url>" }` for upload-file documents.
+- 404 when the document is not `upload_file`.
+- 404 when `upload_file_id` is missing.
+- 404 when the referenced `UploadFile` row does not exist.
+- 403 when the document tenant does not match the current tenant.
+- Batch ZIP download returns `application/zip` for upload-file documents.
+- Batch ZIP download rejects non-upload-file documents.
+- Batch ZIP download uses a random `.zip` attachment name (`download_name`), so tests only assert the suffix.
diff --git a/api/agent-notes/tests/unit_tests/services/test_file_service_zip_and_lookup.py.md b/api/agent-notes/tests/unit_tests/services/test_file_service_zip_and_lookup.py.md
new file mode 100644
index 0000000000..dbcdf26f10
--- /dev/null
+++ b/api/agent-notes/tests/unit_tests/services/test_file_service_zip_and_lookup.py.md
@@ -0,0 +1,18 @@
+## Purpose
+
+Unit tests for `api/services/file_service.py` helper methods that are not covered by higher-level controller tests.
+
+## What’s covered
+
+- `FileService.build_upload_files_zip_tempfile(...)`
+  - ZIP entry name sanitization (no directory components / traversal)
+  - name deduplication while preserving extensions
+  - writing streamed bytes from `storage.load(...)` into ZIP entries
+  - yields a tempfile path so callers can open/stream the ZIP without holding a live file handle
+- `FileService.get_upload_files_by_ids(...)`
+  - returns `{}` for empty id lists
+  - returns an id-keyed mapping for non-empty lists
+
+## Notes
+
+- These tests intentionally stub `storage.load` and `db.session.scalars(...).all()` to avoid needing a real DB/storage.
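A condensed sketch of the testing approach described above — patch the console decorators to no-ops, force a re-import of the controller module, then call the resource method inside `test_request_context()` — might look like the following. It strips the real fixtures down to the essentials and stubs the service layer directly, so treat it as an outline rather than a drop-in replacement for the full test module later in this diff.

```python
# Outline only: auth/billing decorators are neutralized before import, and the
# shared permission check plus service call are stubbed as assumed shortcuts.
import importlib
import sys

import pytest
from flask import Flask


@pytest.fixture
def datasets_document_module(monkeypatch: pytest.MonkeyPatch):
    from controllers.console import console_ns, wraps
    from libs import login

    def _noop(func):
        return func

    monkeypatch.setattr(login, "login_required", _noop)
    monkeypatch.setattr(wraps, "setup_required", _noop)
    monkeypatch.setattr(wraps, "account_initialization_required", _noop)
    monkeypatch.setattr(wraps, "cloud_edition_billing_resource_check", lambda *_a, **_k: (lambda f: f))
    monkeypatch.setattr(wraps, "cloud_edition_billing_rate_limit_check", lambda *_a, **_k: (lambda f: f))
    monkeypatch.setattr(console_ns, "route", lambda *_a, **_k: (lambda cls: cls))

    module_name = "controllers.console.datasets.datasets_document"
    sys.modules.pop(module_name, None)  # force a re-import so the no-op decorators are picked up
    return importlib.import_module(module_name)


def test_download_returns_signed_url(datasets_document_module, monkeypatch: pytest.MonkeyPatch) -> None:
    monkeypatch.setattr(
        datasets_document_module.DocumentResource, "get_document", lambda self, *_a, **_k: object()
    )
    monkeypatch.setattr(
        datasets_document_module.DocumentService,
        "get_document_download_url",
        lambda _document: "https://example.com/signed",
    )

    app = Flask(__name__)
    with app.test_request_context("/datasets/ds-1/documents/doc-1/download"):
        api = datasets_document_module.DocumentDownloadApi()
        assert api.get(dataset_id="ds-1", document_id="doc-1") == {"url": "https://example.com/signed"}
```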
diff --git a/api/controllers/console/datasets/datasets_document.py b/api/controllers/console/datasets/datasets_document.py index 707d90f044..2599e6293a 100644 --- a/api/controllers/console/datasets/datasets_document.py +++ b/api/controllers/console/datasets/datasets_document.py @@ -2,10 +2,12 @@ import json import logging from argparse import ArgumentTypeError from collections.abc import Sequence -from typing import Literal, cast +from contextlib import ExitStack +from typing import Any, Literal, cast +from uuid import UUID import sqlalchemy as sa -from flask import request +from flask import request, send_file from flask_restx import Resource, fields, marshal, marshal_with from pydantic import BaseModel, Field from sqlalchemy import asc, desc, select @@ -42,6 +44,7 @@ from models import DatasetProcessRule, Document, DocumentSegment, UploadFile from models.dataset import DocumentPipelineExecutionLog from services.dataset_service import DatasetService, DocumentService from services.entities.knowledge_entities.knowledge_entities import KnowledgeConfig, ProcessRule, RetrievalModel +from services.file_service import FileService from ..app.error import ( ProviderModelCurrentlyNotSupportError, @@ -65,6 +68,9 @@ from ..wraps import ( logger = logging.getLogger(__name__) +# NOTE: Keep constants near the top of the module for discoverability. +DOCUMENT_BATCH_DOWNLOAD_ZIP_MAX_DOCS = 100 + def _get_or_create_model(model_name: str, field_def): existing = console_ns.models.get(model_name) @@ -104,6 +110,12 @@ class DocumentRenamePayload(BaseModel): name: str +class DocumentBatchDownloadZipPayload(BaseModel): + """Request payload for bulk downloading documents as a zip archive.""" + + document_ids: list[UUID] = Field(..., min_length=1, max_length=DOCUMENT_BATCH_DOWNLOAD_ZIP_MAX_DOCS) + + class DocumentDatasetListParam(BaseModel): page: int = Field(1, title="Page", description="Page number.") limit: int = Field(20, title="Limit", description="Page size.") @@ -120,6 +132,7 @@ register_schema_models( RetrievalModel, DocumentRetryPayload, DocumentRenamePayload, + DocumentBatchDownloadZipPayload, ) @@ -853,6 +866,62 @@ class DocumentApi(DocumentResource): return {"result": "success"}, 204 +@console_ns.route("/datasets//documents//download") +class DocumentDownloadApi(DocumentResource): + """Return a signed download URL for a dataset document's original uploaded file.""" + + @console_ns.doc("get_dataset_document_download_url") + @console_ns.doc(description="Get a signed download URL for a dataset document's original uploaded file") + @setup_required + @login_required + @account_initialization_required + @cloud_edition_billing_rate_limit_check("knowledge") + def get(self, dataset_id: str, document_id: str) -> dict[str, Any]: + # Reuse the shared permission/tenant checks implemented in DocumentResource. 
+ document = self.get_document(str(dataset_id), str(document_id)) + return {"url": DocumentService.get_document_download_url(document)} + + +@console_ns.route("/datasets//documents/download-zip") +class DocumentBatchDownloadZipApi(DocumentResource): + """Download multiple uploaded-file documents as a single ZIP (avoids browser multi-download limits).""" + + @console_ns.doc("download_dataset_documents_as_zip") + @console_ns.doc(description="Download selected dataset documents as a single ZIP archive (upload-file only)") + @setup_required + @login_required + @account_initialization_required + @cloud_edition_billing_rate_limit_check("knowledge") + @console_ns.expect(console_ns.models[DocumentBatchDownloadZipPayload.__name__]) + def post(self, dataset_id: str): + """Stream a ZIP archive containing the requested uploaded documents.""" + # Parse and validate request payload. + payload = DocumentBatchDownloadZipPayload.model_validate(console_ns.payload or {}) + + current_user, current_tenant_id = current_account_with_tenant() + dataset_id = str(dataset_id) + document_ids: list[str] = [str(document_id) for document_id in payload.document_ids] + upload_files, download_name = DocumentService.prepare_document_batch_download_zip( + dataset_id=dataset_id, + document_ids=document_ids, + tenant_id=current_tenant_id, + current_user=current_user, + ) + + # Delegate ZIP packing to FileService, but keep Flask response+cleanup in the route. + with ExitStack() as stack: + zip_path = stack.enter_context(FileService.build_upload_files_zip_tempfile(upload_files=upload_files)) + response = send_file( + zip_path, + mimetype="application/zip", + as_attachment=True, + download_name=download_name, + ) + cleanup = stack.pop_all() + response.call_on_close(cleanup.close) + return response + + @console_ns.route("/datasets//documents//processing/") class DocumentProcessingApi(DocumentResource): @console_ns.doc("update_document_processing") diff --git a/api/services/dataset_service.py b/api/services/dataset_service.py index 18e5613438..be9a0e9279 100644 --- a/api/services/dataset_service.py +++ b/api/services/dataset_service.py @@ -13,10 +13,11 @@ import sqlalchemy as sa from redis.exceptions import LockNotOwnedError from sqlalchemy import exists, func, select from sqlalchemy.orm import Session -from werkzeug.exceptions import NotFound +from werkzeug.exceptions import Forbidden, NotFound from configs import dify_config from core.errors.error import LLMBadRequestError, ProviderTokenNotInitError +from core.file import helpers as file_helpers from core.helper.name_generator import generate_incremental_name from core.model_manager import ModelManager from core.model_runtime.entities.model_entities import ModelFeature, ModelType @@ -73,6 +74,7 @@ from services.errors.document import DocumentIndexingError from services.errors.file import FileNotExistsError from services.external_knowledge_service import ExternalDatasetService from services.feature_service import FeatureModel, FeatureService +from services.file_service import FileService from services.rag_pipeline.rag_pipeline import RagPipelineService from services.tag_service import TagService from services.vector_service import VectorService @@ -1162,6 +1164,7 @@ class DocumentService: Document.archived.is_(True), ), } + DOCUMENT_BATCH_DOWNLOAD_ZIP_FILENAME_EXTENSION = ".zip" @classmethod def normalize_display_status(cls, status: str | None) -> str | None: @@ -1288,6 +1291,143 @@ class DocumentService: else: return None + @staticmethod + def get_documents_by_ids(dataset_id: 
str, document_ids: Sequence[str]) -> Sequence[Document]: + """Fetch documents for a dataset in a single batch query.""" + if not document_ids: + return [] + document_id_list: list[str] = [str(document_id) for document_id in document_ids] + # Fetch all requested documents in one query to avoid N+1 lookups. + documents: Sequence[Document] = db.session.scalars( + select(Document).where( + Document.dataset_id == dataset_id, + Document.id.in_(document_id_list), + ) + ).all() + return documents + + @staticmethod + def get_document_download_url(document: Document) -> str: + """ + Return a signed download URL for an upload-file document. + """ + upload_file = DocumentService._get_upload_file_for_upload_file_document(document) + return file_helpers.get_signed_file_url(upload_file_id=upload_file.id, as_attachment=True) + + @staticmethod + def prepare_document_batch_download_zip( + *, + dataset_id: str, + document_ids: Sequence[str], + tenant_id: str, + current_user: Account, + ) -> tuple[list[UploadFile], str]: + """ + Resolve upload files for batch ZIP downloads and generate a client-visible filename. + """ + dataset = DatasetService.get_dataset(dataset_id) + if not dataset: + raise NotFound("Dataset not found.") + try: + DatasetService.check_dataset_permission(dataset, current_user) + except NoPermissionError as e: + raise Forbidden(str(e)) + + upload_files_by_document_id = DocumentService._get_upload_files_by_document_id_for_zip_download( + dataset_id=dataset_id, + document_ids=document_ids, + tenant_id=tenant_id, + ) + upload_files = [upload_files_by_document_id[document_id] for document_id in document_ids] + download_name = DocumentService._generate_document_batch_download_zip_filename() + return upload_files, download_name + + @staticmethod + def _generate_document_batch_download_zip_filename() -> str: + """ + Generate a random attachment filename for the batch download ZIP. + """ + return f"{uuid.uuid4().hex}{DocumentService.DOCUMENT_BATCH_DOWNLOAD_ZIP_FILENAME_EXTENSION}" + + @staticmethod + def _get_upload_file_id_for_upload_file_document( + document: Document, + *, + invalid_source_message: str, + missing_file_message: str, + ) -> str: + """ + Normalize and validate `Document -> UploadFile` linkage for download flows. + """ + if document.data_source_type != "upload_file": + raise NotFound(invalid_source_message) + + data_source_info: dict[str, Any] = document.data_source_info_dict or {} + upload_file_id: str | None = data_source_info.get("upload_file_id") + if not upload_file_id: + raise NotFound(missing_file_message) + + return str(upload_file_id) + + @staticmethod + def _get_upload_file_for_upload_file_document(document: Document) -> UploadFile: + """ + Load the `UploadFile` row for an upload-file document. + """ + upload_file_id = DocumentService._get_upload_file_id_for_upload_file_document( + document, + invalid_source_message="Document does not have an uploaded file to download.", + missing_file_message="Uploaded file not found.", + ) + upload_files_by_id = FileService.get_upload_files_by_ids(document.tenant_id, [upload_file_id]) + upload_file = upload_files_by_id.get(upload_file_id) + if not upload_file: + raise NotFound("Uploaded file not found.") + return upload_file + + @staticmethod + def _get_upload_files_by_document_id_for_zip_download( + *, + dataset_id: str, + document_ids: Sequence[str], + tenant_id: str, + ) -> dict[str, UploadFile]: + """ + Batch load upload files keyed by document id for ZIP downloads. 
+ """ + document_id_list: list[str] = [str(document_id) for document_id in document_ids] + + documents = DocumentService.get_documents_by_ids(dataset_id, document_id_list) + documents_by_id: dict[str, Document] = {str(document.id): document for document in documents} + + missing_document_ids: set[str] = set(document_id_list) - set(documents_by_id.keys()) + if missing_document_ids: + raise NotFound("Document not found.") + + upload_file_ids: list[str] = [] + upload_file_ids_by_document_id: dict[str, str] = {} + for document_id, document in documents_by_id.items(): + if document.tenant_id != tenant_id: + raise Forbidden("No permission.") + + upload_file_id = DocumentService._get_upload_file_id_for_upload_file_document( + document, + invalid_source_message="Only uploaded-file documents can be downloaded as ZIP.", + missing_file_message="Only uploaded-file documents can be downloaded as ZIP.", + ) + upload_file_ids.append(upload_file_id) + upload_file_ids_by_document_id[document_id] = upload_file_id + + upload_files_by_id = FileService.get_upload_files_by_ids(tenant_id, upload_file_ids) + missing_upload_file_ids: set[str] = set(upload_file_ids) - set(upload_files_by_id.keys()) + if missing_upload_file_ids: + raise NotFound("Only uploaded-file documents can be downloaded as ZIP.") + + return { + document_id: upload_files_by_id[upload_file_id] + for document_id, upload_file_id in upload_file_ids_by_document_id.items() + } + @staticmethod def get_document_by_id(document_id: str) -> Document | None: document = db.session.query(Document).where(Document.id == document_id).first() diff --git a/api/services/file_service.py b/api/services/file_service.py index 0911cf38c4..a0a99f3f82 100644 --- a/api/services/file_service.py +++ b/api/services/file_service.py @@ -2,7 +2,11 @@ import base64 import hashlib import os import uuid +from collections.abc import Iterator, Sequence +from contextlib import contextmanager, suppress +from tempfile import NamedTemporaryFile from typing import Literal, Union +from zipfile import ZIP_DEFLATED, ZipFile from sqlalchemy import Engine, select from sqlalchemy.orm import Session, sessionmaker @@ -17,6 +21,7 @@ from constants import ( ) from core.file import helpers as file_helpers from core.rag.extractor.extract_processor import ExtractProcessor +from extensions.ext_database import db from extensions.ext_storage import storage from libs.datetime_utils import naive_utc_now from libs.helper import extract_tenant_id @@ -167,6 +172,9 @@ class FileService: return upload_file def get_file_preview(self, file_id: str): + """ + Return a short text preview extracted from a document file. + """ with self._session_maker(expire_on_commit=False) as session: upload_file = session.query(UploadFile).where(UploadFile.id == file_id).first() @@ -253,3 +261,101 @@ class FileService: return storage.delete(upload_file.key) session.delete(upload_file) + + @staticmethod + def get_upload_files_by_ids(tenant_id: str, upload_file_ids: Sequence[str]) -> dict[str, UploadFile]: + """ + Fetch `UploadFile` rows for a tenant in a single batch query. + + This is a generic `UploadFile` lookup helper (not dataset/document specific), so it lives in `FileService`. + """ + if not upload_file_ids: + return {} + + # Normalize and deduplicate ids before using them in the IN clause. + upload_file_id_list: list[str] = [str(upload_file_id) for upload_file_id in upload_file_ids] + unique_upload_file_ids: list[str] = list(set(upload_file_id_list)) + + # Fetch upload files in one query for efficient batch access. 
+ upload_files: Sequence[UploadFile] = db.session.scalars( + select(UploadFile).where( + UploadFile.tenant_id == tenant_id, + UploadFile.id.in_(unique_upload_file_ids), + ) + ).all() + return {str(upload_file.id): upload_file for upload_file in upload_files} + + @staticmethod + def _sanitize_zip_entry_name(name: str) -> str: + """ + Sanitize a ZIP entry name to avoid path traversal and weird separators. + + We keep this conservative: the upload flow already rejects `/` and `\\`, but older rows (or imported data) + could still contain unsafe names. + """ + # Drop any directory components and prevent empty names. + base = os.path.basename(name).strip() or "file" + + # ZIP uses forward slashes as separators; remove any residual separator characters. + return base.replace("/", "_").replace("\\", "_") + + @staticmethod + def _dedupe_zip_entry_name(original_name: str, used_names: set[str]) -> str: + """ + Return a unique ZIP entry name, inserting suffixes before the extension. + """ + # Keep the original name when it's not already used. + if original_name not in used_names: + return original_name + + # Insert suffixes before the extension (e.g., "doc.txt" -> "doc (1).txt"). + stem, extension = os.path.splitext(original_name) + suffix = 1 + while True: + candidate = f"{stem} ({suffix}){extension}" + if candidate not in used_names: + return candidate + suffix += 1 + + @staticmethod + @contextmanager + def build_upload_files_zip_tempfile( + *, + upload_files: Sequence[UploadFile], + ) -> Iterator[str]: + """ + Build a ZIP from `UploadFile`s and yield a tempfile path. + + We yield a path (rather than an open file handle) to avoid "read of closed file" issues when Flask/Werkzeug + streams responses. The caller is expected to keep this context open until the response is fully sent, then + close it (e.g., via `response.call_on_close(...)`) to delete the tempfile. + """ + used_names: set[str] = set() + + # Build a ZIP in a temp file and keep it on disk until the caller finishes streaming it. + tmp_path: str | None = None + try: + with NamedTemporaryFile(mode="w+b", suffix=".zip", delete=False) as tmp: + tmp_path = tmp.name + with ZipFile(tmp, mode="w", compression=ZIP_DEFLATED) as zf: + for upload_file in upload_files: + # Ensure the entry name is safe and unique. + safe_name = FileService._sanitize_zip_entry_name(upload_file.name) + arcname = FileService._dedupe_zip_entry_name(safe_name, used_names) + used_names.add(arcname) + + # Stream file bytes from storage into the ZIP entry. + with zf.open(arcname, "w") as entry: + for chunk in storage.load(upload_file.key, stream=True): + entry.write(chunk) + + # Flush so `send_file(path, ...)` can re-open it safely on all platforms. + tmp.flush() + + assert tmp_path is not None + yield tmp_path + finally: + # Remove the temp file when the context is closed (typically after the response finishes streaming). + if tmp_path is not None: + with suppress(FileNotFoundError): + os.remove(tmp_path) diff --git a/api/tests/unit_tests/controllers/console/datasets/test_datasets_document_download.py b/api/tests/unit_tests/controllers/console/datasets/test_datasets_document_download.py new file mode 100644 index 0000000000..d5d7ee95c5 --- /dev/null +++ b/api/tests/unit_tests/controllers/console/datasets/test_datasets_document_download.py @@ -0,0 +1,430 @@ +""" +Unit tests for the dataset document download endpoint. + +These tests validate that the controller returns a signed download URL for +upload-file documents, and rejects unsupported or missing file cases. 
+""" + +from __future__ import annotations + +import importlib +import sys +from collections import UserDict +from io import BytesIO +from types import SimpleNamespace +from typing import Any +from zipfile import ZipFile + +import pytest +from flask import Flask +from werkzeug.exceptions import Forbidden, NotFound + + +@pytest.fixture +def app() -> Flask: + """Create a minimal Flask app for request-context based controller tests.""" + app = Flask(__name__) + app.config["TESTING"] = True + return app + + +@pytest.fixture +def datasets_document_module(monkeypatch: pytest.MonkeyPatch): + """ + Reload `controllers.console.datasets.datasets_document` with lightweight decorators. + + We patch auth / setup / rate-limit decorators to no-ops so we can unit test the + controller logic without requiring the full console stack. + """ + + from controllers.console import console_ns, wraps + from libs import login + + def _noop(func): # type: ignore[no-untyped-def] + return func + + # Bypass login/setup/account checks in unit tests. + monkeypatch.setattr(login, "login_required", _noop) + monkeypatch.setattr(wraps, "setup_required", _noop) + monkeypatch.setattr(wraps, "account_initialization_required", _noop) + + # Bypass billing-related decorators used by other endpoints in this module. + monkeypatch.setattr(wraps, "cloud_edition_billing_resource_check", lambda *_args, **_kwargs: (lambda f: f)) + monkeypatch.setattr(wraps, "cloud_edition_billing_rate_limit_check", lambda *_args, **_kwargs: (lambda f: f)) + + # Avoid Flask-RESTX route registration side effects during import. + def _noop_route(*_args, **_kwargs): # type: ignore[override] + def _decorator(cls): + return cls + + return _decorator + + monkeypatch.setattr(console_ns, "route", _noop_route) + + module_name = "controllers.console.datasets.datasets_document" + sys.modules.pop(module_name, None) + return importlib.import_module(module_name) + + +def _mock_user(*, is_dataset_editor: bool = True) -> SimpleNamespace: + """Build a minimal user object compatible with dataset permission checks.""" + return SimpleNamespace(is_dataset_editor=is_dataset_editor, id="user-123") + + +def _mock_document( + *, + document_id: str, + tenant_id: str, + data_source_type: str, + upload_file_id: str | None, +) -> SimpleNamespace: + """Build a minimal document object used by the controller.""" + data_source_info_dict: dict[str, Any] | None = None + if upload_file_id is not None: + data_source_info_dict = {"upload_file_id": upload_file_id} + else: + data_source_info_dict = {} + + return SimpleNamespace( + id=document_id, + tenant_id=tenant_id, + data_source_type=data_source_type, + data_source_info_dict=data_source_info_dict, + ) + + +def _wire_common_success_mocks( + *, + module, + monkeypatch: pytest.MonkeyPatch, + current_tenant_id: str, + document_tenant_id: str, + data_source_type: str, + upload_file_id: str | None, + upload_file_exists: bool, + signed_url: str, +) -> None: + """Patch controller dependencies to create a deterministic test environment.""" + import services.dataset_service as dataset_service_module + + # Make `current_account_with_tenant()` return a known user + tenant id. + monkeypatch.setattr(module, "current_account_with_tenant", lambda: (_mock_user(), current_tenant_id)) + + # Return a dataset object and allow permission checks to pass. 
+ monkeypatch.setattr(module.DatasetService, "get_dataset", lambda _dataset_id: SimpleNamespace(id="ds-1")) + monkeypatch.setattr(module.DatasetService, "check_dataset_permission", lambda *_args, **_kwargs: None) + + # Return a document that will be validated inside DocumentResource.get_document. + document = _mock_document( + document_id="doc-1", + tenant_id=document_tenant_id, + data_source_type=data_source_type, + upload_file_id=upload_file_id, + ) + monkeypatch.setattr(module.DocumentService, "get_document", lambda *_args, **_kwargs: document) + + # Mock UploadFile lookup via FileService batch helper. + upload_files_by_id: dict[str, Any] = {} + if upload_file_exists and upload_file_id is not None: + upload_files_by_id[str(upload_file_id)] = SimpleNamespace(id=str(upload_file_id)) + monkeypatch.setattr(module.FileService, "get_upload_files_by_ids", lambda *_args, **_kwargs: upload_files_by_id) + + # Mock signing helper so the returned URL is deterministic. + monkeypatch.setattr(dataset_service_module.file_helpers, "get_signed_file_url", lambda **_kwargs: signed_url) + + +def _mock_send_file(obj, **kwargs): # type: ignore[no-untyped-def] + """Return a lightweight representation of `send_file(...)` for unit tests.""" + + class _ResponseMock(UserDict): + def __init__(self, sent_file: object, send_file_kwargs: dict[str, object]) -> None: + super().__init__({"_sent_file": sent_file, "_send_file_kwargs": send_file_kwargs}) + self._on_close: object | None = None + + def call_on_close(self, func): # type: ignore[no-untyped-def] + self._on_close = func + return func + + return _ResponseMock(obj, kwargs) + + +def test_batch_download_zip_returns_send_file( + app: Flask, datasets_document_module, monkeypatch: pytest.MonkeyPatch +) -> None: + """Ensure batch ZIP download returns a zip attachment via `send_file`.""" + + # Arrange common permission mocks. + monkeypatch.setattr(datasets_document_module, "current_account_with_tenant", lambda: (_mock_user(), "tenant-123")) + monkeypatch.setattr( + datasets_document_module.DatasetService, "get_dataset", lambda _dataset_id: SimpleNamespace(id="ds-1") + ) + monkeypatch.setattr( + datasets_document_module.DatasetService, "check_dataset_permission", lambda *_args, **_kwargs: None + ) + + # Two upload-file documents, each referencing an UploadFile. + doc1 = _mock_document( + document_id="11111111-1111-1111-1111-111111111111", + tenant_id="tenant-123", + data_source_type="upload_file", + upload_file_id="file-1", + ) + doc2 = _mock_document( + document_id="22222222-2222-2222-2222-222222222222", + tenant_id="tenant-123", + data_source_type="upload_file", + upload_file_id="file-2", + ) + monkeypatch.setattr( + datasets_document_module.DocumentService, + "get_documents_by_ids", + lambda *_args, **_kwargs: [doc1, doc2], + ) + monkeypatch.setattr( + datasets_document_module.FileService, + "get_upload_files_by_ids", + lambda *_args, **_kwargs: { + "file-1": SimpleNamespace(id="file-1", name="a.txt", key="k1"), + "file-2": SimpleNamespace(id="file-2", name="b.txt", key="k2"), + }, + ) + + # Mock storage streaming content. + import services.file_service as file_service_module + + monkeypatch.setattr(file_service_module.storage, "load", lambda _key, stream=True: [b"hello"]) + + # Replace send_file used by the controller to avoid a real Flask response object. 
+ monkeypatch.setattr(datasets_document_module, "send_file", _mock_send_file) + + # Act + with app.test_request_context( + "/datasets/ds-1/documents/download-zip", + method="POST", + json={"document_ids": ["11111111-1111-1111-1111-111111111111", "22222222-2222-2222-2222-222222222222"]}, + ): + api = datasets_document_module.DocumentBatchDownloadZipApi() + result = api.post(dataset_id="ds-1") + + # Assert: we returned via send_file with correct mime type and attachment. + assert result["_send_file_kwargs"]["mimetype"] == "application/zip" + assert result["_send_file_kwargs"]["as_attachment"] is True + assert isinstance(result["_send_file_kwargs"]["download_name"], str) + assert result["_send_file_kwargs"]["download_name"].endswith(".zip") + # Ensure our cleanup hook is registered and execute it to avoid temp file leaks in unit tests. + assert getattr(result, "_on_close", None) is not None + result._on_close() # type: ignore[attr-defined] + + +def test_batch_download_zip_response_is_openable_zip( + app: Flask, datasets_document_module, monkeypatch: pytest.MonkeyPatch +) -> None: + """Ensure the real Flask `send_file` response body is a valid ZIP that can be opened.""" + + # Arrange: same controller mocks as the lightweight send_file test, but we keep the real `send_file`. + monkeypatch.setattr(datasets_document_module, "current_account_with_tenant", lambda: (_mock_user(), "tenant-123")) + monkeypatch.setattr( + datasets_document_module.DatasetService, "get_dataset", lambda _dataset_id: SimpleNamespace(id="ds-1") + ) + monkeypatch.setattr( + datasets_document_module.DatasetService, "check_dataset_permission", lambda *_args, **_kwargs: None + ) + + doc1 = _mock_document( + document_id="33333333-3333-3333-3333-333333333333", + tenant_id="tenant-123", + data_source_type="upload_file", + upload_file_id="file-1", + ) + doc2 = _mock_document( + document_id="44444444-4444-4444-4444-444444444444", + tenant_id="tenant-123", + data_source_type="upload_file", + upload_file_id="file-2", + ) + monkeypatch.setattr( + datasets_document_module.DocumentService, + "get_documents_by_ids", + lambda *_args, **_kwargs: [doc1, doc2], + ) + monkeypatch.setattr( + datasets_document_module.FileService, + "get_upload_files_by_ids", + lambda *_args, **_kwargs: { + "file-1": SimpleNamespace(id="file-1", name="a.txt", key="k1"), + "file-2": SimpleNamespace(id="file-2", name="b.txt", key="k2"), + }, + ) + + # Stream distinct bytes per key so we can verify both ZIP entries. + import services.file_service as file_service_module + + monkeypatch.setattr( + file_service_module.storage, "load", lambda key, stream=True: [b"one"] if key == "k1" else [b"two"] + ) + + # Act + with app.test_request_context( + "/datasets/ds-1/documents/download-zip", + method="POST", + json={"document_ids": ["33333333-3333-3333-3333-333333333333", "44444444-4444-4444-4444-444444444444"]}, + ): + api = datasets_document_module.DocumentBatchDownloadZipApi() + response = api.post(dataset_id="ds-1") + + # Assert: response body is a valid ZIP and contains the expected entries. 
+ response.direct_passthrough = False + data = response.get_data() + response.close() + + with ZipFile(BytesIO(data), mode="r") as zf: + assert zf.namelist() == ["a.txt", "b.txt"] + assert zf.read("a.txt") == b"one" + assert zf.read("b.txt") == b"two" + + +def test_batch_download_zip_rejects_non_upload_file_document( + app: Flask, datasets_document_module, monkeypatch: pytest.MonkeyPatch +) -> None: + """Ensure batch ZIP download rejects non upload-file documents.""" + + monkeypatch.setattr(datasets_document_module, "current_account_with_tenant", lambda: (_mock_user(), "tenant-123")) + monkeypatch.setattr( + datasets_document_module.DatasetService, "get_dataset", lambda _dataset_id: SimpleNamespace(id="ds-1") + ) + monkeypatch.setattr( + datasets_document_module.DatasetService, "check_dataset_permission", lambda *_args, **_kwargs: None + ) + + doc = _mock_document( + document_id="55555555-5555-5555-5555-555555555555", + tenant_id="tenant-123", + data_source_type="website_crawl", + upload_file_id="file-1", + ) + monkeypatch.setattr( + datasets_document_module.DocumentService, + "get_documents_by_ids", + lambda *_args, **_kwargs: [doc], + ) + + with app.test_request_context( + "/datasets/ds-1/documents/download-zip", + method="POST", + json={"document_ids": ["55555555-5555-5555-5555-555555555555"]}, + ): + api = datasets_document_module.DocumentBatchDownloadZipApi() + with pytest.raises(NotFound): + api.post(dataset_id="ds-1") + + +def test_document_download_returns_url_for_upload_file_document( + app: Flask, datasets_document_module, monkeypatch: pytest.MonkeyPatch +) -> None: + """Ensure upload-file documents return a `{url}` JSON payload.""" + + _wire_common_success_mocks( + module=datasets_document_module, + monkeypatch=monkeypatch, + current_tenant_id="tenant-123", + document_tenant_id="tenant-123", + data_source_type="upload_file", + upload_file_id="file-123", + upload_file_exists=True, + signed_url="https://example.com/signed", + ) + + # Build a request context then call the resource method directly. 
+ with app.test_request_context("/datasets/ds-1/documents/doc-1/download", method="GET"): + api = datasets_document_module.DocumentDownloadApi() + result = api.get(dataset_id="ds-1", document_id="doc-1") + + assert result == {"url": "https://example.com/signed"} + + +def test_document_download_rejects_non_upload_file_document( + app: Flask, datasets_document_module, monkeypatch: pytest.MonkeyPatch +) -> None: + """Ensure non-upload documents raise 404 (no file to download).""" + + _wire_common_success_mocks( + module=datasets_document_module, + monkeypatch=monkeypatch, + current_tenant_id="tenant-123", + document_tenant_id="tenant-123", + data_source_type="website_crawl", + upload_file_id="file-123", + upload_file_exists=True, + signed_url="https://example.com/signed", + ) + + with app.test_request_context("/datasets/ds-1/documents/doc-1/download", method="GET"): + api = datasets_document_module.DocumentDownloadApi() + with pytest.raises(NotFound): + api.get(dataset_id="ds-1", document_id="doc-1") + + +def test_document_download_rejects_missing_upload_file_id( + app: Flask, datasets_document_module, monkeypatch: pytest.MonkeyPatch +) -> None: + """Ensure missing `upload_file_id` raises 404.""" + + _wire_common_success_mocks( + module=datasets_document_module, + monkeypatch=monkeypatch, + current_tenant_id="tenant-123", + document_tenant_id="tenant-123", + data_source_type="upload_file", + upload_file_id=None, + upload_file_exists=False, + signed_url="https://example.com/signed", + ) + + with app.test_request_context("/datasets/ds-1/documents/doc-1/download", method="GET"): + api = datasets_document_module.DocumentDownloadApi() + with pytest.raises(NotFound): + api.get(dataset_id="ds-1", document_id="doc-1") + + +def test_document_download_rejects_when_upload_file_record_missing( + app: Flask, datasets_document_module, monkeypatch: pytest.MonkeyPatch +) -> None: + """Ensure missing UploadFile row raises 404.""" + + _wire_common_success_mocks( + module=datasets_document_module, + monkeypatch=monkeypatch, + current_tenant_id="tenant-123", + document_tenant_id="tenant-123", + data_source_type="upload_file", + upload_file_id="file-123", + upload_file_exists=False, + signed_url="https://example.com/signed", + ) + + with app.test_request_context("/datasets/ds-1/documents/doc-1/download", method="GET"): + api = datasets_document_module.DocumentDownloadApi() + with pytest.raises(NotFound): + api.get(dataset_id="ds-1", document_id="doc-1") + + +def test_document_download_rejects_tenant_mismatch( + app: Flask, datasets_document_module, monkeypatch: pytest.MonkeyPatch +) -> None: + """Ensure tenant mismatch is rejected by the shared `get_document()` permission check.""" + + _wire_common_success_mocks( + module=datasets_document_module, + monkeypatch=monkeypatch, + current_tenant_id="tenant-123", + document_tenant_id="tenant-999", + data_source_type="upload_file", + upload_file_id="file-123", + upload_file_exists=True, + signed_url="https://example.com/signed", + ) + + with app.test_request_context("/datasets/ds-1/documents/doc-1/download", method="GET"): + api = datasets_document_module.DocumentDownloadApi() + with pytest.raises(Forbidden): + api.get(dataset_id="ds-1", document_id="doc-1") diff --git a/api/tests/unit_tests/services/test_file_service_zip_and_lookup.py b/api/tests/unit_tests/services/test_file_service_zip_and_lookup.py new file mode 100644 index 0000000000..7b4d349e33 --- /dev/null +++ b/api/tests/unit_tests/services/test_file_service_zip_and_lookup.py @@ -0,0 +1,99 @@ +""" +Unit tests 
for `services.file_service.FileService` helpers. + +We keep these tests focused on: +- ZIP tempfile building (sanitization + deduplication + content writes) +- tenant-scoped batch lookup behavior (`get_upload_files_by_ids`) +""" + +from __future__ import annotations + +from types import SimpleNamespace +from typing import Any +from zipfile import ZipFile + +import pytest + +import services.file_service as file_service_module +from services.file_service import FileService + + +def test_build_upload_files_zip_tempfile_sanitizes_and_dedupes_names(monkeypatch: pytest.MonkeyPatch) -> None: + """Ensure ZIP entry names are safe and unique while preserving extensions.""" + + # Arrange: three upload files that all sanitize down to the same basename ("b.txt"). + upload_files: list[Any] = [ + SimpleNamespace(name="a/b.txt", key="k1"), + SimpleNamespace(name="c/b.txt", key="k2"), + SimpleNamespace(name="../b.txt", key="k3"), + ] + + # Stream distinct bytes per key so we can verify content is written to the right entry. + data_by_key: dict[str, list[bytes]] = {"k1": [b"one"], "k2": [b"two"], "k3": [b"three"]} + + def _load(key: str, stream: bool = True) -> list[bytes]: + # Return the corresponding chunks for this key (the production code iterates chunks). + assert stream is True + return data_by_key[key] + + monkeypatch.setattr(file_service_module.storage, "load", _load) + + # Act: build zip in a tempfile. + with FileService.build_upload_files_zip_tempfile(upload_files=upload_files) as tmp: + with ZipFile(tmp, mode="r") as zf: + # Assert: names are sanitized (no directory components) and deduped with suffixes. + assert zf.namelist() == ["b.txt", "b (1).txt", "b (2).txt"] + + # Assert: each entry contains the correct bytes from storage. + assert zf.read("b.txt") == b"one" + assert zf.read("b (1).txt") == b"two" + assert zf.read("b (2).txt") == b"three" + + +def test_get_upload_files_by_ids_returns_empty_when_no_ids(monkeypatch: pytest.MonkeyPatch) -> None: + """Ensure empty input returns an empty mapping without hitting the database.""" + + class _Session: + def scalars(self, _stmt): # type: ignore[no-untyped-def] + raise AssertionError("db.session.scalars should not be called for empty id lists") + + monkeypatch.setattr(file_service_module, "db", SimpleNamespace(session=_Session())) + + assert FileService.get_upload_files_by_ids("tenant-1", []) == {} + + +def test_get_upload_files_by_ids_returns_id_keyed_mapping(monkeypatch: pytest.MonkeyPatch) -> None: + """Ensure batch lookup returns a dict keyed by stringified UploadFile ids.""" + + upload_files: list[Any] = [ + SimpleNamespace(id="file-1", tenant_id="tenant-1"), + SimpleNamespace(id="file-2", tenant_id="tenant-1"), + ] + + class _ScalarResult: + def __init__(self, items: list[Any]) -> None: + self._items = items + + def all(self) -> list[Any]: + return self._items + + class _Session: + def __init__(self, items: list[Any]) -> None: + self._items = items + self.calls: list[object] = [] + + def scalars(self, stmt): # type: ignore[no-untyped-def] + # Capture the statement so we can at least assert the query path is taken. + self.calls.append(stmt) + return _ScalarResult(self._items) + + session = _Session(upload_files) + monkeypatch.setattr(file_service_module, "db", SimpleNamespace(session=session)) + + # Provide duplicates to ensure callers can safely pass repeated ids. 
+ result = FileService.get_upload_files_by_ids("tenant-1", ["file-1", "file-1", "file-2"]) + + assert set(result.keys()) == {"file-1", "file-2"} + assert result["file-1"].id == "file-1" + assert result["file-2"].id == "file-2" + assert len(session.calls) == 1 diff --git a/web/app/components/base/chat/chat/citation/popup.tsx b/web/app/components/base/chat/chat/citation/popup.tsx index c1633227d2..e033a9fa77 100644 --- a/web/app/components/base/chat/chat/citation/popup.tsx +++ b/web/app/components/base/chat/chat/citation/popup.tsx @@ -1,4 +1,4 @@ -import type { FC } from 'react' +import type { FC, MouseEvent } from 'react' import type { Resources } from './index' import Link from 'next/link' import { Fragment, useState } from 'react' @@ -18,6 +18,8 @@ import { PortalToFollowElemContent, PortalToFollowElemTrigger, } from '@/app/components/base/portal-to-follow-elem' +import { useDocumentDownload } from '@/service/knowledge/use-document' +import { downloadUrl } from '@/utils/download' import ProgressTooltip from './progress-tooltip' import Tooltip from './tooltip' @@ -36,6 +38,30 @@ const Popup: FC = ({ ? (/\.([^.]*)$/.exec(data.documentName)?.[1] || '') : 'notion' + const { mutateAsync: downloadDocument, isPending: isDownloading } = useDocumentDownload() + + /** + * Download the original uploaded file for citations whose data source is upload-file. + * We request a signed URL from the dataset document download endpoint, then trigger browser download. + */ + const handleDownloadUploadFile = async (e: MouseEvent) => { + // Prevent toggling the citation popup when user clicks the download link. + e.preventDefault() + e.stopPropagation() + + // Only upload-file citations can be downloaded this way (needs dataset/document ids). + const isUploadFile = data.dataSourceType === 'upload_file' || data.dataSourceType === 'file' + const datasetId = data.sources?.[0]?.dataset_id + const documentId = data.documentId || data.sources?.[0]?.document_id + if (!isUploadFile || !datasetId || !documentId || isDownloading) + return + + // Fetch signed URL (usually points to `/files//file-preview?...&as_attachment=true`). + const res = await downloadDocument({ datasetId, documentId }) + if (res?.url) + downloadUrl({ url: res.url, fileName: data.documentName }) + } + return ( = ({ setOpen(v => !v)}>
+ {/* Keep the trigger purely for opening the popup (no download link here). */}
{data.documentName}
@@ -57,7 +84,21 @@ const Popup: FC = ({
-
{data.documentName}
+
+ {/* If it's an upload-file reference, the title becomes a download link. */} + {(data.dataSourceType === 'upload_file' || data.dataSourceType === 'file') && !!data.sources?.[0]?.dataset_id + ? ( + + ) + : data.documentName} +
diff --git a/web/app/components/datasets/documents/components/list.tsx b/web/app/components/datasets/documents/components/list.tsx index 2bf9c278c4..0c78da113b 100644 --- a/web/app/components/datasets/documents/components/list.tsx +++ b/web/app/components/datasets/documents/components/list.tsx @@ -30,9 +30,10 @@ import { useDatasetDetailContextWithSelector as useDatasetDetailContext } from ' import useTimestamp from '@/hooks/use-timestamp' import { ChunkingMode, DataSourceType, DocumentActionType } from '@/models/datasets' import { DatasourceType } from '@/models/pipeline' -import { useDocumentArchive, useDocumentBatchRetryIndex, useDocumentDelete, useDocumentDisable, useDocumentEnable } from '@/service/knowledge/use-document' +import { useDocumentArchive, useDocumentBatchRetryIndex, useDocumentDelete, useDocumentDisable, useDocumentDownloadZip, useDocumentEnable } from '@/service/knowledge/use-document' import { asyncRunSafe } from '@/utils' import { cn } from '@/utils/classnames' +import { downloadBlob } from '@/utils/download' import { formatNumber } from '@/utils/format' import BatchAction from '../detail/completed/common/batch-action' import StatusItem from '../status-item' @@ -222,6 +223,7 @@ const DocumentList: FC = ({ const { mutateAsync: disableDocument } = useDocumentDisable() const { mutateAsync: deleteDocument } = useDocumentDelete() const { mutateAsync: retryIndexDocument } = useDocumentBatchRetryIndex() + const { mutateAsync: requestDocumentsZip, isPending: isDownloadingZip } = useDocumentDownloadZip() const handleAction = (actionName: DocumentActionType) => { return async () => { @@ -300,6 +302,39 @@ const DocumentList: FC = ({ return dataSourceType === DatasourceType.onlineDrive }, []) + const downloadableSelectedIds = useMemo(() => { + const selectedSet = new Set(selectedIds) + return localDocs + .filter(doc => selectedSet.has(doc.id) && doc.data_source_type === DataSourceType.FILE) + .map(doc => doc.id) + }, [localDocs, selectedIds]) + + /** + * Generate a random ZIP filename for bulk document downloads. + * We intentionally avoid leaking dataset info in the exported archive name. + */ + const generateDocsZipFileName = useCallback((): string => { + // Prefer UUID for uniqueness; fall back to time+random when unavailable. + const randomPart = (typeof crypto !== 'undefined' && typeof crypto.randomUUID === 'function') + ? crypto.randomUUID() + : `${Date.now().toString(36)}${Math.random().toString(36).slice(2, 10)}` + return `${randomPart}-docs.zip` + }, []) + + const handleBatchDownload = useCallback(async () => { + if (isDownloadingZip) + return + + // Download as a single ZIP to avoid browser caps on multiple automatic downloads. + const [e, blob] = await asyncRunSafe(requestDocumentsZip({ datasetId, documentIds: downloadableSelectedIds })) + if (e || !blob) { + Toast.notify({ type: 'error', message: t('actionMsg.downloadUnsuccessfully', { ns: 'common' }) }) + return + } + + downloadBlob({ data: blob, fileName: generateDocsZipFileName() }) + }, [datasetId, downloadableSelectedIds, generateDocsZipFileName, isDownloadingZip, requestDocumentsZip, t]) + return (
@@ -463,6 +498,7 @@ const DocumentList: FC = ({ onArchive={handleAction(DocumentActionType.archive)} onBatchEnable={handleAction(DocumentActionType.enable)} onBatchDisable={handleAction(DocumentActionType.disable)} + onBatchDownload={downloadableSelectedIds.length > 0 ? handleBatchDownload : undefined} onBatchDelete={handleAction(DocumentActionType.delete)} onEditMetadata={showEditModal} onBatchReIndex={hasErrorDocumentsSelected ? handleBatchReIndex : undefined} diff --git a/web/app/components/datasets/documents/components/operations.tsx b/web/app/components/datasets/documents/components/operations.tsx index 0d3c40c053..ee638c5e12 100644 --- a/web/app/components/datasets/documents/components/operations.tsx +++ b/web/app/components/datasets/documents/components/operations.tsx @@ -1,8 +1,10 @@ import type { OperationName } from '../types' import type { CommonResponse } from '@/models/common' +import type { DocumentDownloadResponse } from '@/service/datasets' import { RiArchive2Line, RiDeleteBinLine, + RiDownload2Line, RiEditLine, RiEqualizer2Line, RiLoopLeftLine, @@ -28,6 +30,7 @@ import { useDocumentArchive, useDocumentDelete, useDocumentDisable, + useDocumentDownload, useDocumentEnable, useDocumentPause, useDocumentResume, @@ -37,6 +40,7 @@ import { } from '@/service/knowledge/use-document' import { asyncRunSafe } from '@/utils' import { cn } from '@/utils/classnames' +import { downloadUrl } from '@/utils/download' import s from '../style.module.css' import RenameModal from './rename-modal' @@ -69,7 +73,7 @@ const Operations = ({ scene = 'list', className = '', }: OperationsProps) => { - const { id, enabled = false, archived = false, data_source_type, display_status } = detail || {} + const { id, name, enabled = false, archived = false, data_source_type, display_status } = detail || {} const [showModal, setShowModal] = useState(false) const [deleting, setDeleting] = useState(false) const { notify } = useContext(ToastContext) @@ -80,6 +84,7 @@ const Operations = ({ const { mutateAsync: enableDocument } = useDocumentEnable() const { mutateAsync: disableDocument } = useDocumentDisable() const { mutateAsync: deleteDocument } = useDocumentDelete() + const { mutateAsync: downloadDocument, isPending: isDownloading } = useDocumentDownload() const { mutateAsync: syncDocument } = useSyncDocument() const { mutateAsync: syncWebsite } = useSyncWebsite() const { mutateAsync: pauseDocument } = useDocumentPause() @@ -158,6 +163,24 @@ const Operations = ({ onUpdate() }, [onUpdate]) + const handleDownload = useCallback(async () => { + // Avoid repeated clicks while the signed URL request is in-flight. + if (isDownloading) + return + + // Request a signed URL first (it points to `/files//file-preview?...&as_attachment=true`). + const [e, res] = await asyncRunSafe( + downloadDocument({ datasetId, documentId: id }) as Promise, + ) + if (e || !res?.url) { + notify({ type: 'error', message: t('actionMsg.downloadUnsuccessfully', { ns: 'common' }) }) + return + } + + // Trigger download without navigating away (helps avoid duplicate downloads in some browsers). + downloadUrl({ url: res.url, fileName: name }) + }, [datasetId, downloadDocument, id, isDownloading, name, notify, t]) + return (
e.stopPropagation()}> {isListScene && !embeddingAvailable && ( @@ -214,6 +237,20 @@ const Operations = ({ {t('list.table.rename', { ns: 'datasetDocuments' })}
+ {data_source_type === DataSourceType.FILE && ( +
{ + evt.preventDefault() + evt.stopPropagation() + evt.nativeEvent.stopImmediatePropagation?.() + handleDownload() + }} + > + + {t('list.action.download', { ns: 'datasetDocuments' })} +
+ )} {['notion_import', DataSourceType.WEB].includes(data_source_type) && (
onOperate('sync')}> @@ -223,6 +260,23 @@ const Operations = ({ )} + {archived && data_source_type === DataSourceType.FILE && ( + <> +
{ + evt.preventDefault() + evt.stopPropagation() + evt.nativeEvent.stopImmediatePropagation?.() + handleDownload() + }} + > + + {t('list.action.download', { ns: 'datasetDocuments' })} +
+ + + )} {!archived && display_status?.toLowerCase() === 'indexing' && (
onOperate('pause')}> diff --git a/web/app/components/datasets/documents/detail/completed/common/batch-action.tsx b/web/app/components/datasets/documents/detail/completed/common/batch-action.tsx index db36ca471a..2de72d9ff6 100644 --- a/web/app/components/datasets/documents/detail/completed/common/batch-action.tsx +++ b/web/app/components/datasets/documents/detail/completed/common/batch-action.tsx @@ -1,5 +1,5 @@ import type { FC } from 'react' -import { RiArchive2Line, RiCheckboxCircleLine, RiCloseCircleLine, RiDeleteBinLine, RiDraftLine, RiRefreshLine } from '@remixicon/react' +import { RiArchive2Line, RiCheckboxCircleLine, RiCloseCircleLine, RiDeleteBinLine, RiDownload2Line, RiDraftLine, RiRefreshLine } from '@remixicon/react' import { useBoolean } from 'ahooks' import * as React from 'react' import { useTranslation } from 'react-i18next' @@ -14,6 +14,7 @@ type IBatchActionProps = { selectedIds: string[] onBatchEnable: () => void onBatchDisable: () => void + onBatchDownload?: () => void onBatchDelete: () => Promise onArchive?: () => void onEditMetadata?: () => void @@ -26,6 +27,7 @@ const BatchAction: FC = ({ selectedIds, onBatchEnable, onBatchDisable, + onBatchDownload, onArchive, onBatchDelete, onEditMetadata, @@ -103,6 +105,16 @@ const BatchAction: FC = ({ {t(`${i18nPrefix}.reIndex`, { ns: 'dataset' })} )} + {onBatchDownload && ( + + )}