diff --git a/.github/labeler.yml b/.github/labeler.yml new file mode 100644 index 0000000000..d1d324d381 --- /dev/null +++ b/.github/labeler.yml @@ -0,0 +1,3 @@ +web: + - changed-files: + - any-glob-to-any-file: 'web/**' diff --git a/.github/workflows/labeler.yml b/.github/workflows/labeler.yml new file mode 100644 index 0000000000..06782b53c1 --- /dev/null +++ b/.github/workflows/labeler.yml @@ -0,0 +1,14 @@ +name: "Pull Request Labeler" +on: + pull_request_target: + +jobs: + labeler: + permissions: + contents: read + pull-requests: write + runs-on: ubuntu-latest + steps: + - uses: actions/labeler@v6 + with: + sync-labels: true diff --git a/.github/workflows/style.yml b/.github/workflows/style.yml index debf4ba648..5551030f1e 100644 --- a/.github/workflows/style.yml +++ b/.github/workflows/style.yml @@ -117,6 +117,11 @@ jobs: # eslint-report: web/eslint_report.json # github-token: ${{ secrets.GITHUB_TOKEN }} + - name: Web tsslint + if: steps.changed-files.outputs.any_changed == 'true' + working-directory: ./web + run: pnpm run lint:tss + - name: Web type check if: steps.changed-files.outputs.any_changed == 'true' working-directory: ./web diff --git a/api/agent-notes/controllers/console/datasets/datasets_document.py.md b/api/agent-notes/controllers/console/datasets/datasets_document.py.md new file mode 100644 index 0000000000..b100249981 --- /dev/null +++ b/api/agent-notes/controllers/console/datasets/datasets_document.py.md @@ -0,0 +1,52 @@ +## Purpose + +`api/controllers/console/datasets/datasets_document.py` contains the console (authenticated) APIs for managing dataset documents (list/create/update/delete, processing controls, estimates, etc.). + +## Storage model (uploaded files) + +- For local file uploads into a knowledge base, the binary is stored via `extensions.ext_storage.storage` under the key: + - `upload_files//.` +- File metadata is stored in the `upload_files` table (`UploadFile` model), keyed by `UploadFile.id`. +- Dataset `Document` records reference the uploaded file via: + - `Document.data_source_info.upload_file_id` + +## Download endpoint + +- `GET /datasets//documents//download` + + - Only supported when `Document.data_source_type == "upload_file"`. + - Performs dataset permission + tenant checks via `DocumentResource.get_document(...)`. + - Delegates `Document -> UploadFile` validation and signed URL generation to `DocumentService.get_document_download_url(...)`. + - Applies `cloud_edition_billing_rate_limit_check("knowledge")` to match other KB operations. + - Response body is **only**: `{ "url": "" }`. + +- `POST /datasets//documents/download-zip` + + - Accepts `{ "document_ids": ["..."] }` (upload-file only). + - Returns `application/zip` as a single attachment download. + - Rationale: browsers often block multiple automatic downloads; a ZIP avoids that limitation. + - Applies `cloud_edition_billing_rate_limit_check("knowledge")`. + - Delegates dataset permission checks, document/upload-file validation, and download-name generation to + `DocumentService.prepare_document_batch_download_zip(...)` before streaming the ZIP. + +## Verification plan + +- Upload a document from a local file into a dataset. +- Call the download endpoint and confirm it returns a signed URL. +- Open the URL and confirm: + - Response headers force download (`Content-Disposition`), and + - Downloaded bytes match the uploaded file. +- Select multiple uploaded-file documents and download as ZIP; confirm all selected files exist in the archive. + +## Shared helper + +- `DocumentService.get_document_download_url(document)` resolves the `UploadFile` and signs a download URL. +- `DocumentService.prepare_document_batch_download_zip(...)` performs dataset permission checks, batches + document + upload file lookups, preserves request order, and generates the client-visible ZIP filename. +- Internal helpers now live in `DocumentService` (`_get_upload_file_id_for_upload_file_document(...)`, + `_get_upload_file_for_upload_file_document(...)`, `_get_upload_files_by_document_id_for_zip_download(...)`). +- ZIP packing is handled by `FileService.build_upload_files_zip_tempfile(...)`, which also: + - sanitizes entry names to avoid path traversal, and + - deduplicates names while preserving extensions (e.g., `doc.txt` → `doc (1).txt`). + Streaming the response and deferring cleanup is handled by the route via `send_file(path, ...)` + `ExitStack` + + `response.call_on_close(...)` (the file is deleted when the response is closed). diff --git a/api/agent-notes/services/dataset_service.py.md b/api/agent-notes/services/dataset_service.py.md new file mode 100644 index 0000000000..b68ef345f5 --- /dev/null +++ b/api/agent-notes/services/dataset_service.py.md @@ -0,0 +1,18 @@ +## Purpose + +`api/services/dataset_service.py` hosts dataset/document service logic used by console and API controllers. + +## Batch document operations + +- Batch document workflows should avoid N+1 database queries by using set-based lookups. +- Tenant checks must be enforced consistently across dataset/document operations. +- `DocumentService.get_documents_by_ids(...)` fetches documents for a dataset using `id.in_(...)`. +- `FileService.get_upload_files_by_ids(...)` performs tenant-scoped batch lookup for `UploadFile` (dedupes ids with `set(...)`). +- `DocumentService.get_document_download_url(...)` and `prepare_document_batch_download_zip(...)` handle + dataset/document permission checks plus `Document -> UploadFile` validation for download endpoints. + +## Verification plan + +- Exercise document list and download endpoints that use the service helpers. +- Confirm batch download uses constant query count for documents + upload files. +- Request a ZIP with a missing document id and confirm a 404 is returned. diff --git a/api/agent-notes/services/file_service.py.md b/api/agent-notes/services/file_service.py.md new file mode 100644 index 0000000000..cf394a1c05 --- /dev/null +++ b/api/agent-notes/services/file_service.py.md @@ -0,0 +1,35 @@ +## Purpose + +`api/services/file_service.py` owns business logic around `UploadFile` objects: upload validation, storage persistence, +previews/generators, and deletion. + +## Key invariants + +- All storage I/O goes through `extensions.ext_storage.storage`. +- Uploaded file keys follow: `upload_files//.`. +- Upload validation is enforced in `FileService.upload_file(...)` (blocked extensions, size limits, dataset-only types). + +## Batch lookup helpers + +- `FileService.get_upload_files_by_ids(tenant_id, upload_file_ids)` is the canonical tenant-scoped batch loader for + `UploadFile`. + +## Dataset document download helpers + +The dataset document download/ZIP endpoints now delegate “Document → UploadFile” validation and permission checks to +`DocumentService` (`api/services/dataset_service.py`). `FileService` stays focused on generic `UploadFile` operations +(uploading, previews, deletion), plus generic ZIP serving. + +### ZIP serving + +- `FileService.build_upload_files_zip_tempfile(...)` builds a ZIP from `UploadFile` objects and yields a seeked + tempfile **path** so callers can stream it (e.g., `send_file(path, ...)`) without hitting "read of closed file" + issues from file-handle lifecycle during streamed responses. +- Flask `send_file(...)` and the `ExitStack`/`call_on_close(...)` cleanup pattern are handled in the route layer. + +## Verification plan + +- Unit: `api/tests/unit_tests/controllers/console/datasets/test_datasets_document_download.py` + - Verify signed URL generation for upload-file documents and ZIP download behavior for multiple documents. +- Unit: `api/tests/unit_tests/services/test_file_service_zip_and_lookup.py` + - Verify ZIP packing produces a valid, openable archive and preserves file content. diff --git a/api/agent-notes/tests/unit_tests/controllers/console/datasets/test_datasets_document_download.py.md b/api/agent-notes/tests/unit_tests/controllers/console/datasets/test_datasets_document_download.py.md new file mode 100644 index 0000000000..8f78dacde8 --- /dev/null +++ b/api/agent-notes/tests/unit_tests/controllers/console/datasets/test_datasets_document_download.py.md @@ -0,0 +1,28 @@ +## Purpose + +Unit tests for the console dataset document download endpoint: + +- `GET /datasets//documents//download` + +## Testing approach + +- Uses `Flask.test_request_context()` and calls the `Resource.get(...)` method directly. +- Monkeypatches console decorators (`login_required`, `setup_required`, rate limit) to no-ops to keep the test focused. +- Mocks: + - `DatasetService.get_dataset` / `check_dataset_permission` + - `DocumentService.get_document` for single-file download tests + - `DocumentService.get_documents_by_ids` + `FileService.get_upload_files_by_ids` for ZIP download tests + - `FileService.get_upload_files_by_ids` for `UploadFile` lookups in single-file tests + - `services.dataset_service.file_helpers.get_signed_file_url` to return a deterministic URL +- Document mocks include `id` fields so batch lookups can map documents by id. + +## Covered cases + +- Success returns `{ "url": "" }` for upload-file documents. +- 404 when document is not `upload_file`. +- 404 when `upload_file_id` is missing. +- 404 when referenced `UploadFile` row does not exist. +- 403 when document tenant does not match current tenant. +- Batch ZIP download returns `application/zip` for upload-file documents. +- Batch ZIP download rejects non-upload-file documents. +- Batch ZIP download uses a random `.zip` attachment name (`download_name`), so tests only assert the suffix. diff --git a/api/agent-notes/tests/unit_tests/services/test_file_service_zip_and_lookup.py.md b/api/agent-notes/tests/unit_tests/services/test_file_service_zip_and_lookup.py.md new file mode 100644 index 0000000000..dbcdf26f10 --- /dev/null +++ b/api/agent-notes/tests/unit_tests/services/test_file_service_zip_and_lookup.py.md @@ -0,0 +1,18 @@ +## Purpose + +Unit tests for `api/services/file_service.py` helper methods that are not covered by higher-level controller tests. + +## What’s covered + +- `FileService.build_upload_files_zip_tempfile(...)` + - ZIP entry name sanitization (no directory components / traversal) + - name deduplication while preserving extensions + - writing streamed bytes from `storage.load(...)` into ZIP entries + - yields a tempfile path so callers can open/stream the ZIP without holding a live file handle +- `FileService.get_upload_files_by_ids(...)` + - returns `{}` for empty id lists + - returns an id-keyed mapping for non-empty lists + +## Notes + +- These tests intentionally stub `storage.load` and `db.session.scalars(...).all()` to avoid needing a real DB/storage. diff --git a/api/controllers/console/datasets/datasets_document.py b/api/controllers/console/datasets/datasets_document.py index 707d90f044..2599e6293a 100644 --- a/api/controllers/console/datasets/datasets_document.py +++ b/api/controllers/console/datasets/datasets_document.py @@ -2,10 +2,12 @@ import json import logging from argparse import ArgumentTypeError from collections.abc import Sequence -from typing import Literal, cast +from contextlib import ExitStack +from typing import Any, Literal, cast +from uuid import UUID import sqlalchemy as sa -from flask import request +from flask import request, send_file from flask_restx import Resource, fields, marshal, marshal_with from pydantic import BaseModel, Field from sqlalchemy import asc, desc, select @@ -42,6 +44,7 @@ from models import DatasetProcessRule, Document, DocumentSegment, UploadFile from models.dataset import DocumentPipelineExecutionLog from services.dataset_service import DatasetService, DocumentService from services.entities.knowledge_entities.knowledge_entities import KnowledgeConfig, ProcessRule, RetrievalModel +from services.file_service import FileService from ..app.error import ( ProviderModelCurrentlyNotSupportError, @@ -65,6 +68,9 @@ from ..wraps import ( logger = logging.getLogger(__name__) +# NOTE: Keep constants near the top of the module for discoverability. +DOCUMENT_BATCH_DOWNLOAD_ZIP_MAX_DOCS = 100 + def _get_or_create_model(model_name: str, field_def): existing = console_ns.models.get(model_name) @@ -104,6 +110,12 @@ class DocumentRenamePayload(BaseModel): name: str +class DocumentBatchDownloadZipPayload(BaseModel): + """Request payload for bulk downloading documents as a zip archive.""" + + document_ids: list[UUID] = Field(..., min_length=1, max_length=DOCUMENT_BATCH_DOWNLOAD_ZIP_MAX_DOCS) + + class DocumentDatasetListParam(BaseModel): page: int = Field(1, title="Page", description="Page number.") limit: int = Field(20, title="Limit", description="Page size.") @@ -120,6 +132,7 @@ register_schema_models( RetrievalModel, DocumentRetryPayload, DocumentRenamePayload, + DocumentBatchDownloadZipPayload, ) @@ -853,6 +866,62 @@ class DocumentApi(DocumentResource): return {"result": "success"}, 204 +@console_ns.route("/datasets//documents//download") +class DocumentDownloadApi(DocumentResource): + """Return a signed download URL for a dataset document's original uploaded file.""" + + @console_ns.doc("get_dataset_document_download_url") + @console_ns.doc(description="Get a signed download URL for a dataset document's original uploaded file") + @setup_required + @login_required + @account_initialization_required + @cloud_edition_billing_rate_limit_check("knowledge") + def get(self, dataset_id: str, document_id: str) -> dict[str, Any]: + # Reuse the shared permission/tenant checks implemented in DocumentResource. + document = self.get_document(str(dataset_id), str(document_id)) + return {"url": DocumentService.get_document_download_url(document)} + + +@console_ns.route("/datasets//documents/download-zip") +class DocumentBatchDownloadZipApi(DocumentResource): + """Download multiple uploaded-file documents as a single ZIP (avoids browser multi-download limits).""" + + @console_ns.doc("download_dataset_documents_as_zip") + @console_ns.doc(description="Download selected dataset documents as a single ZIP archive (upload-file only)") + @setup_required + @login_required + @account_initialization_required + @cloud_edition_billing_rate_limit_check("knowledge") + @console_ns.expect(console_ns.models[DocumentBatchDownloadZipPayload.__name__]) + def post(self, dataset_id: str): + """Stream a ZIP archive containing the requested uploaded documents.""" + # Parse and validate request payload. + payload = DocumentBatchDownloadZipPayload.model_validate(console_ns.payload or {}) + + current_user, current_tenant_id = current_account_with_tenant() + dataset_id = str(dataset_id) + document_ids: list[str] = [str(document_id) for document_id in payload.document_ids] + upload_files, download_name = DocumentService.prepare_document_batch_download_zip( + dataset_id=dataset_id, + document_ids=document_ids, + tenant_id=current_tenant_id, + current_user=current_user, + ) + + # Delegate ZIP packing to FileService, but keep Flask response+cleanup in the route. + with ExitStack() as stack: + zip_path = stack.enter_context(FileService.build_upload_files_zip_tempfile(upload_files=upload_files)) + response = send_file( + zip_path, + mimetype="application/zip", + as_attachment=True, + download_name=download_name, + ) + cleanup = stack.pop_all() + response.call_on_close(cleanup.close) + return response + + @console_ns.route("/datasets//documents//processing/") class DocumentProcessingApi(DocumentResource): @console_ns.doc("update_document_processing") diff --git a/api/services/dataset_service.py b/api/services/dataset_service.py index 18e5613438..be9a0e9279 100644 --- a/api/services/dataset_service.py +++ b/api/services/dataset_service.py @@ -13,10 +13,11 @@ import sqlalchemy as sa from redis.exceptions import LockNotOwnedError from sqlalchemy import exists, func, select from sqlalchemy.orm import Session -from werkzeug.exceptions import NotFound +from werkzeug.exceptions import Forbidden, NotFound from configs import dify_config from core.errors.error import LLMBadRequestError, ProviderTokenNotInitError +from core.file import helpers as file_helpers from core.helper.name_generator import generate_incremental_name from core.model_manager import ModelManager from core.model_runtime.entities.model_entities import ModelFeature, ModelType @@ -73,6 +74,7 @@ from services.errors.document import DocumentIndexingError from services.errors.file import FileNotExistsError from services.external_knowledge_service import ExternalDatasetService from services.feature_service import FeatureModel, FeatureService +from services.file_service import FileService from services.rag_pipeline.rag_pipeline import RagPipelineService from services.tag_service import TagService from services.vector_service import VectorService @@ -1162,6 +1164,7 @@ class DocumentService: Document.archived.is_(True), ), } + DOCUMENT_BATCH_DOWNLOAD_ZIP_FILENAME_EXTENSION = ".zip" @classmethod def normalize_display_status(cls, status: str | None) -> str | None: @@ -1288,6 +1291,143 @@ class DocumentService: else: return None + @staticmethod + def get_documents_by_ids(dataset_id: str, document_ids: Sequence[str]) -> Sequence[Document]: + """Fetch documents for a dataset in a single batch query.""" + if not document_ids: + return [] + document_id_list: list[str] = [str(document_id) for document_id in document_ids] + # Fetch all requested documents in one query to avoid N+1 lookups. + documents: Sequence[Document] = db.session.scalars( + select(Document).where( + Document.dataset_id == dataset_id, + Document.id.in_(document_id_list), + ) + ).all() + return documents + + @staticmethod + def get_document_download_url(document: Document) -> str: + """ + Return a signed download URL for an upload-file document. + """ + upload_file = DocumentService._get_upload_file_for_upload_file_document(document) + return file_helpers.get_signed_file_url(upload_file_id=upload_file.id, as_attachment=True) + + @staticmethod + def prepare_document_batch_download_zip( + *, + dataset_id: str, + document_ids: Sequence[str], + tenant_id: str, + current_user: Account, + ) -> tuple[list[UploadFile], str]: + """ + Resolve upload files for batch ZIP downloads and generate a client-visible filename. + """ + dataset = DatasetService.get_dataset(dataset_id) + if not dataset: + raise NotFound("Dataset not found.") + try: + DatasetService.check_dataset_permission(dataset, current_user) + except NoPermissionError as e: + raise Forbidden(str(e)) + + upload_files_by_document_id = DocumentService._get_upload_files_by_document_id_for_zip_download( + dataset_id=dataset_id, + document_ids=document_ids, + tenant_id=tenant_id, + ) + upload_files = [upload_files_by_document_id[document_id] for document_id in document_ids] + download_name = DocumentService._generate_document_batch_download_zip_filename() + return upload_files, download_name + + @staticmethod + def _generate_document_batch_download_zip_filename() -> str: + """ + Generate a random attachment filename for the batch download ZIP. + """ + return f"{uuid.uuid4().hex}{DocumentService.DOCUMENT_BATCH_DOWNLOAD_ZIP_FILENAME_EXTENSION}" + + @staticmethod + def _get_upload_file_id_for_upload_file_document( + document: Document, + *, + invalid_source_message: str, + missing_file_message: str, + ) -> str: + """ + Normalize and validate `Document -> UploadFile` linkage for download flows. + """ + if document.data_source_type != "upload_file": + raise NotFound(invalid_source_message) + + data_source_info: dict[str, Any] = document.data_source_info_dict or {} + upload_file_id: str | None = data_source_info.get("upload_file_id") + if not upload_file_id: + raise NotFound(missing_file_message) + + return str(upload_file_id) + + @staticmethod + def _get_upload_file_for_upload_file_document(document: Document) -> UploadFile: + """ + Load the `UploadFile` row for an upload-file document. + """ + upload_file_id = DocumentService._get_upload_file_id_for_upload_file_document( + document, + invalid_source_message="Document does not have an uploaded file to download.", + missing_file_message="Uploaded file not found.", + ) + upload_files_by_id = FileService.get_upload_files_by_ids(document.tenant_id, [upload_file_id]) + upload_file = upload_files_by_id.get(upload_file_id) + if not upload_file: + raise NotFound("Uploaded file not found.") + return upload_file + + @staticmethod + def _get_upload_files_by_document_id_for_zip_download( + *, + dataset_id: str, + document_ids: Sequence[str], + tenant_id: str, + ) -> dict[str, UploadFile]: + """ + Batch load upload files keyed by document id for ZIP downloads. + """ + document_id_list: list[str] = [str(document_id) for document_id in document_ids] + + documents = DocumentService.get_documents_by_ids(dataset_id, document_id_list) + documents_by_id: dict[str, Document] = {str(document.id): document for document in documents} + + missing_document_ids: set[str] = set(document_id_list) - set(documents_by_id.keys()) + if missing_document_ids: + raise NotFound("Document not found.") + + upload_file_ids: list[str] = [] + upload_file_ids_by_document_id: dict[str, str] = {} + for document_id, document in documents_by_id.items(): + if document.tenant_id != tenant_id: + raise Forbidden("No permission.") + + upload_file_id = DocumentService._get_upload_file_id_for_upload_file_document( + document, + invalid_source_message="Only uploaded-file documents can be downloaded as ZIP.", + missing_file_message="Only uploaded-file documents can be downloaded as ZIP.", + ) + upload_file_ids.append(upload_file_id) + upload_file_ids_by_document_id[document_id] = upload_file_id + + upload_files_by_id = FileService.get_upload_files_by_ids(tenant_id, upload_file_ids) + missing_upload_file_ids: set[str] = set(upload_file_ids) - set(upload_files_by_id.keys()) + if missing_upload_file_ids: + raise NotFound("Only uploaded-file documents can be downloaded as ZIP.") + + return { + document_id: upload_files_by_id[upload_file_id] + for document_id, upload_file_id in upload_file_ids_by_document_id.items() + } + @staticmethod def get_document_by_id(document_id: str) -> Document | None: document = db.session.query(Document).where(Document.id == document_id).first() diff --git a/api/services/file_service.py b/api/services/file_service.py index 0911cf38c4..a0a99f3f82 100644 --- a/api/services/file_service.py +++ b/api/services/file_service.py @@ -2,7 +2,11 @@ import base64 import hashlib import os import uuid +from collections.abc import Iterator, Sequence +from contextlib import contextmanager, suppress +from tempfile import NamedTemporaryFile from typing import Literal, Union +from zipfile import ZIP_DEFLATED, ZipFile from sqlalchemy import Engine, select from sqlalchemy.orm import Session, sessionmaker @@ -17,6 +21,7 @@ from constants import ( ) from core.file import helpers as file_helpers from core.rag.extractor.extract_processor import ExtractProcessor +from extensions.ext_database import db from extensions.ext_storage import storage from libs.datetime_utils import naive_utc_now from libs.helper import extract_tenant_id @@ -167,6 +172,9 @@ class FileService: return upload_file def get_file_preview(self, file_id: str): + """ + Return a short text preview extracted from a document file. + """ with self._session_maker(expire_on_commit=False) as session: upload_file = session.query(UploadFile).where(UploadFile.id == file_id).first() @@ -253,3 +261,101 @@ class FileService: return storage.delete(upload_file.key) session.delete(upload_file) + + @staticmethod + def get_upload_files_by_ids(tenant_id: str, upload_file_ids: Sequence[str]) -> dict[str, UploadFile]: + """ + Fetch `UploadFile` rows for a tenant in a single batch query. + + This is a generic `UploadFile` lookup helper (not dataset/document specific), so it lives in `FileService`. + """ + if not upload_file_ids: + return {} + + # Normalize and deduplicate ids before using them in the IN clause. + upload_file_id_list: list[str] = [str(upload_file_id) for upload_file_id in upload_file_ids] + unique_upload_file_ids: list[str] = list(set(upload_file_id_list)) + + # Fetch upload files in one query for efficient batch access. + upload_files: Sequence[UploadFile] = db.session.scalars( + select(UploadFile).where( + UploadFile.tenant_id == tenant_id, + UploadFile.id.in_(unique_upload_file_ids), + ) + ).all() + return {str(upload_file.id): upload_file for upload_file in upload_files} + + @staticmethod + def _sanitize_zip_entry_name(name: str) -> str: + """ + Sanitize a ZIP entry name to avoid path traversal and weird separators. + + We keep this conservative: the upload flow already rejects `/` and `\\`, but older rows (or imported data) + could still contain unsafe names. + """ + # Drop any directory components and prevent empty names. + base = os.path.basename(name).strip() or "file" + + # ZIP uses forward slashes as separators; remove any residual separator characters. + return base.replace("/", "_").replace("\\", "_") + + @staticmethod + def _dedupe_zip_entry_name(original_name: str, used_names: set[str]) -> str: + """ + Return a unique ZIP entry name, inserting suffixes before the extension. + """ + # Keep the original name when it's not already used. + if original_name not in used_names: + return original_name + + # Insert suffixes before the extension (e.g., "doc.txt" -> "doc (1).txt"). + stem, extension = os.path.splitext(original_name) + suffix = 1 + while True: + candidate = f"{stem} ({suffix}){extension}" + if candidate not in used_names: + return candidate + suffix += 1 + + @staticmethod + @contextmanager + def build_upload_files_zip_tempfile( + *, + upload_files: Sequence[UploadFile], + ) -> Iterator[str]: + """ + Build a ZIP from `UploadFile`s and yield a tempfile path. + + We yield a path (rather than an open file handle) to avoid "read of closed file" issues when Flask/Werkzeug + streams responses. The caller is expected to keep this context open until the response is fully sent, then + close it (e.g., via `response.call_on_close(...)`) to delete the tempfile. + """ + used_names: set[str] = set() + + # Build a ZIP in a temp file and keep it on disk until the caller finishes streaming it. + tmp_path: str | None = None + try: + with NamedTemporaryFile(mode="w+b", suffix=".zip", delete=False) as tmp: + tmp_path = tmp.name + with ZipFile(tmp, mode="w", compression=ZIP_DEFLATED) as zf: + for upload_file in upload_files: + # Ensure the entry name is safe and unique. + safe_name = FileService._sanitize_zip_entry_name(upload_file.name) + arcname = FileService._dedupe_zip_entry_name(safe_name, used_names) + used_names.add(arcname) + + # Stream file bytes from storage into the ZIP entry. + with zf.open(arcname, "w") as entry: + for chunk in storage.load(upload_file.key, stream=True): + entry.write(chunk) + + # Flush so `send_file(path, ...)` can re-open it safely on all platforms. + tmp.flush() + + assert tmp_path is not None + yield tmp_path + finally: + # Remove the temp file when the context is closed (typically after the response finishes streaming). + if tmp_path is not None: + with suppress(FileNotFoundError): + os.remove(tmp_path) diff --git a/api/tests/unit_tests/controllers/console/datasets/test_datasets_document_download.py b/api/tests/unit_tests/controllers/console/datasets/test_datasets_document_download.py new file mode 100644 index 0000000000..d5d7ee95c5 --- /dev/null +++ b/api/tests/unit_tests/controllers/console/datasets/test_datasets_document_download.py @@ -0,0 +1,430 @@ +""" +Unit tests for the dataset document download endpoint. + +These tests validate that the controller returns a signed download URL for +upload-file documents, and rejects unsupported or missing file cases. +""" + +from __future__ import annotations + +import importlib +import sys +from collections import UserDict +from io import BytesIO +from types import SimpleNamespace +from typing import Any +from zipfile import ZipFile + +import pytest +from flask import Flask +from werkzeug.exceptions import Forbidden, NotFound + + +@pytest.fixture +def app() -> Flask: + """Create a minimal Flask app for request-context based controller tests.""" + app = Flask(__name__) + app.config["TESTING"] = True + return app + + +@pytest.fixture +def datasets_document_module(monkeypatch: pytest.MonkeyPatch): + """ + Reload `controllers.console.datasets.datasets_document` with lightweight decorators. + + We patch auth / setup / rate-limit decorators to no-ops so we can unit test the + controller logic without requiring the full console stack. + """ + + from controllers.console import console_ns, wraps + from libs import login + + def _noop(func): # type: ignore[no-untyped-def] + return func + + # Bypass login/setup/account checks in unit tests. + monkeypatch.setattr(login, "login_required", _noop) + monkeypatch.setattr(wraps, "setup_required", _noop) + monkeypatch.setattr(wraps, "account_initialization_required", _noop) + + # Bypass billing-related decorators used by other endpoints in this module. + monkeypatch.setattr(wraps, "cloud_edition_billing_resource_check", lambda *_args, **_kwargs: (lambda f: f)) + monkeypatch.setattr(wraps, "cloud_edition_billing_rate_limit_check", lambda *_args, **_kwargs: (lambda f: f)) + + # Avoid Flask-RESTX route registration side effects during import. + def _noop_route(*_args, **_kwargs): # type: ignore[override] + def _decorator(cls): + return cls + + return _decorator + + monkeypatch.setattr(console_ns, "route", _noop_route) + + module_name = "controllers.console.datasets.datasets_document" + sys.modules.pop(module_name, None) + return importlib.import_module(module_name) + + +def _mock_user(*, is_dataset_editor: bool = True) -> SimpleNamespace: + """Build a minimal user object compatible with dataset permission checks.""" + return SimpleNamespace(is_dataset_editor=is_dataset_editor, id="user-123") + + +def _mock_document( + *, + document_id: str, + tenant_id: str, + data_source_type: str, + upload_file_id: str | None, +) -> SimpleNamespace: + """Build a minimal document object used by the controller.""" + data_source_info_dict: dict[str, Any] | None = None + if upload_file_id is not None: + data_source_info_dict = {"upload_file_id": upload_file_id} + else: + data_source_info_dict = {} + + return SimpleNamespace( + id=document_id, + tenant_id=tenant_id, + data_source_type=data_source_type, + data_source_info_dict=data_source_info_dict, + ) + + +def _wire_common_success_mocks( + *, + module, + monkeypatch: pytest.MonkeyPatch, + current_tenant_id: str, + document_tenant_id: str, + data_source_type: str, + upload_file_id: str | None, + upload_file_exists: bool, + signed_url: str, +) -> None: + """Patch controller dependencies to create a deterministic test environment.""" + import services.dataset_service as dataset_service_module + + # Make `current_account_with_tenant()` return a known user + tenant id. + monkeypatch.setattr(module, "current_account_with_tenant", lambda: (_mock_user(), current_tenant_id)) + + # Return a dataset object and allow permission checks to pass. + monkeypatch.setattr(module.DatasetService, "get_dataset", lambda _dataset_id: SimpleNamespace(id="ds-1")) + monkeypatch.setattr(module.DatasetService, "check_dataset_permission", lambda *_args, **_kwargs: None) + + # Return a document that will be validated inside DocumentResource.get_document. + document = _mock_document( + document_id="doc-1", + tenant_id=document_tenant_id, + data_source_type=data_source_type, + upload_file_id=upload_file_id, + ) + monkeypatch.setattr(module.DocumentService, "get_document", lambda *_args, **_kwargs: document) + + # Mock UploadFile lookup via FileService batch helper. + upload_files_by_id: dict[str, Any] = {} + if upload_file_exists and upload_file_id is not None: + upload_files_by_id[str(upload_file_id)] = SimpleNamespace(id=str(upload_file_id)) + monkeypatch.setattr(module.FileService, "get_upload_files_by_ids", lambda *_args, **_kwargs: upload_files_by_id) + + # Mock signing helper so the returned URL is deterministic. + monkeypatch.setattr(dataset_service_module.file_helpers, "get_signed_file_url", lambda **_kwargs: signed_url) + + +def _mock_send_file(obj, **kwargs): # type: ignore[no-untyped-def] + """Return a lightweight representation of `send_file(...)` for unit tests.""" + + class _ResponseMock(UserDict): + def __init__(self, sent_file: object, send_file_kwargs: dict[str, object]) -> None: + super().__init__({"_sent_file": sent_file, "_send_file_kwargs": send_file_kwargs}) + self._on_close: object | None = None + + def call_on_close(self, func): # type: ignore[no-untyped-def] + self._on_close = func + return func + + return _ResponseMock(obj, kwargs) + + +def test_batch_download_zip_returns_send_file( + app: Flask, datasets_document_module, monkeypatch: pytest.MonkeyPatch +) -> None: + """Ensure batch ZIP download returns a zip attachment via `send_file`.""" + + # Arrange common permission mocks. + monkeypatch.setattr(datasets_document_module, "current_account_with_tenant", lambda: (_mock_user(), "tenant-123")) + monkeypatch.setattr( + datasets_document_module.DatasetService, "get_dataset", lambda _dataset_id: SimpleNamespace(id="ds-1") + ) + monkeypatch.setattr( + datasets_document_module.DatasetService, "check_dataset_permission", lambda *_args, **_kwargs: None + ) + + # Two upload-file documents, each referencing an UploadFile. + doc1 = _mock_document( + document_id="11111111-1111-1111-1111-111111111111", + tenant_id="tenant-123", + data_source_type="upload_file", + upload_file_id="file-1", + ) + doc2 = _mock_document( + document_id="22222222-2222-2222-2222-222222222222", + tenant_id="tenant-123", + data_source_type="upload_file", + upload_file_id="file-2", + ) + monkeypatch.setattr( + datasets_document_module.DocumentService, + "get_documents_by_ids", + lambda *_args, **_kwargs: [doc1, doc2], + ) + monkeypatch.setattr( + datasets_document_module.FileService, + "get_upload_files_by_ids", + lambda *_args, **_kwargs: { + "file-1": SimpleNamespace(id="file-1", name="a.txt", key="k1"), + "file-2": SimpleNamespace(id="file-2", name="b.txt", key="k2"), + }, + ) + + # Mock storage streaming content. + import services.file_service as file_service_module + + monkeypatch.setattr(file_service_module.storage, "load", lambda _key, stream=True: [b"hello"]) + + # Replace send_file used by the controller to avoid a real Flask response object. + monkeypatch.setattr(datasets_document_module, "send_file", _mock_send_file) + + # Act + with app.test_request_context( + "/datasets/ds-1/documents/download-zip", + method="POST", + json={"document_ids": ["11111111-1111-1111-1111-111111111111", "22222222-2222-2222-2222-222222222222"]}, + ): + api = datasets_document_module.DocumentBatchDownloadZipApi() + result = api.post(dataset_id="ds-1") + + # Assert: we returned via send_file with correct mime type and attachment. + assert result["_send_file_kwargs"]["mimetype"] == "application/zip" + assert result["_send_file_kwargs"]["as_attachment"] is True + assert isinstance(result["_send_file_kwargs"]["download_name"], str) + assert result["_send_file_kwargs"]["download_name"].endswith(".zip") + # Ensure our cleanup hook is registered and execute it to avoid temp file leaks in unit tests. + assert getattr(result, "_on_close", None) is not None + result._on_close() # type: ignore[attr-defined] + + +def test_batch_download_zip_response_is_openable_zip( + app: Flask, datasets_document_module, monkeypatch: pytest.MonkeyPatch +) -> None: + """Ensure the real Flask `send_file` response body is a valid ZIP that can be opened.""" + + # Arrange: same controller mocks as the lightweight send_file test, but we keep the real `send_file`. + monkeypatch.setattr(datasets_document_module, "current_account_with_tenant", lambda: (_mock_user(), "tenant-123")) + monkeypatch.setattr( + datasets_document_module.DatasetService, "get_dataset", lambda _dataset_id: SimpleNamespace(id="ds-1") + ) + monkeypatch.setattr( + datasets_document_module.DatasetService, "check_dataset_permission", lambda *_args, **_kwargs: None + ) + + doc1 = _mock_document( + document_id="33333333-3333-3333-3333-333333333333", + tenant_id="tenant-123", + data_source_type="upload_file", + upload_file_id="file-1", + ) + doc2 = _mock_document( + document_id="44444444-4444-4444-4444-444444444444", + tenant_id="tenant-123", + data_source_type="upload_file", + upload_file_id="file-2", + ) + monkeypatch.setattr( + datasets_document_module.DocumentService, + "get_documents_by_ids", + lambda *_args, **_kwargs: [doc1, doc2], + ) + monkeypatch.setattr( + datasets_document_module.FileService, + "get_upload_files_by_ids", + lambda *_args, **_kwargs: { + "file-1": SimpleNamespace(id="file-1", name="a.txt", key="k1"), + "file-2": SimpleNamespace(id="file-2", name="b.txt", key="k2"), + }, + ) + + # Stream distinct bytes per key so we can verify both ZIP entries. + import services.file_service as file_service_module + + monkeypatch.setattr( + file_service_module.storage, "load", lambda key, stream=True: [b"one"] if key == "k1" else [b"two"] + ) + + # Act + with app.test_request_context( + "/datasets/ds-1/documents/download-zip", + method="POST", + json={"document_ids": ["33333333-3333-3333-3333-333333333333", "44444444-4444-4444-4444-444444444444"]}, + ): + api = datasets_document_module.DocumentBatchDownloadZipApi() + response = api.post(dataset_id="ds-1") + + # Assert: response body is a valid ZIP and contains the expected entries. + response.direct_passthrough = False + data = response.get_data() + response.close() + + with ZipFile(BytesIO(data), mode="r") as zf: + assert zf.namelist() == ["a.txt", "b.txt"] + assert zf.read("a.txt") == b"one" + assert zf.read("b.txt") == b"two" + + +def test_batch_download_zip_rejects_non_upload_file_document( + app: Flask, datasets_document_module, monkeypatch: pytest.MonkeyPatch +) -> None: + """Ensure batch ZIP download rejects non upload-file documents.""" + + monkeypatch.setattr(datasets_document_module, "current_account_with_tenant", lambda: (_mock_user(), "tenant-123")) + monkeypatch.setattr( + datasets_document_module.DatasetService, "get_dataset", lambda _dataset_id: SimpleNamespace(id="ds-1") + ) + monkeypatch.setattr( + datasets_document_module.DatasetService, "check_dataset_permission", lambda *_args, **_kwargs: None + ) + + doc = _mock_document( + document_id="55555555-5555-5555-5555-555555555555", + tenant_id="tenant-123", + data_source_type="website_crawl", + upload_file_id="file-1", + ) + monkeypatch.setattr( + datasets_document_module.DocumentService, + "get_documents_by_ids", + lambda *_args, **_kwargs: [doc], + ) + + with app.test_request_context( + "/datasets/ds-1/documents/download-zip", + method="POST", + json={"document_ids": ["55555555-5555-5555-5555-555555555555"]}, + ): + api = datasets_document_module.DocumentBatchDownloadZipApi() + with pytest.raises(NotFound): + api.post(dataset_id="ds-1") + + +def test_document_download_returns_url_for_upload_file_document( + app: Flask, datasets_document_module, monkeypatch: pytest.MonkeyPatch +) -> None: + """Ensure upload-file documents return a `{url}` JSON payload.""" + + _wire_common_success_mocks( + module=datasets_document_module, + monkeypatch=monkeypatch, + current_tenant_id="tenant-123", + document_tenant_id="tenant-123", + data_source_type="upload_file", + upload_file_id="file-123", + upload_file_exists=True, + signed_url="https://example.com/signed", + ) + + # Build a request context then call the resource method directly. + with app.test_request_context("/datasets/ds-1/documents/doc-1/download", method="GET"): + api = datasets_document_module.DocumentDownloadApi() + result = api.get(dataset_id="ds-1", document_id="doc-1") + + assert result == {"url": "https://example.com/signed"} + + +def test_document_download_rejects_non_upload_file_document( + app: Flask, datasets_document_module, monkeypatch: pytest.MonkeyPatch +) -> None: + """Ensure non-upload documents raise 404 (no file to download).""" + + _wire_common_success_mocks( + module=datasets_document_module, + monkeypatch=monkeypatch, + current_tenant_id="tenant-123", + document_tenant_id="tenant-123", + data_source_type="website_crawl", + upload_file_id="file-123", + upload_file_exists=True, + signed_url="https://example.com/signed", + ) + + with app.test_request_context("/datasets/ds-1/documents/doc-1/download", method="GET"): + api = datasets_document_module.DocumentDownloadApi() + with pytest.raises(NotFound): + api.get(dataset_id="ds-1", document_id="doc-1") + + +def test_document_download_rejects_missing_upload_file_id( + app: Flask, datasets_document_module, monkeypatch: pytest.MonkeyPatch +) -> None: + """Ensure missing `upload_file_id` raises 404.""" + + _wire_common_success_mocks( + module=datasets_document_module, + monkeypatch=monkeypatch, + current_tenant_id="tenant-123", + document_tenant_id="tenant-123", + data_source_type="upload_file", + upload_file_id=None, + upload_file_exists=False, + signed_url="https://example.com/signed", + ) + + with app.test_request_context("/datasets/ds-1/documents/doc-1/download", method="GET"): + api = datasets_document_module.DocumentDownloadApi() + with pytest.raises(NotFound): + api.get(dataset_id="ds-1", document_id="doc-1") + + +def test_document_download_rejects_when_upload_file_record_missing( + app: Flask, datasets_document_module, monkeypatch: pytest.MonkeyPatch +) -> None: + """Ensure missing UploadFile row raises 404.""" + + _wire_common_success_mocks( + module=datasets_document_module, + monkeypatch=monkeypatch, + current_tenant_id="tenant-123", + document_tenant_id="tenant-123", + data_source_type="upload_file", + upload_file_id="file-123", + upload_file_exists=False, + signed_url="https://example.com/signed", + ) + + with app.test_request_context("/datasets/ds-1/documents/doc-1/download", method="GET"): + api = datasets_document_module.DocumentDownloadApi() + with pytest.raises(NotFound): + api.get(dataset_id="ds-1", document_id="doc-1") + + +def test_document_download_rejects_tenant_mismatch( + app: Flask, datasets_document_module, monkeypatch: pytest.MonkeyPatch +) -> None: + """Ensure tenant mismatch is rejected by the shared `get_document()` permission check.""" + + _wire_common_success_mocks( + module=datasets_document_module, + monkeypatch=monkeypatch, + current_tenant_id="tenant-123", + document_tenant_id="tenant-999", + data_source_type="upload_file", + upload_file_id="file-123", + upload_file_exists=True, + signed_url="https://example.com/signed", + ) + + with app.test_request_context("/datasets/ds-1/documents/doc-1/download", method="GET"): + api = datasets_document_module.DocumentDownloadApi() + with pytest.raises(Forbidden): + api.get(dataset_id="ds-1", document_id="doc-1") diff --git a/api/tests/unit_tests/services/test_file_service_zip_and_lookup.py b/api/tests/unit_tests/services/test_file_service_zip_and_lookup.py new file mode 100644 index 0000000000..7b4d349e33 --- /dev/null +++ b/api/tests/unit_tests/services/test_file_service_zip_and_lookup.py @@ -0,0 +1,99 @@ +""" +Unit tests for `services.file_service.FileService` helpers. + +We keep these tests focused on: +- ZIP tempfile building (sanitization + deduplication + content writes) +- tenant-scoped batch lookup behavior (`get_upload_files_by_ids`) +""" + +from __future__ import annotations + +from types import SimpleNamespace +from typing import Any +from zipfile import ZipFile + +import pytest + +import services.file_service as file_service_module +from services.file_service import FileService + + +def test_build_upload_files_zip_tempfile_sanitizes_and_dedupes_names(monkeypatch: pytest.MonkeyPatch) -> None: + """Ensure ZIP entry names are safe and unique while preserving extensions.""" + + # Arrange: three upload files that all sanitize down to the same basename ("b.txt"). + upload_files: list[Any] = [ + SimpleNamespace(name="a/b.txt", key="k1"), + SimpleNamespace(name="c/b.txt", key="k2"), + SimpleNamespace(name="../b.txt", key="k3"), + ] + + # Stream distinct bytes per key so we can verify content is written to the right entry. + data_by_key: dict[str, list[bytes]] = {"k1": [b"one"], "k2": [b"two"], "k3": [b"three"]} + + def _load(key: str, stream: bool = True) -> list[bytes]: + # Return the corresponding chunks for this key (the production code iterates chunks). + assert stream is True + return data_by_key[key] + + monkeypatch.setattr(file_service_module.storage, "load", _load) + + # Act: build zip in a tempfile. + with FileService.build_upload_files_zip_tempfile(upload_files=upload_files) as tmp: + with ZipFile(tmp, mode="r") as zf: + # Assert: names are sanitized (no directory components) and deduped with suffixes. + assert zf.namelist() == ["b.txt", "b (1).txt", "b (2).txt"] + + # Assert: each entry contains the correct bytes from storage. + assert zf.read("b.txt") == b"one" + assert zf.read("b (1).txt") == b"two" + assert zf.read("b (2).txt") == b"three" + + +def test_get_upload_files_by_ids_returns_empty_when_no_ids(monkeypatch: pytest.MonkeyPatch) -> None: + """Ensure empty input returns an empty mapping without hitting the database.""" + + class _Session: + def scalars(self, _stmt): # type: ignore[no-untyped-def] + raise AssertionError("db.session.scalars should not be called for empty id lists") + + monkeypatch.setattr(file_service_module, "db", SimpleNamespace(session=_Session())) + + assert FileService.get_upload_files_by_ids("tenant-1", []) == {} + + +def test_get_upload_files_by_ids_returns_id_keyed_mapping(monkeypatch: pytest.MonkeyPatch) -> None: + """Ensure batch lookup returns a dict keyed by stringified UploadFile ids.""" + + upload_files: list[Any] = [ + SimpleNamespace(id="file-1", tenant_id="tenant-1"), + SimpleNamespace(id="file-2", tenant_id="tenant-1"), + ] + + class _ScalarResult: + def __init__(self, items: list[Any]) -> None: + self._items = items + + def all(self) -> list[Any]: + return self._items + + class _Session: + def __init__(self, items: list[Any]) -> None: + self._items = items + self.calls: list[object] = [] + + def scalars(self, stmt): # type: ignore[no-untyped-def] + # Capture the statement so we can at least assert the query path is taken. + self.calls.append(stmt) + return _ScalarResult(self._items) + + session = _Session(upload_files) + monkeypatch.setattr(file_service_module, "db", SimpleNamespace(session=session)) + + # Provide duplicates to ensure callers can safely pass repeated ids. + result = FileService.get_upload_files_by_ids("tenant-1", ["file-1", "file-1", "file-2"]) + + assert set(result.keys()) == {"file-1", "file-2"} + assert result["file-1"].id == "file-1" + assert result["file-2"].id == "file-2" + assert len(session.calls) == 1 diff --git a/web/.vscode/extensions.json b/web/.vscode/extensions.json index 68f5c7bf0e..01235650ab 100644 --- a/web/.vscode/extensions.json +++ b/web/.vscode/extensions.json @@ -1,6 +1,8 @@ { "recommendations": [ "bradlc.vscode-tailwindcss", - "kisstkondoros.vscode-codemetrics" + "kisstkondoros.vscode-codemetrics", + "johnsoncodehk.vscode-tsslint", + "dbaeumer.vscode-eslint" ] } diff --git a/web/app/components/base/chat/chat/citation/popup.tsx b/web/app/components/base/chat/chat/citation/popup.tsx index c1633227d2..e033a9fa77 100644 --- a/web/app/components/base/chat/chat/citation/popup.tsx +++ b/web/app/components/base/chat/chat/citation/popup.tsx @@ -1,4 +1,4 @@ -import type { FC } from 'react' +import type { FC, MouseEvent } from 'react' import type { Resources } from './index' import Link from 'next/link' import { Fragment, useState } from 'react' @@ -18,6 +18,8 @@ import { PortalToFollowElemContent, PortalToFollowElemTrigger, } from '@/app/components/base/portal-to-follow-elem' +import { useDocumentDownload } from '@/service/knowledge/use-document' +import { downloadUrl } from '@/utils/download' import ProgressTooltip from './progress-tooltip' import Tooltip from './tooltip' @@ -36,6 +38,30 @@ const Popup: FC = ({ ? (/\.([^.]*)$/.exec(data.documentName)?.[1] || '') : 'notion' + const { mutateAsync: downloadDocument, isPending: isDownloading } = useDocumentDownload() + + /** + * Download the original uploaded file for citations whose data source is upload-file. + * We request a signed URL from the dataset document download endpoint, then trigger browser download. + */ + const handleDownloadUploadFile = async (e: MouseEvent) => { + // Prevent toggling the citation popup when user clicks the download link. + e.preventDefault() + e.stopPropagation() + + // Only upload-file citations can be downloaded this way (needs dataset/document ids). + const isUploadFile = data.dataSourceType === 'upload_file' || data.dataSourceType === 'file' + const datasetId = data.sources?.[0]?.dataset_id + const documentId = data.documentId || data.sources?.[0]?.document_id + if (!isUploadFile || !datasetId || !documentId || isDownloading) + return + + // Fetch signed URL (usually points to `/files//file-preview?...&as_attachment=true`). + const res = await downloadDocument({ datasetId, documentId }) + if (res?.url) + downloadUrl({ url: res.url, fileName: data.documentName }) + } + return ( = ({ setOpen(v => !v)}>
+ {/* Keep the trigger purely for opening the popup (no download link here). */}
{data.documentName}
@@ -57,7 +84,21 @@ const Popup: FC = ({
-
{data.documentName}
+
+ {/* If it's an upload-file reference, the title becomes a download link. */} + {(data.dataSourceType === 'upload_file' || data.dataSourceType === 'file') && !!data.sources?.[0]?.dataset_id + ? ( + + ) + : data.documentName} +
diff --git a/web/app/components/datasets/documents/components/list.tsx b/web/app/components/datasets/documents/components/list.tsx index 2bf9c278c4..0c78da113b 100644 --- a/web/app/components/datasets/documents/components/list.tsx +++ b/web/app/components/datasets/documents/components/list.tsx @@ -30,9 +30,10 @@ import { useDatasetDetailContextWithSelector as useDatasetDetailContext } from ' import useTimestamp from '@/hooks/use-timestamp' import { ChunkingMode, DataSourceType, DocumentActionType } from '@/models/datasets' import { DatasourceType } from '@/models/pipeline' -import { useDocumentArchive, useDocumentBatchRetryIndex, useDocumentDelete, useDocumentDisable, useDocumentEnable } from '@/service/knowledge/use-document' +import { useDocumentArchive, useDocumentBatchRetryIndex, useDocumentDelete, useDocumentDisable, useDocumentDownloadZip, useDocumentEnable } from '@/service/knowledge/use-document' import { asyncRunSafe } from '@/utils' import { cn } from '@/utils/classnames' +import { downloadBlob } from '@/utils/download' import { formatNumber } from '@/utils/format' import BatchAction from '../detail/completed/common/batch-action' import StatusItem from '../status-item' @@ -222,6 +223,7 @@ const DocumentList: FC = ({ const { mutateAsync: disableDocument } = useDocumentDisable() const { mutateAsync: deleteDocument } = useDocumentDelete() const { mutateAsync: retryIndexDocument } = useDocumentBatchRetryIndex() + const { mutateAsync: requestDocumentsZip, isPending: isDownloadingZip } = useDocumentDownloadZip() const handleAction = (actionName: DocumentActionType) => { return async () => { @@ -300,6 +302,39 @@ const DocumentList: FC = ({ return dataSourceType === DatasourceType.onlineDrive }, []) + const downloadableSelectedIds = useMemo(() => { + const selectedSet = new Set(selectedIds) + return localDocs + .filter(doc => selectedSet.has(doc.id) && doc.data_source_type === DataSourceType.FILE) + .map(doc => doc.id) + }, [localDocs, selectedIds]) + + /** + * Generate a random ZIP filename for bulk document downloads. + * We intentionally avoid leaking dataset info in the exported archive name. + */ + const generateDocsZipFileName = useCallback((): string => { + // Prefer UUID for uniqueness; fall back to time+random when unavailable. + const randomPart = (typeof crypto !== 'undefined' && typeof crypto.randomUUID === 'function') + ? crypto.randomUUID() + : `${Date.now().toString(36)}${Math.random().toString(36).slice(2, 10)}` + return `${randomPart}-docs.zip` + }, []) + + const handleBatchDownload = useCallback(async () => { + if (isDownloadingZip) + return + + // Download as a single ZIP to avoid browser caps on multiple automatic downloads. + const [e, blob] = await asyncRunSafe(requestDocumentsZip({ datasetId, documentIds: downloadableSelectedIds })) + if (e || !blob) { + Toast.notify({ type: 'error', message: t('actionMsg.downloadUnsuccessfully', { ns: 'common' }) }) + return + } + + downloadBlob({ data: blob, fileName: generateDocsZipFileName() }) + }, [datasetId, downloadableSelectedIds, generateDocsZipFileName, isDownloadingZip, requestDocumentsZip, t]) + return (
@@ -463,6 +498,7 @@ const DocumentList: FC = ({ onArchive={handleAction(DocumentActionType.archive)} onBatchEnable={handleAction(DocumentActionType.enable)} onBatchDisable={handleAction(DocumentActionType.disable)} + onBatchDownload={downloadableSelectedIds.length > 0 ? handleBatchDownload : undefined} onBatchDelete={handleAction(DocumentActionType.delete)} onEditMetadata={showEditModal} onBatchReIndex={hasErrorDocumentsSelected ? handleBatchReIndex : undefined} diff --git a/web/app/components/datasets/documents/components/operations.tsx b/web/app/components/datasets/documents/components/operations.tsx index 0d3c40c053..ee638c5e12 100644 --- a/web/app/components/datasets/documents/components/operations.tsx +++ b/web/app/components/datasets/documents/components/operations.tsx @@ -1,8 +1,10 @@ import type { OperationName } from '../types' import type { CommonResponse } from '@/models/common' +import type { DocumentDownloadResponse } from '@/service/datasets' import { RiArchive2Line, RiDeleteBinLine, + RiDownload2Line, RiEditLine, RiEqualizer2Line, RiLoopLeftLine, @@ -28,6 +30,7 @@ import { useDocumentArchive, useDocumentDelete, useDocumentDisable, + useDocumentDownload, useDocumentEnable, useDocumentPause, useDocumentResume, @@ -37,6 +40,7 @@ import { } from '@/service/knowledge/use-document' import { asyncRunSafe } from '@/utils' import { cn } from '@/utils/classnames' +import { downloadUrl } from '@/utils/download' import s from '../style.module.css' import RenameModal from './rename-modal' @@ -69,7 +73,7 @@ const Operations = ({ scene = 'list', className = '', }: OperationsProps) => { - const { id, enabled = false, archived = false, data_source_type, display_status } = detail || {} + const { id, name, enabled = false, archived = false, data_source_type, display_status } = detail || {} const [showModal, setShowModal] = useState(false) const [deleting, setDeleting] = useState(false) const { notify } = useContext(ToastContext) @@ -80,6 +84,7 @@ const Operations = ({ const { mutateAsync: enableDocument } = useDocumentEnable() const { mutateAsync: disableDocument } = useDocumentDisable() const { mutateAsync: deleteDocument } = useDocumentDelete() + const { mutateAsync: downloadDocument, isPending: isDownloading } = useDocumentDownload() const { mutateAsync: syncDocument } = useSyncDocument() const { mutateAsync: syncWebsite } = useSyncWebsite() const { mutateAsync: pauseDocument } = useDocumentPause() @@ -158,6 +163,24 @@ const Operations = ({ onUpdate() }, [onUpdate]) + const handleDownload = useCallback(async () => { + // Avoid repeated clicks while the signed URL request is in-flight. + if (isDownloading) + return + + // Request a signed URL first (it points to `/files//file-preview?...&as_attachment=true`). + const [e, res] = await asyncRunSafe( + downloadDocument({ datasetId, documentId: id }) as Promise, + ) + if (e || !res?.url) { + notify({ type: 'error', message: t('actionMsg.downloadUnsuccessfully', { ns: 'common' }) }) + return + } + + // Trigger download without navigating away (helps avoid duplicate downloads in some browsers). + downloadUrl({ url: res.url, fileName: name }) + }, [datasetId, downloadDocument, id, isDownloading, name, notify, t]) + return (
e.stopPropagation()}> {isListScene && !embeddingAvailable && ( @@ -214,6 +237,20 @@ const Operations = ({ {t('list.table.rename', { ns: 'datasetDocuments' })}
+ {data_source_type === DataSourceType.FILE && ( +
{ + evt.preventDefault() + evt.stopPropagation() + evt.nativeEvent.stopImmediatePropagation?.() + handleDownload() + }} + > + + {t('list.action.download', { ns: 'datasetDocuments' })} +
+ )} {['notion_import', DataSourceType.WEB].includes(data_source_type) && (
onOperate('sync')}> @@ -223,6 +260,23 @@ const Operations = ({ )} + {archived && data_source_type === DataSourceType.FILE && ( + <> +
{ + evt.preventDefault() + evt.stopPropagation() + evt.nativeEvent.stopImmediatePropagation?.() + handleDownload() + }} + > + + {t('list.action.download', { ns: 'datasetDocuments' })} +
+ + + )} {!archived && display_status?.toLowerCase() === 'indexing' && (
onOperate('pause')}> diff --git a/web/app/components/datasets/documents/detail/completed/common/batch-action.tsx b/web/app/components/datasets/documents/detail/completed/common/batch-action.tsx index db36ca471a..2de72d9ff6 100644 --- a/web/app/components/datasets/documents/detail/completed/common/batch-action.tsx +++ b/web/app/components/datasets/documents/detail/completed/common/batch-action.tsx @@ -1,5 +1,5 @@ import type { FC } from 'react' -import { RiArchive2Line, RiCheckboxCircleLine, RiCloseCircleLine, RiDeleteBinLine, RiDraftLine, RiRefreshLine } from '@remixicon/react' +import { RiArchive2Line, RiCheckboxCircleLine, RiCloseCircleLine, RiDeleteBinLine, RiDownload2Line, RiDraftLine, RiRefreshLine } from '@remixicon/react' import { useBoolean } from 'ahooks' import * as React from 'react' import { useTranslation } from 'react-i18next' @@ -14,6 +14,7 @@ type IBatchActionProps = { selectedIds: string[] onBatchEnable: () => void onBatchDisable: () => void + onBatchDownload?: () => void onBatchDelete: () => Promise onArchive?: () => void onEditMetadata?: () => void @@ -26,6 +27,7 @@ const BatchAction: FC = ({ selectedIds, onBatchEnable, onBatchDisable, + onBatchDownload, onArchive, onBatchDelete, onEditMetadata, @@ -103,6 +105,16 @@ const BatchAction: FC = ({ {t(`${i18nPrefix}.reIndex`, { ns: 'dataset' })} )} + {onBatchDownload && ( + + )} +
+ ) + : null + ), +})) + +// ============================================================================ +// Test Data Factory +// ============================================================================ + +const createMockRelatedApp = (overrides: Partial = {}): RelatedApp => ({ + id: 'app-1', + name: 'Test App', + mode: AppModeEnum.COMPLETION, + icon: 'icon-url', + icon_type: 'image', + icon_background: '#fff', + icon_url: '', + ...overrides, +}) + +const createMockRelatedAppsResponse = (count: number = 2): RelatedAppResponse => ({ + data: Array.from({ length: count }, (_, i) => + createMockRelatedApp({ id: `app-${i + 1}`, name: `App ${i + 1}` })), + total: count, +}) + +// ============================================================================ +// Statistics Component Tests +// ============================================================================ + +describe('Statistics', () => { + beforeEach(() => { + vi.clearAllMocks() + }) + + describe('Rendering', () => { + it('should render without crashing', () => { + render( + , + ) + + expect(screen.getByText('10')).toBeInTheDocument() + }) + + it('should render document count correctly', () => { + render( + , + ) + + expect(screen.getByText('42')).toBeInTheDocument() + }) + + it('should render related apps total correctly', () => { + const relatedApps = createMockRelatedAppsResponse(5) + + render( + , + ) + + expect(screen.getByText('5')).toBeInTheDocument() + }) + + it('should display translated document label', () => { + render( + , + ) + + expect(screen.getByText(/documents/i)).toBeInTheDocument() + }) + + it('should display translated related app label', () => { + render( + , + ) + + expect(screen.getByText(/relatedApp/i)).toBeInTheDocument() + }) + }) + + describe('Edge Cases', () => { + it('should render placeholder when documentCount is undefined', () => { + render( + , + ) + + expect(screen.getByText('--')).toBeInTheDocument() + }) + + it('should render placeholder when relatedApps is undefined', () => { + render( + , + ) + + expect(screen.getAllByText('--').length).toBeGreaterThanOrEqual(1) + }) + + it('should handle zero document count', () => { + render( + , + ) + + expect(screen.getByText('0')).toBeInTheDocument() + }) + + it('should handle empty related apps array', () => { + const emptyRelatedApps: RelatedAppResponse = { data: [], total: 0 } + + render( + , + ) + + expect(screen.getByText('0')).toBeInTheDocument() + }) + + it('should handle large numbers correctly', () => { + render( + , + ) + + expect(screen.getByText('999999')).toBeInTheDocument() + expect(screen.getByText('100')).toBeInTheDocument() + }) + }) + + describe('Tooltip Interactions', () => { + it('should render tooltip trigger with info icon', () => { + render( + , + ) + + // Find the cursor-pointer element containing the relatedApp text + const tooltipTrigger = screen.getByText(/relatedApp/i).closest('.cursor-pointer') + expect(tooltipTrigger).toBeInTheDocument() + }) + + it('should render LinkedAppsPanel when related apps exist', async () => { + const relatedApps = createMockRelatedAppsResponse(3) + + render( + , + ) + + // The LinkedAppsPanel should be rendered inside the tooltip + // We can't easily test tooltip content in this context without more setup + // But we verify the condition logic works by checking component renders + expect(screen.getByText('3')).toBeInTheDocument() + }) + + it('should render NoLinkedAppsPanel when no related apps', () => { + const emptyRelatedApps: RelatedAppResponse = { data: [], total: 0 } + + render( + , + ) + + // Verify component renders correctly with empty apps + expect(screen.getByText('0')).toBeInTheDocument() + }) + }) + + describe('Props Variations', () => { + it('should handle expand=false', () => { + render( + , + ) + + // Component should still render with expand=false + expect(screen.getByText('10')).toBeInTheDocument() + }) + + it('should pass isMobile based on expand prop', () => { + // When expand is false, isMobile should be true (!expand) + render( + , + ) + + // Component renders - the isMobile logic is internal + expect(screen.getByText('10')).toBeInTheDocument() + }) + }) + + describe('Memoization', () => { + it('should be memoized with React.memo', () => { + const { rerender } = render( + , + ) + + // Rerender with same props + rerender( + , + ) + + // Component should not cause unnecessary re-renders + expect(screen.getByText('10')).toBeInTheDocument() + }) + }) +}) + +// ============================================================================ +// ApiAccess Component Tests +// ============================================================================ + +describe('ApiAccess', () => { + beforeEach(() => { + vi.clearAllMocks() + }) + + describe('Rendering', () => { + it('should render without crashing', () => { + render( + , + ) + + expect(screen.getByText(/appMenus\.apiAccess/i)).toBeInTheDocument() + }) + + it('should render API title when expanded', () => { + render( + , + ) + + expect(screen.getByText(/appMenus\.apiAccess/i)).toBeInTheDocument() + }) + + it('should not render API title when collapsed', () => { + render( + , + ) + + expect(screen.queryByText(/appMenus\.apiAccess/i)).not.toBeInTheDocument() + }) + + it('should render indicator when API is enabled', () => { + const { container } = render( + , + ) + + // Indicator component should be present + const indicatorElement = container.querySelector('.relative.flex.h-8') + expect(indicatorElement).toBeInTheDocument() + }) + + it('should render indicator when API is disabled', () => { + const { container } = render( + , + ) + + // Indicator component should be present + const indicatorElement = container.querySelector('.relative.flex.h-8') + expect(indicatorElement).toBeInTheDocument() + }) + }) + + describe('User Interactions', () => { + it('should toggle popup open state on click', async () => { + const user = userEvent.setup() + + render( + , + ) + + const trigger = screen.getByText(/appMenus\.apiAccess/i).closest('[class*="cursor-pointer"]') + expect(trigger).toBeInTheDocument() + + if (trigger) { + await user.click(trigger) + // After click, the Card component should be rendered in the portal + } + }) + + it('should apply hover styles on trigger', () => { + render( + , + ) + + const trigger = screen.getByText(/appMenus\.apiAccess/i).closest('div[class*="cursor-pointer"]') + expect(trigger).toHaveClass('cursor-pointer') + }) + }) + + describe('Props Variations', () => { + it('should apply compressed layout when expand is false', () => { + const { container } = render( + , + ) + + // When collapsed, width should be w-8 + const triggerContainer = container.querySelector('[class*="w-8"]') + expect(triggerContainer).toBeInTheDocument() + }) + + it('should pass apiEnabled to Card component', async () => { + const user = userEvent.setup() + + render( + , + ) + + const trigger = screen.getByText(/appMenus\.apiAccess/i).closest('[class*="cursor-pointer"]') + if (trigger) { + await user.click(trigger) + // The apiEnabled should be passed to Card + } + }) + }) + + describe('Memoization', () => { + it('should be memoized with React.memo', () => { + const { rerender } = render( + , + ) + + rerender( + , + ) + + expect(screen.getByText(/appMenus\.apiAccess/i)).toBeInTheDocument() + }) + }) +}) + +// ============================================================================ +// ApiAccessCard Component Tests +// ============================================================================ + +describe('ApiAccessCard', () => { + beforeEach(() => { + vi.clearAllMocks() + mockIsCurrentWorkspaceManager = true + mockEnableDatasetServiceApi.mockResolvedValue({ result: 'success' }) + mockDisableDatasetServiceApi.mockResolvedValue({ result: 'success' }) + }) + + describe('Rendering', () => { + it('should render without crashing', () => { + render( + , + ) + + expect(screen.getByText(/serviceApi\.enabled/i)).toBeInTheDocument() + }) + + it('should display enabled status when API is enabled', () => { + render( + , + ) + + expect(screen.getByText(/serviceApi\.enabled/i)).toBeInTheDocument() + }) + + it('should display disabled status when API is disabled', () => { + render( + , + ) + + expect(screen.getByText(/serviceApi\.disabled/i)).toBeInTheDocument() + }) + + it('should render API Reference link', () => { + render( + , + ) + + expect(screen.getByText(/overview\.apiInfo\.doc/i)).toBeInTheDocument() + }) + + it('should render switch component', () => { + render( + , + ) + + expect(screen.getByRole('switch')).toBeInTheDocument() + }) + }) + + describe('User Interactions', () => { + it('should call enableDatasetServiceApi when switch is toggled on', async () => { + const user = userEvent.setup() + + render( + , + ) + + const switchButton = screen.getByRole('switch') + await user.click(switchButton) + + await waitFor(() => { + expect(mockEnableDatasetServiceApi).toHaveBeenCalledWith('dataset-123') + }) + }) + + it('should call disableDatasetServiceApi when switch is toggled off', async () => { + const user = userEvent.setup() + + render( + , + ) + + const switchButton = screen.getByRole('switch') + await user.click(switchButton) + + await waitFor(() => { + expect(mockDisableDatasetServiceApi).toHaveBeenCalledWith('dataset-123') + }) + }) + + it('should call mutateDatasetRes after successful API toggle', async () => { + const user = userEvent.setup() + + render( + , + ) + + const switchButton = screen.getByRole('switch') + await user.click(switchButton) + + await waitFor(() => { + expect(mockMutateDatasetRes).toHaveBeenCalled() + }) + }) + + it('should not call mutateDatasetRes on API toggle failure', async () => { + mockEnableDatasetServiceApi.mockResolvedValueOnce({ result: 'fail' }) + const user = userEvent.setup() + + render( + , + ) + + const switchButton = screen.getByRole('switch') + await user.click(switchButton) + + await waitFor(() => { + expect(mockEnableDatasetServiceApi).toHaveBeenCalled() + }) + + // mutateDatasetRes should not be called on failure + expect(mockMutateDatasetRes).not.toHaveBeenCalled() + }) + + it('should have correct href for API Reference link', () => { + render( + , + ) + + const apiRefLink = screen.getByText(/overview\.apiInfo\.doc/i).closest('a') + expect(apiRefLink).toHaveAttribute('href', 'https://docs.dify.ai/api-reference/datasets') + }) + }) + + describe('Permission Handling', () => { + it('should disable switch when user is not workspace manager', () => { + mockIsCurrentWorkspaceManager = false + + render( + , + ) + + const switchButton = screen.getByRole('switch') + // Headless UI Switch uses CSS classes for disabled state + expect(switchButton).toHaveClass('!cursor-not-allowed') + expect(switchButton).toHaveClass('!opacity-50') + }) + + it('should enable switch when user is workspace manager', () => { + mockIsCurrentWorkspaceManager = true + + render( + , + ) + + const switchButton = screen.getByRole('switch') + expect(switchButton).not.toHaveClass('!cursor-not-allowed') + expect(switchButton).not.toHaveClass('!opacity-50') + }) + }) + + describe('Memoization', () => { + it('should be memoized with React.memo', () => { + const { rerender } = render( + , + ) + + rerender( + , + ) + + expect(screen.getByText(/serviceApi\.enabled/i)).toBeInTheDocument() + }) + + it('should use useCallback for handlers', () => { + // Verify handlers are stable by rendering multiple times + const { rerender } = render( + , + ) + + rerender( + , + ) + + // Component should render without issues with memoized callbacks + expect(screen.getByRole('switch')).toBeInTheDocument() + }) + }) +}) + +// ============================================================================ +// ExtraInfo (Main Component) Tests +// ============================================================================ + +describe('ExtraInfo', () => { + beforeEach(() => { + vi.clearAllMocks() + }) + + describe('Rendering', () => { + it('should render without crashing', () => { + render( + , + ) + + // Should render ApiAccess component + expect(screen.getByText(/appMenus\.apiAccess/i)).toBeInTheDocument() + }) + + it('should render Statistics when expand is true', () => { + render( + , + ) + + // Statistics shows document count + expect(screen.getByText('10')).toBeInTheDocument() + }) + + it('should not render Statistics when expand is false', () => { + render( + , + ) + + // Document count should not be visible when collapsed + expect(screen.queryByText('10')).not.toBeInTheDocument() + }) + + it('should always render ApiAccess regardless of expand state', () => { + const { rerender } = render( + , + ) + + // Check expanded state has ApiAccess title + expect(screen.getByText(/appMenus\.apiAccess/i)).toBeInTheDocument() + + rerender( + , + ) + + // ApiAccess should still be present (but without title text when collapsed) + // The component is still rendered, just with different styling + }) + }) + + describe('Context Integration', () => { + it('should read apiEnabled from dataset detail context', () => { + render( + , + ) + + // Since mockDataset has enable_api: true, the indicator should be green + expect(screen.getByText(/appMenus\.apiAccess/i)).toBeInTheDocument() + }) + + it('should read apiBaseUrl from useDatasetApiBaseUrl hook', () => { + render( + , + ) + + // Component should render with the mocked API base URL + expect(screen.getByText(/appMenus\.apiAccess/i)).toBeInTheDocument() + }) + + it('should handle missing apiBaseInfo with fallback empty string', async () => { + const { useDatasetApiBaseUrl } = await import('@/service/knowledge/use-dataset') + vi.mocked(useDatasetApiBaseUrl).mockReturnValue({ + data: undefined, + isLoading: false, + } as ReturnType) + + render( + , + ) + + expect(screen.getByText(/appMenus\.apiAccess/i)).toBeInTheDocument() + + // Reset mock + vi.mocked(useDatasetApiBaseUrl).mockReturnValue({ + data: { api_base_url: 'https://api.example.com' }, + isLoading: false, + } as ReturnType) + }) + + it('should handle missing apiEnabled with fallback false', async () => { + const { useDatasetDetailContextWithSelector } = await import('@/context/dataset-detail') + vi.mocked(useDatasetDetailContextWithSelector).mockImplementation((selector) => { + // Simulate dataset without enable_api by using a partial dataset + const partialDataset = { ...mockDataset } as Partial + delete (partialDataset as { enable_api?: boolean }).enable_api + return selector({ + dataset: partialDataset as DataSet, + mutateDatasetRes: vi.fn(), + }) + }) + + render( + , + ) + + expect(screen.getByText(/appMenus\.apiAccess/i)).toBeInTheDocument() + + // Reset mock + vi.mocked(useDatasetDetailContextWithSelector).mockImplementation(selector => + selector({ dataset: mockDataset as DataSet, mutateDatasetRes: vi.fn() }), + ) + }) + }) + + describe('Props Variations', () => { + it('should pass expand prop to Statistics component', () => { + render( + , + ) + + expect(screen.getByText('10')).toBeInTheDocument() + }) + + it('should pass expand prop to ApiAccess component', () => { + render( + , + ) + + expect(screen.getByText(/appMenus\.apiAccess/i)).toBeInTheDocument() + }) + + it('should pass documentCount to Statistics component', () => { + render( + , + ) + + expect(screen.getByText('99')).toBeInTheDocument() + }) + + it('should pass relatedApps to Statistics component', () => { + const relatedApps = createMockRelatedAppsResponse(7) + + render( + , + ) + + expect(screen.getByText('7')).toBeInTheDocument() + }) + }) + + describe('Edge Cases', () => { + it('should handle undefined documentCount', () => { + render( + , + ) + + expect(screen.getByText('--')).toBeInTheDocument() + }) + + it('should handle undefined relatedApps', () => { + render( + , + ) + + expect(screen.getByText('10')).toBeInTheDocument() + }) + + it('should handle all undefined optional props', () => { + render( + , + ) + + // Should render without crashing + expect(screen.getByText(/appMenus\.apiAccess/i)).toBeInTheDocument() + }) + + it('should handle zero values correctly', () => { + const emptyRelatedApps: RelatedAppResponse = { data: [], total: 0 } + + render( + , + ) + + expect(screen.getAllByText('0')).toHaveLength(2) + }) + }) + + describe('Memoization', () => { + it('should be memoized with React.memo', () => { + const { rerender } = render( + , + ) + + // Rerender with same props + rerender( + , + ) + + expect(screen.getByText('10')).toBeInTheDocument() + }) + + it('should update when props change', () => { + const { rerender } = render( + , + ) + + expect(screen.getByText('10')).toBeInTheDocument() + + rerender( + , + ) + + expect(screen.getByText('20')).toBeInTheDocument() + }) + + it('should hide Statistics when expand changes to false', () => { + const { rerender } = render( + , + ) + + expect(screen.getByText('10')).toBeInTheDocument() + + rerender( + , + ) + + expect(screen.queryByText('10')).not.toBeInTheDocument() + }) + }) + + describe('Component Composition', () => { + it('should render Statistics before ApiAccess when expanded', () => { + const { container } = render( + , + ) + + // Statistics should appear before ApiAccess in DOM order + const elements = container.querySelectorAll('div') + expect(elements.length).toBeGreaterThan(0) + }) + + it('should render only ApiAccess when collapsed', () => { + render( + , + ) + + // Only ApiAccess should be rendered (without its title in collapsed state) + expect(screen.queryByText('10')).not.toBeInTheDocument() + }) + }) +}) + +// ============================================================================ +// Integration Tests +// ============================================================================ + +describe('ExtraInfo Integration', () => { + beforeEach(() => { + vi.clearAllMocks() + }) + + it('should render complete expanded view with all child components', () => { + render( + , + ) + + // Statistics content + expect(screen.getByText('25')).toBeInTheDocument() + expect(screen.getByText('5')).toBeInTheDocument() + + // ApiAccess content + expect(screen.getByText(/appMenus\.apiAccess/i)).toBeInTheDocument() + }) + + it('should handle complete user workflow: view stats and toggle API', async () => { + const user = userEvent.setup() + + render( + , + ) + + // Verify statistics are visible + expect(screen.getByText('10')).toBeInTheDocument() + expect(screen.getByText('3')).toBeInTheDocument() + + // Click on ApiAccess to open the card + const apiAccessTrigger = screen.getByText(/appMenus\.apiAccess/i).closest('[class*="cursor-pointer"]') + if (apiAccessTrigger) + await user.click(apiAccessTrigger) + + // The popup should open with Card content (showing enabled/disabled status) + await waitFor(() => { + expect(screen.getByText(/serviceApi\.enabled/i)).toBeInTheDocument() + }) + }) + + it('should integrate with context correctly across all components', async () => { + render( + , + ) + + // The component tree should correctly receive context values + // apiEnabled from context affects ApiAccess indicator color + expect(screen.getByText(/appMenus\.apiAccess/i)).toBeInTheDocument() + }) +}) diff --git a/web/app/components/datasets/hit-testing/index.spec.tsx b/web/app/components/datasets/hit-testing/index.spec.tsx new file mode 100644 index 0000000000..45c68e44b1 --- /dev/null +++ b/web/app/components/datasets/hit-testing/index.spec.tsx @@ -0,0 +1,2654 @@ +import type { ReactNode } from 'react' +import type { DataSet, HitTesting, HitTestingChildChunk, HitTestingRecord, HitTestingResponse, Query } from '@/models/datasets' +import type { RetrievalConfig } from '@/types/app' +import { QueryClient, QueryClientProvider } from '@tanstack/react-query' +import { fireEvent, render, screen, waitFor } from '@testing-library/react' +import { describe, expect, it, vi } from 'vitest' +import { FileAppearanceTypeEnum } from '@/app/components/base/file-uploader/types' +import { RETRIEVE_METHOD } from '@/types/app' + +// ============================================================================ +// Imports (after mocks) +// ============================================================================ + +import ChildChunksItem from './components/child-chunks-item' +import ChunkDetailModal from './components/chunk-detail-modal' +import EmptyRecords from './components/empty-records' +import Mask from './components/mask' +import QueryInput from './components/query-input' +import Textarea from './components/query-input/textarea' +import Records from './components/records' +import ResultItem from './components/result-item' +import ResultItemExternal from './components/result-item-external' +import ResultItemFooter from './components/result-item-footer' +import ResultItemMeta from './components/result-item-meta' +import Score from './components/score' +import HitTestingPage from './index' +import ModifyExternalRetrievalModal from './modify-external-retrieval-modal' +import ModifyRetrievalModal from './modify-retrieval-modal' +import { extensionToFileType } from './utils/extension-to-file-type' + +// Mock Toast +// Note: These components use real implementations for integration testing: +// - Toast, FloatRightContainer, Drawer, Pagination, Loading +// - RetrievalMethodConfig, EconomicalRetrievalMethodConfig +// - ImageUploaderInRetrievalTesting, retrieval-method-info, check-rerank-model + +// Mock RetrievalSettings to allow triggering onChange +vi.mock('@/app/components/datasets/external-knowledge-base/create/RetrievalSettings', () => ({ + default: ({ onChange }: { onChange: (data: { top_k?: number, score_threshold?: number, score_threshold_enabled?: boolean }) => void }) => { + return ( +
+ + + +
+ ) + }, +})) + +// ============================================================================ +// Mock Setup +// ============================================================================ + +// Mock next/navigation +vi.mock('next/navigation', () => ({ + useRouter: () => ({ + push: vi.fn(), + replace: vi.fn(), + }), + usePathname: () => '/test', + useSearchParams: () => new URLSearchParams(), +})) + +// Mock use-context-selector +const mockDataset = { + id: 'dataset-1', + name: 'Test Dataset', + provider: 'vendor', + indexing_technique: 'high_quality' as const, + retrieval_model_dict: { + search_method: RETRIEVE_METHOD.semantic, + reranking_enable: false, + reranking_mode: undefined, + reranking_model: { + reranking_provider_name: '', + reranking_model_name: '', + }, + weights: undefined, + top_k: 10, + score_threshold_enabled: false, + score_threshold: 0.5, + }, + is_multimodal: false, +} as Partial + +vi.mock('use-context-selector', () => ({ + useContext: vi.fn(() => ({ dataset: mockDataset })), + useContextSelector: vi.fn((_, selector) => selector({ dataset: mockDataset })), + createContext: vi.fn(() => ({})), +})) + +// Mock dataset detail context +vi.mock('@/context/dataset-detail', () => ({ + default: {}, + useDatasetDetailContext: vi.fn(() => ({ dataset: mockDataset })), + useDatasetDetailContextWithSelector: vi.fn((selector: (v: { dataset?: typeof mockDataset }) => unknown) => + selector({ dataset: mockDataset as DataSet }), + ), +})) + +// Mock service hooks +const mockRecordsRefetch = vi.fn() +const mockHitTestingMutateAsync = vi.fn() +const mockExternalHitTestingMutateAsync = vi.fn() + +vi.mock('@/service/knowledge/use-dataset', () => ({ + useDatasetTestingRecords: vi.fn(() => ({ + data: { + data: [], + total: 0, + page: 1, + limit: 10, + has_more: false, + }, + refetch: mockRecordsRefetch, + isLoading: false, + })), +})) + +vi.mock('@/service/knowledge/use-hit-testing', () => ({ + useHitTesting: vi.fn(() => ({ + mutateAsync: mockHitTestingMutateAsync, + isPending: false, + })), + useExternalKnowledgeBaseHitTesting: vi.fn(() => ({ + mutateAsync: mockExternalHitTestingMutateAsync, + isPending: false, + })), +})) + +// Mock breakpoints hook +vi.mock('@/hooks/use-breakpoints', () => ({ + default: vi.fn(() => 'pc'), + MediaType: { + mobile: 'mobile', + pc: 'pc', + }, +})) + +// Mock timestamp hook +vi.mock('@/hooks/use-timestamp', () => ({ + default: vi.fn(() => ({ + formatTime: vi.fn((timestamp: number, _format: string) => new Date(timestamp * 1000).toISOString()), + })), +})) + +// Mock use-common to avoid QueryClient issues in nested hooks +vi.mock('@/service/use-common', () => ({ + useFileUploadConfig: vi.fn(() => ({ + data: { + file_size_limit: 10, + batch_count_limit: 5, + image_file_size_limit: 5, + }, + isLoading: false, + })), +})) + +// Store ref to ImageUploader onChange for testing +let mockImageUploaderOnChange: ((files: Array<{ sourceUrl?: string, uploadedId?: string, mimeType: string, name: string, size: number, extension: string }>) => void) | null = null + +// Mock ImageUploaderInRetrievalTesting to capture onChange +vi.mock('@/app/components/datasets/common/image-uploader/image-uploader-in-retrieval-testing', () => ({ + default: ({ textArea, actionButton, onChange }: { + textArea: React.ReactNode + actionButton: React.ReactNode + onChange: (files: Array<{ sourceUrl?: string, uploadedId?: string, mimeType: string, name: string, size: number, extension: string }>) => void + }) => { + mockImageUploaderOnChange = onChange + return ( +
+ {textArea} + {actionButton} + +
+ ) + }, +})) + +// Mock docLink hook +vi.mock('@/context/i18n', () => ({ + useDocLink: vi.fn(() => () => 'https://docs.example.com'), +})) + +// Mock provider context for retrieval method config +vi.mock('@/context/provider-context', () => ({ + useProviderContext: vi.fn(() => ({ + supportRetrievalMethods: [ + 'semantic_search', + 'full_text_search', + 'hybrid_search', + ], + })), +})) + +// Mock model list hook - include all exports used by child components +vi.mock('@/app/components/header/account-setting/model-provider-page/hooks', () => ({ + useModelList: vi.fn(() => ({ + data: [], + isLoading: false, + })), + useModelListAndDefaultModelAndCurrentProviderAndModel: vi.fn(() => ({ + modelList: [], + defaultModel: undefined, + currentProvider: undefined, + currentModel: undefined, + })), + useModelListAndDefaultModel: vi.fn(() => ({ + modelList: [], + defaultModel: undefined, + })), + useCurrentProviderAndModel: vi.fn(() => ({ + currentProvider: undefined, + currentModel: undefined, + })), + useDefaultModel: vi.fn(() => ({ + defaultModel: undefined, + })), +})) + +// ============================================================================ +// Test Wrapper with QueryClientProvider +// ============================================================================ + +const createTestQueryClient = () => new QueryClient({ + defaultOptions: { + queries: { + retry: false, + gcTime: 0, + }, + mutations: { + retry: false, + }, + }, +}) + +const TestWrapper = ({ children }: { children: ReactNode }) => { + const queryClient = createTestQueryClient() + return ( + + {children} + + ) +} + +const renderWithProviders = (ui: React.ReactElement) => { + return render(ui, { wrapper: TestWrapper }) +} + +// ============================================================================ +// Test Factories +// ============================================================================ + +const createMockSegment = (overrides = {}) => ({ + id: 'segment-1', + document: { + id: 'doc-1', + data_source_type: 'upload_file', + name: 'test-document.pdf', + doc_type: 'book' as const, + }, + content: 'Test segment content', + sign_content: 'Test signed content', + position: 1, + word_count: 100, + tokens: 50, + keywords: ['test', 'keyword'], + hit_count: 5, + index_node_hash: 'hash-123', + answer: '', + ...overrides, +}) + +const createMockHitTesting = (overrides = {}): HitTesting => ({ + segment: createMockSegment() as HitTesting['segment'], + content: createMockSegment() as HitTesting['content'], + score: 0.85, + tsne_position: { x: 0.5, y: 0.5 }, + child_chunks: null, + files: [], + ...overrides, +}) + +const createMockChildChunk = (overrides = {}): HitTestingChildChunk => ({ + id: 'child-chunk-1', + content: 'Child chunk content', + position: 1, + score: 0.9, + ...overrides, +}) + +const createMockRecord = (overrides = {}): HitTestingRecord => ({ + id: 'record-1', + source: 'hit_testing', + source_app_id: 'app-1', + created_by_role: 'account', + created_by: 'user-1', + created_at: 1609459200, + queries: [ + { content: 'Test query', content_type: 'text_query', file_info: null }, + ], + ...overrides, +}) + +const createMockRetrievalConfig = (overrides = {}): RetrievalConfig => ({ + search_method: RETRIEVE_METHOD.semantic, + reranking_enable: false, + reranking_mode: undefined, + reranking_model: { + reranking_provider_name: '', + reranking_model_name: '', + }, + weights: undefined, + top_k: 10, + score_threshold_enabled: false, + score_threshold: 0.5, + ...overrides, +} as RetrievalConfig) + +// ============================================================================ +// Utility Function Tests +// ============================================================================ + +describe('extensionToFileType', () => { + describe('PDF files', () => { + it('should return pdf type for pdf extension', () => { + expect(extensionToFileType('pdf')).toBe(FileAppearanceTypeEnum.pdf) + }) + }) + + describe('Word files', () => { + it('should return word type for doc extension', () => { + expect(extensionToFileType('doc')).toBe(FileAppearanceTypeEnum.word) + }) + + it('should return word type for docx extension', () => { + expect(extensionToFileType('docx')).toBe(FileAppearanceTypeEnum.word) + }) + }) + + describe('Markdown files', () => { + it('should return markdown type for md extension', () => { + expect(extensionToFileType('md')).toBe(FileAppearanceTypeEnum.markdown) + }) + + it('should return markdown type for mdx extension', () => { + expect(extensionToFileType('mdx')).toBe(FileAppearanceTypeEnum.markdown) + }) + + it('should return markdown type for markdown extension', () => { + expect(extensionToFileType('markdown')).toBe(FileAppearanceTypeEnum.markdown) + }) + }) + + describe('Excel files', () => { + it('should return excel type for csv extension', () => { + expect(extensionToFileType('csv')).toBe(FileAppearanceTypeEnum.excel) + }) + + it('should return excel type for xls extension', () => { + expect(extensionToFileType('xls')).toBe(FileAppearanceTypeEnum.excel) + }) + + it('should return excel type for xlsx extension', () => { + expect(extensionToFileType('xlsx')).toBe(FileAppearanceTypeEnum.excel) + }) + }) + + describe('Document files', () => { + it('should return document type for txt extension', () => { + expect(extensionToFileType('txt')).toBe(FileAppearanceTypeEnum.document) + }) + + it('should return document type for epub extension', () => { + expect(extensionToFileType('epub')).toBe(FileAppearanceTypeEnum.document) + }) + + it('should return document type for html extension', () => { + expect(extensionToFileType('html')).toBe(FileAppearanceTypeEnum.document) + }) + + it('should return document type for htm extension', () => { + expect(extensionToFileType('htm')).toBe(FileAppearanceTypeEnum.document) + }) + + it('should return document type for xml extension', () => { + expect(extensionToFileType('xml')).toBe(FileAppearanceTypeEnum.document) + }) + }) + + describe('PowerPoint files', () => { + it('should return ppt type for ppt extension', () => { + expect(extensionToFileType('ppt')).toBe(FileAppearanceTypeEnum.ppt) + }) + + it('should return ppt type for pptx extension', () => { + expect(extensionToFileType('pptx')).toBe(FileAppearanceTypeEnum.ppt) + }) + }) + + describe('Edge cases', () => { + it('should return custom type for unknown extension', () => { + expect(extensionToFileType('unknown')).toBe(FileAppearanceTypeEnum.custom) + }) + + it('should return custom type for empty string', () => { + expect(extensionToFileType('')).toBe(FileAppearanceTypeEnum.custom) + }) + }) +}) + +// ============================================================================ +// Score Component Tests +// ============================================================================ + +describe('Score', () => { + describe('Rendering', () => { + it('should render score with correct value', () => { + render() + expect(screen.getByText('0.85')).toBeInTheDocument() + expect(screen.getByText('score')).toBeInTheDocument() + }) + + it('should render nothing when value is null', () => { + const { container } = render() + expect(container.firstChild).toBeNull() + }) + + it('should render nothing when value is NaN', () => { + const { container } = render() + expect(container.firstChild).toBeNull() + }) + + it('should render nothing when value is 0', () => { + const { container } = render() + expect(container.firstChild).toBeNull() + }) + }) + + describe('Props', () => { + it('should apply besideChunkName styles when prop is true', () => { + const { container } = render() + const wrapper = container.firstChild as HTMLElement + expect(wrapper).toHaveClass('border-l-0') + }) + + it('should apply rounded styles when besideChunkName is false', () => { + const { container } = render() + const wrapper = container.firstChild as HTMLElement + expect(wrapper).toHaveClass('rounded-md') + }) + }) + + describe('Edge Cases', () => { + it('should display full score correctly', () => { + render() + expect(screen.getByText('1.00')).toBeInTheDocument() + }) + + it('should display very small score correctly', () => { + render() + expect(screen.getByText('0.01')).toBeInTheDocument() + }) + }) +}) + +// ============================================================================ +// Mask Component Tests +// ============================================================================ + +describe('Mask', () => { + describe('Rendering', () => { + it('should render without crashing', () => { + const { container } = render() + expect(container.firstChild).toBeInTheDocument() + }) + + it('should have gradient background class', () => { + const { container } = render() + expect(container.firstChild).toHaveClass('bg-gradient-to-b') + }) + }) + + describe('Props', () => { + it('should apply custom className', () => { + const { container } = render() + expect(container.firstChild).toHaveClass('custom-class') + }) + }) +}) + +// ============================================================================ +// EmptyRecords Component Tests +// ============================================================================ + +describe('EmptyRecords', () => { + describe('Rendering', () => { + it('should render without crashing', () => { + render() + expect(screen.getByText(/noRecentTip/i)).toBeInTheDocument() + }) + + it('should render history icon', () => { + const { container } = render() + const icon = container.querySelector('svg') + expect(icon).toBeInTheDocument() + }) + }) +}) + +// ============================================================================ +// ResultItemMeta Component Tests +// ============================================================================ + +describe('ResultItemMeta', () => { + const defaultProps = { + labelPrefix: 'Chunk', + positionId: 1, + wordCount: 100, + score: 0.85, + } + + describe('Rendering', () => { + it('should render without crashing', () => { + render() + expect(screen.getByText(/100/)).toBeInTheDocument() + }) + + it('should render score component', () => { + render() + expect(screen.getByText('0.85')).toBeInTheDocument() + }) + + it('should render word count', () => { + render() + expect(screen.getByText(/100/)).toBeInTheDocument() + }) + }) + + describe('Props', () => { + it('should apply custom className', () => { + const { container } = render() + expect(container.firstChild).toHaveClass('custom-class') + }) + + it('should handle different position IDs', () => { + render() + // Position ID is passed to SegmentIndexTag + expect(screen.getByText(/42/)).toBeInTheDocument() + }) + }) +}) + +// ============================================================================ +// ResultItemFooter Component Tests +// ============================================================================ + +describe('ResultItemFooter', () => { + const mockShowDetailModal = vi.fn() + const defaultProps = { + docType: FileAppearanceTypeEnum.pdf, + docTitle: 'Test Document.pdf', + showDetailModal: mockShowDetailModal, + } + + beforeEach(() => { + vi.clearAllMocks() + }) + + describe('Rendering', () => { + it('should render without crashing', () => { + render() + expect(screen.getByText('Test Document.pdf')).toBeInTheDocument() + }) + + it('should render open button', () => { + render() + expect(screen.getByText(/open/i)).toBeInTheDocument() + }) + }) + + describe('User Interactions', () => { + it('should call showDetailModal when open button is clicked', async () => { + render() + + const openButton = screen.getByText(/open/i).parentElement + if (openButton) + fireEvent.click(openButton) + + expect(mockShowDetailModal).toHaveBeenCalledTimes(1) + }) + }) +}) + +// ============================================================================ +// ChildChunksItem Component Tests +// ============================================================================ + +describe('ChildChunksItem', () => { + const mockChildChunk = createMockChildChunk() + + describe('Rendering', () => { + it('should render without crashing', () => { + render() + expect(screen.getByText(/Child chunk content/)).toBeInTheDocument() + }) + + it('should render position identifier', () => { + render() + // The C- and position number are in the same element + expect(screen.getByText(/C-/)).toBeInTheDocument() + }) + + it('should render score', () => { + render() + expect(screen.getByText('0.90')).toBeInTheDocument() + }) + }) + + describe('Props', () => { + it('should apply line-clamp when isShowAll is false', () => { + const { container } = render() + expect(container.firstChild).toHaveClass('line-clamp-2') + }) + + it('should not apply line-clamp when isShowAll is true', () => { + const { container } = render() + expect(container.firstChild).not.toHaveClass('line-clamp-2') + }) + }) +}) + +// ============================================================================ +// ResultItem Component Tests +// ============================================================================ + +describe('ResultItem', () => { + const mockHitTesting = createMockHitTesting() + + describe('Rendering', () => { + it('should render without crashing', () => { + render() + // Document name should be visible + expect(screen.getByText('test-document.pdf')).toBeInTheDocument() + }) + + it('should render score', () => { + render() + expect(screen.getByText('0.85')).toBeInTheDocument() + }) + + it('should render document name in footer', () => { + render() + expect(screen.getByText('test-document.pdf')).toBeInTheDocument() + }) + }) + + describe('User Interactions', () => { + it('should open detail modal when clicked', async () => { + render() + + const item = screen.getByText('test-document.pdf').closest('.cursor-pointer') + if (item) + fireEvent.click(item) + + await waitFor(() => { + expect(screen.getByText(/chunkDetail/i)).toBeInTheDocument() + }) + }) + }) + + describe('Parent-Child Retrieval', () => { + it('should render child chunks when present', () => { + const payloadWithChildren = createMockHitTesting({ + child_chunks: [createMockChildChunk()], + }) + + render() + expect(screen.getByText(/hitChunks/i)).toBeInTheDocument() + }) + + it('should toggle fold state when child chunks header is clicked', async () => { + const payloadWithChildren = createMockHitTesting({ + child_chunks: [createMockChildChunk()], + }) + + render() + + // Child chunks should be visible by default (not folded) + expect(screen.getByText(/Child chunk content/)).toBeInTheDocument() + + // Click to fold + const toggleButton = screen.getByText(/hitChunks/i).parentElement + if (toggleButton) { + fireEvent.click(toggleButton) + + await waitFor(() => { + expect(screen.queryByText(/Child chunk content/)).not.toBeInTheDocument() + }) + } + }) + }) + + describe('Keywords', () => { + it('should render keywords when present and no child chunks', () => { + const payload = createMockHitTesting({ + segment: createMockSegment({ keywords: ['keyword1', 'keyword2'] }), + child_chunks: null, + }) + + render() + expect(screen.getByText('keyword1')).toBeInTheDocument() + expect(screen.getByText('keyword2')).toBeInTheDocument() + }) + + it('should not render keywords when child chunks are present', () => { + const payload = createMockHitTesting({ + segment: createMockSegment({ keywords: ['keyword1'] }), + child_chunks: [createMockChildChunk()], + }) + + render() + expect(screen.queryByText('keyword1')).not.toBeInTheDocument() + }) + }) +}) + +// ============================================================================ +// ResultItemExternal Component Tests +// ============================================================================ + +describe('ResultItemExternal', () => { + const defaultProps = { + payload: { + content: 'External content', + title: 'External Title', + score: 0.75, + metadata: { + 'x-amz-bedrock-kb-source-uri': 'source-uri', + 'x-amz-bedrock-kb-data-source-id': 'data-source-id', + }, + }, + positionId: 1, + } + + describe('Rendering', () => { + it('should render without crashing', () => { + render() + expect(screen.getByText('External content')).toBeInTheDocument() + }) + + it('should render title in footer', () => { + render() + expect(screen.getByText('External Title')).toBeInTheDocument() + }) + + it('should render score', () => { + render() + expect(screen.getByText('0.75')).toBeInTheDocument() + }) + }) + + describe('User Interactions', () => { + it('should open detail modal when clicked', async () => { + render() + + const item = screen.getByText('External content').closest('.cursor-pointer') + if (item) + fireEvent.click(item) + + await waitFor(() => { + expect(screen.getByText(/chunkDetail/i)).toBeInTheDocument() + }) + }) + }) +}) + +// ============================================================================ +// Textarea Component Tests +// ============================================================================ + +describe('Textarea', () => { + const mockHandleTextChange = vi.fn() + + beforeEach(() => { + vi.clearAllMocks() + }) + + describe('Rendering', () => { + it('should render without crashing', () => { + render(