diff --git a/.agents/skills/component-refactoring/SKILL.md b/.agents/skills/component-refactoring/SKILL.md index 7006c382c8..140e0ef434 100644 --- a/.agents/skills/component-refactoring/SKILL.md +++ b/.agents/skills/component-refactoring/SKILL.md @@ -480,4 +480,4 @@ const useButtonState = () => { ### Related Skills - `frontend-testing` - For testing refactored components -- `web/testing/testing.md` - Testing specification +- `web/docs/test.md` - Testing specification diff --git a/.agents/skills/frontend-testing/SKILL.md b/.agents/skills/frontend-testing/SKILL.md index 0716c81ef7..280fcb6341 100644 --- a/.agents/skills/frontend-testing/SKILL.md +++ b/.agents/skills/frontend-testing/SKILL.md @@ -7,7 +7,7 @@ description: Generate Vitest + React Testing Library tests for Dify frontend com This skill enables Claude to generate high-quality, comprehensive frontend tests for the Dify project following established conventions and best practices. -> **⚠️ Authoritative Source**: This skill is derived from `web/testing/testing.md`. Use Vitest mock/timer APIs (`vi.*`). +> **⚠️ Authoritative Source**: This skill is derived from `web/docs/test.md`. Use Vitest mock/timer APIs (`vi.*`). ## When to Apply This Skill @@ -309,7 +309,7 @@ For more detailed information, refer to: ### Primary Specification (MUST follow) -- **`web/testing/testing.md`** - The canonical testing specification. This skill is derived from this document. +- **`web/docs/test.md`** - The canonical testing specification. This skill is derived from this document. ### Reference Examples in Codebase diff --git a/.agents/skills/frontend-testing/references/workflow.md b/.agents/skills/frontend-testing/references/workflow.md index 009c3e013b..bc4ed8285a 100644 --- a/.agents/skills/frontend-testing/references/workflow.md +++ b/.agents/skills/frontend-testing/references/workflow.md @@ -4,7 +4,7 @@ This guide defines the workflow for generating tests, especially for complex com ## Scope Clarification -This guide addresses **multi-file workflow** (how to process multiple test files). For coverage requirements within a single test file, see `web/testing/testing.md` § Coverage Goals. +This guide addresses **multi-file workflow** (how to process multiple test files). For coverage requirements within a single test file, see `web/docs/test.md` § Coverage Goals. | Scope | Rule | |-------|------| diff --git a/.github/workflows/api-tests.yml b/.github/workflows/api-tests.yml index 190e00d9fe..52e3272f99 100644 --- a/.github/workflows/api-tests.yml +++ b/.github/workflows/api-tests.yml @@ -72,6 +72,7 @@ jobs: OPENDAL_FS_ROOT: /tmp/dify-storage run: | uv run --project api pytest \ + -n auto \ --timeout "${PYTEST_TIMEOUT:-180}" \ api/tests/integration_tests/workflow \ api/tests/integration_tests/tools \ diff --git a/.github/workflows/style.yml b/.github/workflows/style.yml index fdc05d1d65..cbd6edf94b 100644 --- a/.github/workflows/style.yml +++ b/.github/workflows/style.yml @@ -47,13 +47,9 @@ jobs: if: steps.changed-files.outputs.any_changed == 'true' run: uv run --directory api --dev lint-imports - - name: Run Basedpyright Checks + - name: Run Type Checks if: steps.changed-files.outputs.any_changed == 'true' - run: dev/basedpyright-check - - - name: Run Mypy Type Checks - if: steps.changed-files.outputs.any_changed == 'true' - run: uv --directory api run mypy --exclude-gitignore --exclude 'tests/' --exclude 'migrations/' --check-untyped-defs --disable-error-code=import-untyped . + run: make type-check - name: Dotenv check if: steps.changed-files.outputs.any_changed == 'true' diff --git a/AGENTS.md b/AGENTS.md index 7d96ac3a6d..51fa6e4527 100644 --- a/AGENTS.md +++ b/AGENTS.md @@ -7,7 +7,7 @@ Dify is an open-source platform for developing LLM applications with an intuitiv The codebase is split into: - **Backend API** (`/api`): Python Flask application organized with Domain-Driven Design -- **Frontend Web** (`/web`): Next.js 15 application using TypeScript and React 19 +- **Frontend Web** (`/web`): Next.js application using TypeScript and React - **Docker deployment** (`/docker`): Containerized deployment configurations ## Backend Workflow @@ -18,36 +18,7 @@ The codebase is split into: ## Frontend Workflow -```bash -cd web -pnpm lint:fix -pnpm type-check:tsgo -pnpm test -``` - -### Frontend Linting - -ESLint is used for frontend code quality. Available commands: - -```bash -# Lint all files (report only) -pnpm lint - -# Lint and auto-fix issues -pnpm lint:fix - -# Lint specific files or directories -pnpm lint:fix app/components/base/button/ -pnpm lint:fix app/components/base/button/index.tsx - -# Lint quietly (errors only, no warnings) -pnpm lint:quiet - -# Check code complexity -pnpm lint:complexity -``` - -**Important**: Always run `pnpm lint:fix` before committing. The pre-commit hook runs `lint-staged` which only lints staged files. +- Read `web/AGENTS.md` for details ## Testing & Quality Practices diff --git a/CONTRIBUTING.md b/CONTRIBUTING.md index 20a7d6c6f6..d7f007af67 100644 --- a/CONTRIBUTING.md +++ b/CONTRIBUTING.md @@ -77,7 +77,7 @@ How we prioritize: For setting up the frontend service, please refer to our comprehensive [guide](https://github.com/langgenius/dify/blob/main/web/README.md) in the `web/README.md` file. This document provides detailed instructions to help you set up the frontend environment properly. -**Testing**: All React components must have comprehensive test coverage. See [web/testing/testing.md](https://github.com/langgenius/dify/blob/main/web/testing/testing.md) for the canonical frontend testing guidelines and follow every requirement described there. +**Testing**: All React components must have comprehensive test coverage. See [web/docs/test.md](https://github.com/langgenius/dify/blob/main/web/docs/test.md) for the canonical frontend testing guidelines and follow every requirement described there. #### Backend diff --git a/Makefile b/Makefile index e92a7b1314..984e8676ee 100644 --- a/Makefile +++ b/Makefile @@ -68,9 +68,11 @@ lint: @echo "✅ Linting complete" type-check: - @echo "📝 Running type check with basedpyright..." - @uv run --directory api --dev basedpyright - @echo "✅ Type check complete" + @echo "📝 Running type checks (basedpyright + mypy + ty)..." + @./dev/basedpyright-check $(PATH_TO_CHECK) + @uv --directory api run mypy --exclude-gitignore --exclude 'tests/' --exclude 'migrations/' --check-untyped-defs --disable-error-code=import-untyped . + @cd api && uv run ty check + @echo "✅ Type checks complete" test: @echo "🧪 Running backend unit tests..." @@ -78,7 +80,7 @@ test: echo "Target: $(TARGET_TESTS)"; \ uv run --project api --dev pytest $(TARGET_TESTS); \ else \ - uv run --project api --dev dev/pytest/pytest_unit_tests.sh; \ + PYTEST_XDIST_ARGS="-n auto" uv run --project api --dev dev/pytest/pytest_unit_tests.sh; \ fi @echo "✅ Tests complete" @@ -130,7 +132,7 @@ help: @echo " make format - Format code with ruff" @echo " make check - Check code with ruff" @echo " make lint - Format, fix, and lint code (ruff, imports, dotenv)" - @echo " make type-check - Run type checking with basedpyright" + @echo " make type-check - Run type checks (basedpyright, mypy, ty)" @echo " make test - Run backend unit tests (or TARGET_TESTS=./api/tests/)" @echo "" @echo "Docker Build Targets:" diff --git a/api/.env.example b/api/.env.example index b5dbfff238..6804ffb822 100644 --- a/api/.env.example +++ b/api/.env.example @@ -620,6 +620,7 @@ PLUGIN_DAEMON_URL=http://127.0.0.1:5002 PLUGIN_REMOTE_INSTALL_PORT=5003 PLUGIN_REMOTE_INSTALL_HOST=localhost PLUGIN_MAX_PACKAGE_SIZE=15728640 +PLUGIN_MODEL_SCHEMA_CACHE_TTL=3600 INNER_API_KEY_FOR_PLUGIN=QaHbTe77CtuXmsfyhR7+vRjI/+XbV1AaFy691iy+kGDv2Jvy0/eAh8Y1 # Marketplace configuration diff --git a/api/.importlinter b/api/.importlinter index 2b4a3a5bd6..9dad254560 100644 --- a/api/.importlinter +++ b/api/.importlinter @@ -227,6 +227,9 @@ ignore_imports = core.workflow.nodes.knowledge_index.entities -> core.rag.retrieval.retrieval_methods core.workflow.nodes.knowledge_index.knowledge_index_node -> core.rag.retrieval.retrieval_methods core.workflow.nodes.knowledge_index.knowledge_index_node -> models.dataset + core.workflow.nodes.knowledge_index.knowledge_index_node -> services.summary_index_service + core.workflow.nodes.knowledge_index.knowledge_index_node -> tasks.generate_summary_index_task + core.workflow.nodes.knowledge_index.knowledge_index_node -> core.rag.index_processor.processor.paragraph_index_processor core.workflow.nodes.knowledge_retrieval.knowledge_retrieval_node -> core.rag.retrieval.retrieval_methods core.workflow.nodes.llm.node -> models.dataset core.workflow.nodes.agent.agent_node -> core.tools.utils.message_transformer @@ -300,6 +303,58 @@ ignore_imports = core.workflow.nodes.agent.agent_node -> services core.workflow.nodes.tool.tool_node -> services +[importlinter:contract:model-runtime-no-internal-imports] +name = Model Runtime Internal Imports +type = forbidden +source_modules = + core.model_runtime +forbidden_modules = + configs + controllers + extensions + models + services + tasks + core.agent + core.app + core.base + core.callback_handler + core.datasource + core.db + core.entities + core.errors + core.extension + core.external_data_tool + core.file + core.helper + core.hosting_configuration + core.indexing_runner + core.llm_generator + core.logging + core.mcp + core.memory + core.model_manager + core.moderation + core.ops + core.plugin + core.prompt + core.provider_manager + core.rag + core.repositories + core.schemas + core.tools + core.trigger + core.variables + core.workflow +ignore_imports = + core.model_runtime.model_providers.__base.ai_model -> configs + core.model_runtime.model_providers.__base.ai_model -> extensions.ext_redis + core.model_runtime.model_providers.__base.large_language_model -> configs + core.model_runtime.model_providers.__base.text_embedding_model -> core.entities.embedding_type + core.model_runtime.model_providers.model_provider_factory -> configs + core.model_runtime.model_providers.model_provider_factory -> extensions.ext_redis + core.model_runtime.model_providers.model_provider_factory -> models.provider_ids + [importlinter:contract:rsc] name = RSC type = layers diff --git a/api/configs/feature/__init__.py b/api/configs/feature/__init__.py index 1792190e34..93c8c06ca5 100644 --- a/api/configs/feature/__init__.py +++ b/api/configs/feature/__init__.py @@ -243,6 +243,11 @@ class PluginConfig(BaseSettings): default=15728640 * 12, ) + PLUGIN_MODEL_SCHEMA_CACHE_TTL: PositiveInt = Field( + description="TTL in seconds for caching plugin model schemas in Redis", + default=60 * 60, + ) + class CliApiConfig(BaseSettings): """ diff --git a/api/contexts/__init__.py b/api/contexts/__init__.py index 7c16bc231f..c52dcf8a57 100644 --- a/api/contexts/__init__.py +++ b/api/contexts/__init__.py @@ -6,7 +6,6 @@ from contexts.wrapper import RecyclableContextVar if TYPE_CHECKING: from core.datasource.__base.datasource_provider import DatasourcePluginProviderController - from core.model_runtime.entities.model_entities import AIModelEntity from core.plugin.entities.plugin_daemon import PluginModelProviderEntity from core.tools.plugin_tool.provider import PluginToolProviderController from core.trigger.provider import PluginTriggerProviderController @@ -29,12 +28,6 @@ plugin_model_providers_lock: RecyclableContextVar[Lock] = RecyclableContextVar( ContextVar("plugin_model_providers_lock") ) -plugin_model_schema_lock: RecyclableContextVar[Lock] = RecyclableContextVar(ContextVar("plugin_model_schema_lock")) - -plugin_model_schemas: RecyclableContextVar[dict[str, "AIModelEntity"]] = RecyclableContextVar( - ContextVar("plugin_model_schemas") -) - datasource_plugin_providers: RecyclableContextVar[dict[str, "DatasourcePluginProviderController"]] = ( RecyclableContextVar(ContextVar("datasource_plugin_providers")) ) diff --git a/api/controllers/console/admin.py b/api/controllers/console/admin.py index e1ee2c24b8..03b602f6e8 100644 --- a/api/controllers/console/admin.py +++ b/api/controllers/console/admin.py @@ -243,15 +243,13 @@ class InsertExploreBannerApi(Resource): def post(self): payload = InsertExploreBannerPayload.model_validate(console_ns.payload) - content = { - "category": payload.category, - "title": payload.title, - "description": payload.description, - "img-src": payload.img_src, - } - banner = ExporleBanner( - content=content, + content={ + "category": payload.category, + "title": payload.title, + "description": payload.description, + "img-src": payload.img_src, + }, link=payload.link, sort=payload.sort, language=payload.language, diff --git a/api/controllers/console/app/app_import.py b/api/controllers/console/app/app_import.py index 362291d779..092b346975 100644 --- a/api/controllers/console/app/app_import.py +++ b/api/controllers/console/app/app_import.py @@ -51,7 +51,7 @@ class AppImportPayload(BaseModel): app_id: str | None = Field(None) -class AppImportBundlePayload(BaseModel): +class AppImportBundleConfirmPayload(BaseModel): name: str | None = None description: str | None = None icon_type: str | None = None @@ -149,15 +149,38 @@ class AppImportCheckDependenciesApi(Resource): return result.model_dump(mode="json"), 200 -@console_ns.route("/apps/imports-bundle") -class AppImportBundleApi(Resource): +@console_ns.route("/apps/imports-bundle/prepare") +class AppImportBundlePrepareApi(Resource): + """Step 1: Get upload URL for bundle import.""" + + @setup_required + @login_required + @account_initialization_required + @edit_permission_required + def post(self): + from services.app_bundle_service import AppBundleService + + current_user, current_tenant_id = current_account_with_tenant() + + result = AppBundleService.prepare_import( + tenant_id=current_tenant_id, + account_id=current_user.id, + ) + + return {"import_id": result.import_id, "upload_url": result.upload_url}, 200 + + +@console_ns.route("/apps/imports-bundle//confirm") +class AppImportBundleConfirmApi(Resource): + """Step 2: Confirm bundle import after upload.""" + @setup_required @login_required @account_initialization_required @marshal_with(app_import_model) @cloud_edition_billing_resource_check("apps") @edit_permission_required - def post(self): + def post(self, import_id: str): from flask import request from core.app.entities.app_bundle_entities import BundleFormatError @@ -165,22 +188,12 @@ class AppImportBundleApi(Resource): current_user, _ = current_account_with_tenant() - if "file" not in request.files: - return {"error": "No file provided"}, 400 - - file = request.files["file"] - if not file.filename or not file.filename.endswith(".zip"): - return {"error": "Invalid file format, expected .zip"}, 400 - - zip_bytes = file.read() - - form_data = request.form.to_dict() - args = AppImportBundlePayload.model_validate(form_data) + args = AppImportBundleConfirmPayload.model_validate(request.get_json() or {}) try: - result = AppBundleService.import_bundle( + result = AppBundleService.confirm_import( + import_id=import_id, account=current_user, - zip_bytes=zip_bytes, name=args.name, description=args.description, icon_type=args.icon_type, diff --git a/api/controllers/console/app/generator.py b/api/controllers/console/app/generator.py index 63870f8038..d14dd52e4e 100644 --- a/api/controllers/console/app/generator.py +++ b/api/controllers/console/app/generator.py @@ -70,9 +70,7 @@ class ContextGeneratePayload(BaseModel): model_config_data: dict[str, Any] = Field(..., alias="model_config", description="Model configuration") available_vars: list[AvailableVarPayload] = Field(..., description="Available variables from upstream nodes") parameter_info: ParameterInfoPayload = Field(..., description="Target parameter metadata from the frontend") - code_context: CodeContextPayload = Field( - description="Existing code node context for incremental generation" - ) + code_context: CodeContextPayload = Field(description="Existing code node context for incremental generation") class SuggestedQuestionsPayload(BaseModel): diff --git a/api/controllers/console/datasets/datasets.py b/api/controllers/console/datasets/datasets.py index 8fbbc51e21..30e4ed1119 100644 --- a/api/controllers/console/datasets/datasets.py +++ b/api/controllers/console/datasets/datasets.py @@ -148,6 +148,7 @@ class DatasetUpdatePayload(BaseModel): embedding_model: str | None = None embedding_model_provider: str | None = None retrieval_model: dict[str, Any] | None = None + summary_index_setting: dict[str, Any] | None = None partial_member_list: list[dict[str, str]] | None = None external_retrieval_model: dict[str, Any] | None = None external_knowledge_id: str | None = None @@ -288,7 +289,14 @@ class DatasetListApi(Resource): @enterprise_license_required def get(self): current_user, current_tenant_id = current_account_with_tenant() - query = ConsoleDatasetListQuery.model_validate(request.args.to_dict()) + # Convert query parameters to dict, handling list parameters correctly + query_params: dict[str, str | list[str]] = dict(request.args.to_dict()) + # Handle ids and tag_ids as lists (Flask request.args.getlist returns list even for single value) + if "ids" in request.args: + query_params["ids"] = request.args.getlist("ids") + if "tag_ids" in request.args: + query_params["tag_ids"] = request.args.getlist("tag_ids") + query = ConsoleDatasetListQuery.model_validate(query_params) # provider = request.args.get("provider", default="vendor") if query.ids: datasets, total = DatasetService.get_datasets_by_ids(query.ids, current_tenant_id) diff --git a/api/controllers/console/datasets/datasets_document.py b/api/controllers/console/datasets/datasets_document.py index 57fb9abf29..6e3c0db8a3 100644 --- a/api/controllers/console/datasets/datasets_document.py +++ b/api/controllers/console/datasets/datasets_document.py @@ -45,6 +45,7 @@ from models.dataset import DocumentPipelineExecutionLog from services.dataset_service import DatasetService, DocumentService from services.entities.knowledge_entities.knowledge_entities import KnowledgeConfig, ProcessRule, RetrievalModel from services.file_service import FileService +from tasks.generate_summary_index_task import generate_summary_index_task from ..app.error import ( ProviderModelCurrentlyNotSupportError, @@ -103,6 +104,10 @@ class DocumentRenamePayload(BaseModel): name: str +class GenerateSummaryPayload(BaseModel): + document_list: list[str] + + class DocumentBatchDownloadZipPayload(BaseModel): """Request payload for bulk downloading documents as a zip archive.""" @@ -125,6 +130,7 @@ register_schema_models( RetrievalModel, DocumentRetryPayload, DocumentRenamePayload, + GenerateSummaryPayload, DocumentBatchDownloadZipPayload, ) @@ -312,6 +318,13 @@ class DatasetDocumentListApi(Resource): paginated_documents = db.paginate(select=query, page=page, per_page=limit, max_per_page=100, error_out=False) documents = paginated_documents.items + + DocumentService.enrich_documents_with_summary_index_status( + documents=documents, + dataset=dataset, + tenant_id=current_tenant_id, + ) + if fetch: for document in documents: completed_segments = ( @@ -797,6 +810,7 @@ class DocumentApi(DocumentResource): "display_status": document.display_status, "doc_form": document.doc_form, "doc_language": document.doc_language, + "need_summary": document.need_summary if document.need_summary is not None else False, } else: dataset_process_rules = DatasetService.get_process_rules(dataset_id) @@ -832,6 +846,7 @@ class DocumentApi(DocumentResource): "display_status": document.display_status, "doc_form": document.doc_form, "doc_language": document.doc_language, + "need_summary": document.need_summary if document.need_summary is not None else False, } return response, 200 @@ -1255,3 +1270,137 @@ class DocumentPipelineExecutionLogApi(DocumentResource): "input_data": log.input_data, "datasource_node_id": log.datasource_node_id, }, 200 + + +@console_ns.route("/datasets//documents/generate-summary") +class DocumentGenerateSummaryApi(Resource): + @console_ns.doc("generate_summary_for_documents") + @console_ns.doc(description="Generate summary index for documents") + @console_ns.doc(params={"dataset_id": "Dataset ID"}) + @console_ns.expect(console_ns.models[GenerateSummaryPayload.__name__]) + @console_ns.response(200, "Summary generation started successfully") + @console_ns.response(400, "Invalid request or dataset configuration") + @console_ns.response(403, "Permission denied") + @console_ns.response(404, "Dataset not found") + @setup_required + @login_required + @account_initialization_required + @cloud_edition_billing_rate_limit_check("knowledge") + def post(self, dataset_id): + """ + Generate summary index for specified documents. + + This endpoint checks if the dataset configuration supports summary generation + (indexing_technique must be 'high_quality' and summary_index_setting.enable must be true), + then asynchronously generates summary indexes for the provided documents. + """ + current_user, _ = current_account_with_tenant() + dataset_id = str(dataset_id) + + # Get dataset + dataset = DatasetService.get_dataset(dataset_id) + if not dataset: + raise NotFound("Dataset not found.") + + # Check permissions + if not current_user.is_dataset_editor: + raise Forbidden() + + try: + DatasetService.check_dataset_permission(dataset, current_user) + except services.errors.account.NoPermissionError as e: + raise Forbidden(str(e)) + + # Validate request payload + payload = GenerateSummaryPayload.model_validate(console_ns.payload or {}) + document_list = payload.document_list + + if not document_list: + from werkzeug.exceptions import BadRequest + + raise BadRequest("document_list cannot be empty.") + + # Check if dataset configuration supports summary generation + if dataset.indexing_technique != "high_quality": + raise ValueError( + f"Summary generation is only available for 'high_quality' indexing technique. " + f"Current indexing technique: {dataset.indexing_technique}" + ) + + summary_index_setting = dataset.summary_index_setting + if not summary_index_setting or not summary_index_setting.get("enable"): + raise ValueError("Summary index is not enabled for this dataset. Please enable it in the dataset settings.") + + # Verify all documents exist and belong to the dataset + documents = DocumentService.get_documents_by_ids(dataset_id, document_list) + + if len(documents) != len(document_list): + found_ids = {doc.id for doc in documents} + missing_ids = set(document_list) - found_ids + raise NotFound(f"Some documents not found: {list(missing_ids)}") + + # Dispatch async tasks for each document + for document in documents: + # Skip qa_model documents as they don't generate summaries + if document.doc_form == "qa_model": + logger.info("Skipping summary generation for qa_model document %s", document.id) + continue + + # Dispatch async task + generate_summary_index_task.delay(dataset_id, document.id) + logger.info( + "Dispatched summary generation task for document %s in dataset %s", + document.id, + dataset_id, + ) + + return {"result": "success"}, 200 + + +@console_ns.route("/datasets//documents//summary-status") +class DocumentSummaryStatusApi(DocumentResource): + @console_ns.doc("get_document_summary_status") + @console_ns.doc(description="Get summary index generation status for a document") + @console_ns.doc(params={"dataset_id": "Dataset ID", "document_id": "Document ID"}) + @console_ns.response(200, "Summary status retrieved successfully") + @console_ns.response(404, "Document not found") + @setup_required + @login_required + @account_initialization_required + def get(self, dataset_id, document_id): + """ + Get summary index generation status for a document. + + Returns: + - total_segments: Total number of segments in the document + - summary_status: Dictionary with status counts + - completed: Number of summaries completed + - generating: Number of summaries being generated + - error: Number of summaries with errors + - not_started: Number of segments without summary records + - summaries: List of summary records with status and content preview + """ + current_user, _ = current_account_with_tenant() + dataset_id = str(dataset_id) + document_id = str(document_id) + + # Get dataset + dataset = DatasetService.get_dataset(dataset_id) + if not dataset: + raise NotFound("Dataset not found.") + + # Check permissions + try: + DatasetService.check_dataset_permission(dataset, current_user) + except services.errors.account.NoPermissionError as e: + raise Forbidden(str(e)) + + # Get summary status detail from service + from services.summary_index_service import SummaryIndexService + + result = SummaryIndexService.get_document_summary_status_detail( + document_id=document_id, + dataset_id=dataset_id, + ) + + return result, 200 diff --git a/api/controllers/console/datasets/datasets_segments.py b/api/controllers/console/datasets/datasets_segments.py index 08e1ddd3e0..23a668112d 100644 --- a/api/controllers/console/datasets/datasets_segments.py +++ b/api/controllers/console/datasets/datasets_segments.py @@ -41,6 +41,17 @@ from services.errors.chunk import ChildChunkIndexingError as ChildChunkIndexingS from tasks.batch_create_segment_to_index_task import batch_create_segment_to_index_task +def _get_segment_with_summary(segment, dataset_id): + """Helper function to marshal segment and add summary information.""" + from services.summary_index_service import SummaryIndexService + + segment_dict = dict(marshal(segment, segment_fields)) + # Query summary for this segment (only enabled summaries) + summary = SummaryIndexService.get_segment_summary(segment_id=segment.id, dataset_id=dataset_id) + segment_dict["summary"] = summary.summary_content if summary else None + return segment_dict + + class SegmentListQuery(BaseModel): limit: int = Field(default=20, ge=1, le=100) status: list[str] = Field(default_factory=list) @@ -63,6 +74,7 @@ class SegmentUpdatePayload(BaseModel): keywords: list[str] | None = None regenerate_child_chunks: bool = False attachment_ids: list[str] | None = None + summary: str | None = None # Summary content for summary index class BatchImportPayload(BaseModel): @@ -181,8 +193,25 @@ class DatasetDocumentSegmentListApi(Resource): segments = db.paginate(select=query, page=page, per_page=limit, max_per_page=100, error_out=False) + # Query summaries for all segments in this page (batch query for efficiency) + segment_ids = [segment.id for segment in segments.items] + summaries = {} + if segment_ids: + from services.summary_index_service import SummaryIndexService + + summary_records = SummaryIndexService.get_segments_summaries(segment_ids=segment_ids, dataset_id=dataset_id) + # Only include enabled summaries (already filtered by service) + summaries = {chunk_id: summary.summary_content for chunk_id, summary in summary_records.items()} + + # Add summary to each segment + segments_with_summary = [] + for segment in segments.items: + segment_dict = dict(marshal(segment, segment_fields)) + segment_dict["summary"] = summaries.get(segment.id) + segments_with_summary.append(segment_dict) + response = { - "data": marshal(segments.items, segment_fields), + "data": segments_with_summary, "limit": limit, "total": segments.total, "total_pages": segments.pages, @@ -328,7 +357,7 @@ class DatasetDocumentSegmentAddApi(Resource): payload_dict = payload.model_dump(exclude_none=True) SegmentService.segment_create_args_validate(payload_dict, document) segment = SegmentService.create_segment(payload_dict, document, dataset) - return {"data": marshal(segment, segment_fields), "doc_form": document.doc_form}, 200 + return {"data": _get_segment_with_summary(segment, dataset_id), "doc_form": document.doc_form}, 200 @console_ns.route("/datasets//documents//segments/") @@ -390,10 +419,12 @@ class DatasetDocumentSegmentUpdateApi(Resource): payload = SegmentUpdatePayload.model_validate(console_ns.payload or {}) payload_dict = payload.model_dump(exclude_none=True) SegmentService.segment_create_args_validate(payload_dict, document) + + # Update segment (summary update with change detection is handled in SegmentService.update_segment) segment = SegmentService.update_segment( SegmentUpdateArgs.model_validate(payload.model_dump(exclude_none=True)), segment, document, dataset ) - return {"data": marshal(segment, segment_fields), "doc_form": document.doc_form}, 200 + return {"data": _get_segment_with_summary(segment, dataset_id), "doc_form": document.doc_form}, 200 @setup_required @login_required diff --git a/api/controllers/console/datasets/hit_testing.py b/api/controllers/console/datasets/hit_testing.py index 932cb4fcce..e62be13c2f 100644 --- a/api/controllers/console/datasets/hit_testing.py +++ b/api/controllers/console/datasets/hit_testing.py @@ -1,6 +1,13 @@ -from flask_restx import Resource +from flask_restx import Resource, fields from controllers.common.schema import register_schema_model +from fields.hit_testing_fields import ( + child_chunk_fields, + document_fields, + files_fields, + hit_testing_record_fields, + segment_fields, +) from libs.login import login_required from .. import console_ns @@ -14,13 +21,45 @@ from ..wraps import ( register_schema_model(console_ns, HitTestingPayload) +def _get_or_create_model(model_name: str, field_def): + """Get or create a flask_restx model to avoid dict type issues in Swagger.""" + existing = console_ns.models.get(model_name) + if existing is None: + existing = console_ns.model(model_name, field_def) + return existing + + +# Register models for flask_restx to avoid dict type issues in Swagger +document_model = _get_or_create_model("HitTestingDocument", document_fields) + +segment_fields_copy = segment_fields.copy() +segment_fields_copy["document"] = fields.Nested(document_model) +segment_model = _get_or_create_model("HitTestingSegment", segment_fields_copy) + +child_chunk_model = _get_or_create_model("HitTestingChildChunk", child_chunk_fields) +files_model = _get_or_create_model("HitTestingFile", files_fields) + +hit_testing_record_fields_copy = hit_testing_record_fields.copy() +hit_testing_record_fields_copy["segment"] = fields.Nested(segment_model) +hit_testing_record_fields_copy["child_chunks"] = fields.List(fields.Nested(child_chunk_model)) +hit_testing_record_fields_copy["files"] = fields.List(fields.Nested(files_model)) +hit_testing_record_model = _get_or_create_model("HitTestingRecord", hit_testing_record_fields_copy) + +# Response model for hit testing API +hit_testing_response_fields = { + "query": fields.String, + "records": fields.List(fields.Nested(hit_testing_record_model)), +} +hit_testing_response_model = _get_or_create_model("HitTestingResponse", hit_testing_response_fields) + + @console_ns.route("/datasets//hit-testing") class HitTestingApi(Resource, DatasetsHitTestingBase): @console_ns.doc("test_dataset_retrieval") @console_ns.doc(description="Test dataset knowledge retrieval") @console_ns.doc(params={"dataset_id": "Dataset ID"}) @console_ns.expect(console_ns.models[HitTestingPayload.__name__]) - @console_ns.response(200, "Hit testing completed successfully") + @console_ns.response(200, "Hit testing completed successfully", model=hit_testing_response_model) @console_ns.response(404, "Dataset not found") @console_ns.response(400, "Invalid parameters") @setup_required diff --git a/api/controllers/files/__init__.py b/api/controllers/files/__init__.py index 77eb012c7c..1c0a270542 100644 --- a/api/controllers/files/__init__.py +++ b/api/controllers/files/__init__.py @@ -15,12 +15,8 @@ api = ExternalApi( files_ns = Namespace("files", description="File operations", path="/") from . import ( - app_assets_download, - app_assets_upload, image_preview, - sandbox_archive, - sandbox_file_downloads, - storage_download, + storage_files, tool_files, upload, ) @@ -29,14 +25,10 @@ api.add_namespace(files_ns) __all__ = [ "api", - "app_assets_download", - "app_assets_upload", "bp", "files_ns", "image_preview", - "sandbox_archive", - "sandbox_file_downloads", - "storage_download", + "storage_files", "tool_files", "upload", ] diff --git a/api/controllers/files/app_assets_download.py b/api/controllers/files/app_assets_download.py deleted file mode 100644 index 1d829a5671..0000000000 --- a/api/controllers/files/app_assets_download.py +++ /dev/null @@ -1,77 +0,0 @@ -from urllib.parse import quote - -from flask import Response, request -from flask_restx import Resource -from pydantic import BaseModel, Field -from werkzeug.exceptions import Forbidden, NotFound - -from controllers.files import files_ns -from core.app_assets.storage import AppAssetSigner, AssetPath -from extensions.ext_storage import storage - -DEFAULT_REF_TEMPLATE_SWAGGER_2_0 = "#/definitions/{model}" - - -class AppAssetDownloadQuery(BaseModel): - expires_at: int = Field(..., description="Unix timestamp when the link expires") - nonce: str = Field(..., description="Random string for signature") - sign: str = Field(..., description="HMAC signature") - - -files_ns.schema_model( - AppAssetDownloadQuery.__name__, - AppAssetDownloadQuery.model_json_schema(ref_template=DEFAULT_REF_TEMPLATE_SWAGGER_2_0), -) - - -@files_ns.route("/app-assets/////download") -@files_ns.route( - "/app-assets//////download" -) -class AppAssetDownloadApi(Resource): - def get( - self, - asset_type: str, - tenant_id: str, - app_id: str, - resource_id: str, - sub_resource_id: str | None = None, - ): - args = AppAssetDownloadQuery.model_validate(request.args.to_dict(flat=True)) - - try: - asset_path = AssetPath.from_components( - asset_type=asset_type, - tenant_id=tenant_id, - app_id=app_id, - resource_id=resource_id, - sub_resource_id=sub_resource_id, - ) - except ValueError as exc: - raise Forbidden(str(exc)) from exc - - if not AppAssetSigner.verify_download_signature( - asset_path=asset_path, - expires_at=args.expires_at, - nonce=args.nonce, - sign=args.sign, - ): - raise Forbidden("Invalid or expired download link") - - storage_key = asset_path.get_storage_key() - - try: - generator = storage.load_stream(storage_key) - except FileNotFoundError as exc: - raise NotFound("File not found") from exc - - encoded_filename = quote(storage_key.split("/")[-1]) - - return Response( - generator, - mimetype="application/octet-stream", - direct_passthrough=True, - headers={ - "Content-Disposition": f"attachment; filename*=UTF-8''{encoded_filename}", - }, - ) diff --git a/api/controllers/files/app_assets_upload.py b/api/controllers/files/app_assets_upload.py deleted file mode 100644 index c25bc5e100..0000000000 --- a/api/controllers/files/app_assets_upload.py +++ /dev/null @@ -1,61 +0,0 @@ -from flask import Response, request -from flask_restx import Resource -from pydantic import BaseModel, Field -from werkzeug.exceptions import Forbidden - -from controllers.files import files_ns -from core.app_assets.storage import AppAssetSigner, AssetPath -from services.app_asset_service import AppAssetService - -DEFAULT_REF_TEMPLATE_SWAGGER_2_0 = "#/definitions/{model}" - - -class AppAssetUploadQuery(BaseModel): - expires_at: int = Field(..., description="Unix timestamp when the link expires") - nonce: str = Field(..., description="Random string for signature") - sign: str = Field(..., description="HMAC signature") - - -files_ns.schema_model( - AppAssetUploadQuery.__name__, - AppAssetUploadQuery.model_json_schema(ref_template=DEFAULT_REF_TEMPLATE_SWAGGER_2_0), -) - - -@files_ns.route("/app-assets/////upload") -@files_ns.route( - "/app-assets//////upload" -) -class AppAssetUploadApi(Resource): - def put( - self, - asset_type: str, - tenant_id: str, - app_id: str, - resource_id: str, - sub_resource_id: str | None = None, - ): - args = AppAssetUploadQuery.model_validate(request.args.to_dict(flat=True)) - - try: - asset_path = AssetPath.from_components( - asset_type=asset_type, - tenant_id=tenant_id, - app_id=app_id, - resource_id=resource_id, - sub_resource_id=sub_resource_id, - ) - except ValueError as exc: - raise Forbidden(str(exc)) from exc - - if not AppAssetSigner.verify_upload_signature( - asset_path=asset_path, - expires_at=args.expires_at, - nonce=args.nonce, - sign=args.sign, - ): - raise Forbidden("Invalid or expired upload link") - - content = request.get_data() - AppAssetService.get_storage().save(asset_path, content) - return Response(status=204) diff --git a/api/controllers/files/sandbox_archive.py b/api/controllers/files/sandbox_archive.py deleted file mode 100644 index 4f5e591a08..0000000000 --- a/api/controllers/files/sandbox_archive.py +++ /dev/null @@ -1,76 +0,0 @@ -from uuid import UUID - -from flask import Response, request -from flask_restx import Resource -from pydantic import BaseModel, Field -from werkzeug.exceptions import Forbidden, NotFound - -from controllers.files import files_ns -from core.sandbox.security.archive_signer import SandboxArchivePath, SandboxArchiveSigner -from extensions.ext_storage import storage - -DEFAULT_REF_TEMPLATE_SWAGGER_2_0 = "#/definitions/{model}" - - -class SandboxArchiveQuery(BaseModel): - expires_at: int = Field(..., description="Unix timestamp when the link expires") - nonce: str = Field(..., description="Random string for signature") - sign: str = Field(..., description="HMAC signature") - - -files_ns.schema_model( - SandboxArchiveQuery.__name__, - SandboxArchiveQuery.model_json_schema(ref_template=DEFAULT_REF_TEMPLATE_SWAGGER_2_0), -) - - -@files_ns.route("/sandbox-archives///download") -class SandboxArchiveDownloadApi(Resource): - def get(self, tenant_id: str, sandbox_id: str): - args = SandboxArchiveQuery.model_validate(request.args.to_dict(flat=True)) - - try: - archive_path = SandboxArchivePath(tenant_id=UUID(tenant_id), sandbox_id=UUID(sandbox_id)) - except ValueError as exc: - raise Forbidden(str(exc)) from exc - - if not SandboxArchiveSigner.verify_download_signature( - archive_path=archive_path, - expires_at=args.expires_at, - nonce=args.nonce, - sign=args.sign, - ): - raise Forbidden("Invalid or expired download link") - - try: - generator = storage.load_stream(archive_path.get_storage_key()) - except FileNotFoundError as exc: - raise NotFound("Archive not found") from exc - - return Response( - generator, - mimetype="application/gzip", - direct_passthrough=True, - ) - - -@files_ns.route("/sandbox-archives///upload") -class SandboxArchiveUploadApi(Resource): - def put(self, tenant_id: str, sandbox_id: str): - args = SandboxArchiveQuery.model_validate(request.args.to_dict(flat=True)) - - try: - archive_path = SandboxArchivePath(tenant_id=UUID(tenant_id), sandbox_id=UUID(sandbox_id)) - except ValueError as exc: - raise Forbidden(str(exc)) from exc - - if not SandboxArchiveSigner.verify_upload_signature( - archive_path=archive_path, - expires_at=args.expires_at, - nonce=args.nonce, - sign=args.sign, - ): - raise Forbidden("Invalid or expired upload link") - - storage.save(archive_path.get_storage_key(), request.get_data()) - return Response(status=204) diff --git a/api/controllers/files/sandbox_file_downloads.py b/api/controllers/files/sandbox_file_downloads.py deleted file mode 100644 index 7f021d4493..0000000000 --- a/api/controllers/files/sandbox_file_downloads.py +++ /dev/null @@ -1,96 +0,0 @@ -from urllib.parse import quote -from uuid import UUID - -from flask import Response, request -from flask_restx import Resource -from pydantic import BaseModel, Field -from werkzeug.exceptions import Forbidden, NotFound - -from controllers.files import files_ns -from core.sandbox.security.sandbox_file_signer import SandboxFileDownloadPath, SandboxFileSigner -from extensions.ext_storage import storage - -DEFAULT_REF_TEMPLATE_SWAGGER_2_0 = "#/definitions/{model}" - - -class SandboxFileDownloadQuery(BaseModel): - expires_at: int = Field(..., description="Unix timestamp when the link expires") - nonce: str = Field(..., description="Random string for signature") - sign: str = Field(..., description="HMAC signature") - - -files_ns.schema_model( - SandboxFileDownloadQuery.__name__, - SandboxFileDownloadQuery.model_json_schema(ref_template=DEFAULT_REF_TEMPLATE_SWAGGER_2_0), -) - - -@files_ns.route( - "/sandbox-file-downloads/////download" -) -class SandboxFileDownloadDownloadApi(Resource): - def get(self, tenant_id: str, sandbox_id: str, export_id: str, filename: str): - args = SandboxFileDownloadQuery.model_validate(request.args.to_dict(flat=True)) - - try: - export_path = SandboxFileDownloadPath( - tenant_id=UUID(tenant_id), - sandbox_id=UUID(sandbox_id), - export_id=export_id, - filename=filename, - ) - except ValueError as exc: - raise Forbidden(str(exc)) from exc - - if not SandboxFileSigner.verify_download_signature( - export_path=export_path, - expires_at=args.expires_at, - nonce=args.nonce, - sign=args.sign, - ): - raise Forbidden("Invalid or expired download link") - - try: - generator = storage.load_stream(export_path.get_storage_key()) - except FileNotFoundError as exc: - raise NotFound("File not found") from exc - - encoded_filename = quote(filename.split("/")[-1]) - - return Response( - generator, - mimetype="application/octet-stream", - direct_passthrough=True, - headers={ - "Content-Disposition": f"attachment; filename*=UTF-8''{encoded_filename}", - }, - ) - - -@files_ns.route( - "/sandbox-file-downloads/////upload" -) -class SandboxFileDownloadUploadApi(Resource): - def put(self, tenant_id: str, sandbox_id: str, export_id: str, filename: str): - args = SandboxFileDownloadQuery.model_validate(request.args.to_dict(flat=True)) - - try: - export_path = SandboxFileDownloadPath( - tenant_id=UUID(tenant_id), - sandbox_id=UUID(sandbox_id), - export_id=export_id, - filename=filename, - ) - except ValueError as exc: - raise Forbidden(str(exc)) from exc - - if not SandboxFileSigner.verify_upload_signature( - export_path=export_path, - expires_at=args.expires_at, - nonce=args.nonce, - sign=args.sign, - ): - raise Forbidden("Invalid or expired upload link") - - storage.save(export_path.get_storage_key(), request.get_data()) - return Response(status=204) diff --git a/api/controllers/files/storage_download.py b/api/controllers/files/storage_download.py deleted file mode 100644 index dfa6193a80..0000000000 --- a/api/controllers/files/storage_download.py +++ /dev/null @@ -1,56 +0,0 @@ -from urllib.parse import quote, unquote - -from flask import Response, request -from flask_restx import Resource -from pydantic import BaseModel, Field -from werkzeug.exceptions import Forbidden, NotFound - -from controllers.files import files_ns -from extensions.ext_storage import storage -from extensions.storage.file_presign_storage import FilePresignStorage - -DEFAULT_REF_TEMPLATE_SWAGGER_2_0 = "#/definitions/{model}" - - -class StorageDownloadQuery(BaseModel): - timestamp: str = Field(..., description="Unix timestamp used in the signature") - nonce: str = Field(..., description="Random string for signature") - sign: str = Field(..., description="HMAC signature") - - -files_ns.schema_model( - StorageDownloadQuery.__name__, - StorageDownloadQuery.model_json_schema(ref_template=DEFAULT_REF_TEMPLATE_SWAGGER_2_0), -) - - -@files_ns.route("/storage//download") -class StorageFileDownloadApi(Resource): - def get(self, filename: str): - filename = unquote(filename) - - args = StorageDownloadQuery.model_validate(request.args.to_dict(flat=True)) - - if not FilePresignStorage.verify_signature( - filename=filename, - timestamp=args.timestamp, - nonce=args.nonce, - sign=args.sign, - ): - raise Forbidden("Invalid or expired download link") - - try: - generator = storage.load_stream(filename) - except FileNotFoundError: - raise NotFound("File not found") - - encoded_filename = quote(filename.split("/")[-1]) - - return Response( - generator, - mimetype="application/octet-stream", - direct_passthrough=True, - headers={ - "Content-Disposition": f"attachment; filename*=UTF-8''{encoded_filename}", - }, - ) diff --git a/api/controllers/files/storage_files.py b/api/controllers/files/storage_files.py new file mode 100644 index 0000000000..1623395e9b --- /dev/null +++ b/api/controllers/files/storage_files.py @@ -0,0 +1,80 @@ +"""Token-based file proxy controller for storage operations. + +This controller handles file download and upload operations using opaque UUID tokens. +The token maps to the real storage key in Redis, so the actual storage path is never +exposed in the URL. + +Routes: + GET /files/storage-files/{token} - Download a file + PUT /files/storage-files/{token} - Upload a file + +The operation type (download/upload) is determined by the ticket stored in Redis, +not by the HTTP method. This ensures a download ticket cannot be used for upload +and vice versa. +""" + +from urllib.parse import quote + +from flask import Response, request +from flask_restx import Resource +from werkzeug.exceptions import Forbidden, NotFound, RequestEntityTooLarge + +from controllers.files import files_ns +from extensions.ext_storage import storage +from services.storage_ticket_service import StorageTicketService + + +@files_ns.route("/storage-files/") +class StorageFilesApi(Resource): + """Handle file operations through token-based URLs.""" + + def get(self, token: str): + """Download a file using a token. + + The ticket must have op="download", otherwise returns 403. + """ + ticket = StorageTicketService.get_ticket(token) + if ticket is None: + raise Forbidden("Invalid or expired token") + + if ticket.op != "download": + raise Forbidden("This token is not valid for download") + + try: + generator = storage.load_stream(ticket.storage_key) + except FileNotFoundError: + raise NotFound("File not found") + + filename = ticket.filename or ticket.storage_key.rsplit("/", 1)[-1] + encoded_filename = quote(filename) + + return Response( + generator, + mimetype="application/octet-stream", + direct_passthrough=True, + headers={ + "Content-Disposition": f"attachment; filename*=UTF-8''{encoded_filename}", + }, + ) + + def put(self, token: str): + """Upload a file using a token. + + The ticket must have op="upload", otherwise returns 403. + If the request body exceeds max_bytes, returns 413. + """ + ticket = StorageTicketService.get_ticket(token) + if ticket is None: + raise Forbidden("Invalid or expired token") + + if ticket.op != "upload": + raise Forbidden("This token is not valid for upload") + + content = request.get_data() + + if ticket.max_bytes is not None and len(content) > ticket.max_bytes: + raise RequestEntityTooLarge(f"Upload exceeds maximum size of {ticket.max_bytes} bytes") + + storage.save(ticket.storage_key, content) + + return Response(status=204) diff --git a/api/controllers/service_api/dataset/dataset.py b/api/controllers/service_api/dataset/dataset.py index 28864a140a..c11f64585a 100644 --- a/api/controllers/service_api/dataset/dataset.py +++ b/api/controllers/service_api/dataset/dataset.py @@ -46,6 +46,7 @@ class DatasetCreatePayload(BaseModel): retrieval_model: RetrievalModel | None = None embedding_model: str | None = None embedding_model_provider: str | None = None + summary_index_setting: dict | None = None class DatasetUpdatePayload(BaseModel): @@ -217,6 +218,7 @@ class DatasetListApi(DatasetApiResource): embedding_model_provider=payload.embedding_model_provider, embedding_model_name=payload.embedding_model, retrieval_model=payload.retrieval_model, + summary_index_setting=payload.summary_index_setting, ) except services.errors.dataset.DatasetNameDuplicateError: raise DatasetNameDuplicateError() diff --git a/api/controllers/service_api/dataset/document.py b/api/controllers/service_api/dataset/document.py index c85c1cf81e..a01524f1bc 100644 --- a/api/controllers/service_api/dataset/document.py +++ b/api/controllers/service_api/dataset/document.py @@ -45,6 +45,7 @@ from services.entities.knowledge_entities.knowledge_entities import ( Segmentation, ) from services.file_service import FileService +from services.summary_index_service import SummaryIndexService class DocumentTextCreatePayload(BaseModel): @@ -508,6 +509,12 @@ class DocumentListApi(DatasetApiResource): ) documents = paginated_documents.items + DocumentService.enrich_documents_with_summary_index_status( + documents=documents, + dataset=dataset, + tenant_id=tenant_id, + ) + response = { "data": marshal(documents, document_fields), "has_more": len(documents) == query_params.limit, @@ -612,6 +619,16 @@ class DocumentApi(DatasetApiResource): if metadata not in self.METADATA_CHOICES: raise InvalidMetadataError(f"Invalid metadata value: {metadata}") + # Calculate summary_index_status if needed + summary_index_status = None + has_summary_index = dataset.summary_index_setting and dataset.summary_index_setting.get("enable") is True + if has_summary_index and document.need_summary is True: + summary_index_status = SummaryIndexService.get_document_summary_index_status( + document_id=document_id, + dataset_id=dataset_id, + tenant_id=tenant_id, + ) + if metadata == "only": response = {"id": document.id, "doc_type": document.doc_type, "doc_metadata": document.doc_metadata_details} elif metadata == "without": @@ -646,6 +663,8 @@ class DocumentApi(DatasetApiResource): "display_status": document.display_status, "doc_form": document.doc_form, "doc_language": document.doc_language, + "summary_index_status": summary_index_status, + "need_summary": document.need_summary if document.need_summary is not None else False, } else: dataset_process_rules = DatasetService.get_process_rules(dataset_id) @@ -681,6 +700,8 @@ class DocumentApi(DatasetApiResource): "display_status": document.display_status, "doc_form": document.doc_form, "doc_language": document.doc_language, + "summary_index_status": summary_index_status, + "need_summary": document.need_summary if document.need_summary is not None else False, } return response diff --git a/api/core/app/apps/base_app_generate_response_converter.py b/api/core/app/apps/base_app_generate_response_converter.py index 74c6d2eca6..d1e2f16b6f 100644 --- a/api/core/app/apps/base_app_generate_response_converter.py +++ b/api/core/app/apps/base_app_generate_response_converter.py @@ -79,6 +79,7 @@ class AppGenerateResponseConverter(ABC): "document_name": resource["document_name"], "score": resource["score"], "content": resource["content"], + "summary": resource.get("summary"), } ) metadata["retriever_resources"] = updated_resources diff --git a/api/core/app/entities/app_bundle_entities.py b/api/core/app/entities/app_bundle_entities.py index 4ed7807346..8566fd2bb1 100644 --- a/api/core/app/entities/app_bundle_entities.py +++ b/api/core/app/entities/app_bundle_entities.py @@ -1,12 +1,17 @@ from __future__ import annotations import re +from datetime import UTC, datetime -from pydantic import BaseModel, Field +from pydantic import BaseModel, ConfigDict, Field + +from core.app.entities.app_asset_entities import AppAssetFileTree # Constants BUNDLE_DSL_FILENAME_PATTERN = re.compile(r"^[^/]+\.ya?ml$") BUNDLE_MAX_SIZE = 50 * 1024 * 1024 # 50MB +MANIFEST_FILENAME = "manifest.json" +MANIFEST_SCHEMA_VERSION = "1.0" # Exceptions @@ -22,21 +27,70 @@ class ZipSecurityError(Exception): pass -# Entities +# Manifest DTOs +class ManifestFileEntry(BaseModel): + """Maps node_id to file path in the bundle.""" + + model_config = ConfigDict(extra="forbid") + + node_id: str + path: str + + +class ManifestIntegrity(BaseModel): + """Basic integrity check fields.""" + + model_config = ConfigDict(extra="forbid") + + file_count: int + + +class ManifestAppAssets(BaseModel): + """App assets section containing the full tree.""" + + model_config = ConfigDict(extra="forbid") + + tree: AppAssetFileTree + + +class BundleManifest(BaseModel): + """ + Bundle manifest for app asset import/export. + + Schema version 1.0: + - dsl_filename: DSL file name in bundle root (e.g. "my_app.yml") + - tree: Full AppAssetFileTree (files + folders) for 100% restoration including node IDs + - files: Explicit node_id -> path mapping for file nodes only + - integrity: Basic file_count validation + """ + + model_config = ConfigDict(extra="forbid") + + schema_version: str = Field(default=MANIFEST_SCHEMA_VERSION) + generated_at: datetime = Field(default_factory=lambda: datetime.now(tz=UTC)) + dsl_filename: str = Field(description="DSL file name in bundle root") + app_assets: ManifestAppAssets + files: list[ManifestFileEntry] + integrity: ManifestIntegrity + + @property + def assets_prefix(self) -> str: + """Assets directory name (DSL filename without extension).""" + return self.dsl_filename.rsplit(".", 1)[0] + + @classmethod + def from_tree(cls, tree: AppAssetFileTree, dsl_filename: str) -> BundleManifest: + """Build manifest from an AppAssetFileTree.""" + files = [ManifestFileEntry(node_id=n.id, path=tree.get_path(n.id)) for n in tree.walk_files()] + return cls( + dsl_filename=dsl_filename, + app_assets=ManifestAppAssets(tree=tree), + files=files, + integrity=ManifestIntegrity(file_count=len(files)), + ) + + +# Export result class BundleExportResult(BaseModel): download_url: str = Field(description="Temporary download URL for the ZIP") filename: str = Field(description="Suggested filename for the ZIP") - - -class SourceFileEntry(BaseModel): - path: str = Field(description="File path within the ZIP") - node_id: str = Field(description="Node ID in the asset tree") - - -class ExtractedFile(BaseModel): - path: str = Field(description="Relative path of the extracted file") - content: bytes = Field(description="File content as bytes") - - -class ExtractedFolder(BaseModel): - path: str = Field(description="Relative path of the extracted folder") diff --git a/api/core/app_assets/storage.py b/api/core/app_assets/storage.py index ae0f137898..17964cb9d5 100644 --- a/api/core/app_assets/storage.py +++ b/api/core/app_assets/storage.py @@ -1,25 +1,31 @@ +"""App assets storage layer. + +This module provides storage abstractions for app assets (draft files, build zips, +resolved assets, skill bundles, source zips, bundle exports/imports). + +Key components: +- AssetPath: Factory for creating typed storage paths +- AppAssetStorage: High-level storage operations with presign support + +All presign operations use the unified FilePresignStorage wrapper, which automatically +falls back to Dify's file proxy when the underlying storage doesn't support presigned URLs. +""" + from __future__ import annotations -import base64 -import hashlib -import hmac -import os -import time -import urllib.parse from abc import ABC, abstractmethod -from collections.abc import Callable, Iterable +from collections.abc import Generator, Iterable from dataclasses import dataclass from typing import Any, ClassVar from uuid import UUID -from configs import dify_config from extensions.storage.base_storage import BaseStorage from extensions.storage.cached_presign_storage import CachedPresignStorage -from libs import rsa +from extensions.storage.file_presign_storage import FilePresignStorage _ASSET_BASE = "app_assets" _SILENT_STORAGE_NOT_FOUND = b"File Not Found" -_ASSET_PATH_REGISTRY: dict[str, tuple[bool, Callable[..., SignedAssetPath]]] = {} +_ASSET_PATH_REGISTRY: dict[str, tuple[bool, Any]] = {} def _require_uuid(value: str, field_name: str) -> None: @@ -29,12 +35,14 @@ def _require_uuid(value: str, field_name: str) -> None: raise ValueError(f"{field_name} must be a UUID") from exc -def register_asset_path(asset_type: str, *, requires_node: bool, factory: Callable[..., SignedAssetPath]) -> None: +def register_asset_path(asset_type: str, *, requires_node: bool, factory: Any) -> None: _ASSET_PATH_REGISTRY[asset_type] = (requires_node, factory) @dataclass(frozen=True) class AssetPathBase(ABC): + """Base class for all asset paths.""" + asset_type: ClassVar[str] tenant_id: str app_id: str @@ -50,49 +58,24 @@ class AssetPathBase(ABC): raise NotImplementedError -class SignedAssetPath(AssetPathBase, ABC): - @abstractmethod - def signature_parts(self) -> tuple[str, str | None]: - """Return (resource_id, sub_resource_id) used for signing. - - sub_resource_id should be None when not applicable. - """ - - @abstractmethod - def proxy_path_parts(self) -> list[str]: - raise NotImplementedError - - @dataclass(frozen=True) -class _DraftAssetPath(SignedAssetPath): +class _DraftAssetPath(AssetPathBase): asset_type: ClassVar[str] = "draft" def get_storage_key(self) -> str: return f"{_ASSET_BASE}/{self.tenant_id}/{self.app_id}/draft/{self.resource_id}" - def signature_parts(self) -> tuple[str, str | None]: - return (self.resource_id, None) - - def proxy_path_parts(self) -> list[str]: - return [self.asset_type, self.tenant_id, self.app_id, self.resource_id] - @dataclass(frozen=True) -class _BuildZipAssetPath(SignedAssetPath): +class _BuildZipAssetPath(AssetPathBase): asset_type: ClassVar[str] = "build-zip" def get_storage_key(self) -> str: return f"{_ASSET_BASE}/{self.tenant_id}/{self.app_id}/artifacts/{self.resource_id}.zip" - def signature_parts(self) -> tuple[str, str | None]: - return (self.resource_id, None) - - def proxy_path_parts(self) -> list[str]: - return [self.asset_type, self.tenant_id, self.app_id, self.resource_id] - @dataclass(frozen=True) -class _ResolvedAssetPath(SignedAssetPath): +class _ResolvedAssetPath(AssetPathBase): asset_type: ClassVar[str] = "resolved" node_id: str @@ -103,80 +86,76 @@ class _ResolvedAssetPath(SignedAssetPath): def get_storage_key(self) -> str: return f"{_ASSET_BASE}/{self.tenant_id}/{self.app_id}/artifacts/{self.resource_id}/resolved/{self.node_id}" - def signature_parts(self) -> tuple[str, str | None]: - return (self.resource_id, self.node_id) - - def proxy_path_parts(self) -> list[str]: - return [self.asset_type, self.tenant_id, self.app_id, self.resource_id, self.node_id] - @dataclass(frozen=True) -class _SkillBundleAssetPath(SignedAssetPath): +class _SkillBundleAssetPath(AssetPathBase): asset_type: ClassVar[str] = "skill-bundle" def get_storage_key(self) -> str: return f"{_ASSET_BASE}/{self.tenant_id}/{self.app_id}/artifacts/{self.resource_id}/skill_artifact_set.json" - def signature_parts(self) -> tuple[str, str | None]: - return (self.resource_id, None) - - def proxy_path_parts(self) -> list[str]: - return [self.asset_type, self.tenant_id, self.app_id, self.resource_id] - @dataclass(frozen=True) -class _SourceZipAssetPath(SignedAssetPath): +class _SourceZipAssetPath(AssetPathBase): asset_type: ClassVar[str] = "source-zip" def get_storage_key(self) -> str: return f"{_ASSET_BASE}/{self.tenant_id}/{self.app_id}/sources/{self.resource_id}.zip" - def signature_parts(self) -> tuple[str, str | None]: - return (self.resource_id, None) - - def proxy_path_parts(self) -> list[str]: - return [self.asset_type, self.tenant_id, self.app_id, self.resource_id] - @dataclass(frozen=True) -class _BundleExportZipAssetPath(SignedAssetPath): +class _BundleExportZipAssetPath(AssetPathBase): asset_type: ClassVar[str] = "bundle-export-zip" def get_storage_key(self) -> str: return f"{_ASSET_BASE}/{self.tenant_id}/{self.app_id}/bundle_exports/{self.resource_id}.zip" - def signature_parts(self) -> tuple[str, str | None]: - return (self.resource_id, None) - def proxy_path_parts(self) -> list[str]: - return [self.asset_type, self.tenant_id, self.app_id, self.resource_id] +@dataclass(frozen=True) +class BundleImportZipPath: + """Path for temporary import zip files.""" + + tenant_id: str + import_id: str + + def __post_init__(self) -> None: + _require_uuid(self.tenant_id, "tenant_id") + + def get_storage_key(self) -> str: + return f"{_ASSET_BASE}/{self.tenant_id}/imports/{self.import_id}.zip" class AssetPath: + """Factory for creating typed asset paths.""" + @staticmethod - def draft(tenant_id: str, app_id: str, node_id: str) -> SignedAssetPath: + def draft(tenant_id: str, app_id: str, node_id: str) -> AssetPathBase: return _DraftAssetPath(tenant_id=tenant_id, app_id=app_id, resource_id=node_id) @staticmethod - def build_zip(tenant_id: str, app_id: str, assets_id: str) -> SignedAssetPath: + def build_zip(tenant_id: str, app_id: str, assets_id: str) -> AssetPathBase: return _BuildZipAssetPath(tenant_id=tenant_id, app_id=app_id, resource_id=assets_id) @staticmethod - def resolved(tenant_id: str, app_id: str, assets_id: str, node_id: str) -> SignedAssetPath: + def resolved(tenant_id: str, app_id: str, assets_id: str, node_id: str) -> AssetPathBase: return _ResolvedAssetPath(tenant_id=tenant_id, app_id=app_id, resource_id=assets_id, node_id=node_id) @staticmethod - def skill_bundle(tenant_id: str, app_id: str, assets_id: str) -> SignedAssetPath: + def skill_bundle(tenant_id: str, app_id: str, assets_id: str) -> AssetPathBase: return _SkillBundleAssetPath(tenant_id=tenant_id, app_id=app_id, resource_id=assets_id) @staticmethod - def source_zip(tenant_id: str, app_id: str, workflow_id: str) -> SignedAssetPath: + def source_zip(tenant_id: str, app_id: str, workflow_id: str) -> AssetPathBase: return _SourceZipAssetPath(tenant_id=tenant_id, app_id=app_id, resource_id=workflow_id) @staticmethod - def bundle_export_zip(tenant_id: str, app_id: str, export_id: str) -> SignedAssetPath: + def bundle_export_zip(tenant_id: str, app_id: str, export_id: str) -> AssetPathBase: return _BundleExportZipAssetPath(tenant_id=tenant_id, app_id=app_id, resource_id=export_id) + @staticmethod + def bundle_import_zip(tenant_id: str, import_id: str) -> BundleImportZipPath: + return BundleImportZipPath(tenant_id=tenant_id, import_id=import_id) + @staticmethod def from_components( asset_type: str, @@ -184,7 +163,7 @@ class AssetPath: app_id: str, resource_id: str, sub_resource_id: str | None = None, - ) -> SignedAssetPath: + ) -> AssetPathBase: entry = _ASSET_PATH_REGISTRY.get(asset_type) if not entry: raise ValueError(f"Unsupported asset type: {asset_type}") @@ -206,120 +185,26 @@ register_asset_path("source-zip", requires_node=False, factory=AssetPath.source_ register_asset_path("bundle-export-zip", requires_node=False, factory=AssetPath.bundle_export_zip) -class AppAssetSigner: - SIGNATURE_PREFIX = "app-asset" - SIGNATURE_VERSION = "v1" - OPERATION_DOWNLOAD = "download" - OPERATION_UPLOAD = "upload" - - @classmethod - def create_download_signature(cls, asset_path: SignedAssetPath, expires_at: int, nonce: str) -> str: - return cls._create_signature( - asset_path=asset_path, - operation=cls.OPERATION_DOWNLOAD, - expires_at=expires_at, - nonce=nonce, - ) - - @classmethod - def create_upload_signature(cls, asset_path: SignedAssetPath, expires_at: int, nonce: str) -> str: - return cls._create_signature( - asset_path=asset_path, - operation=cls.OPERATION_UPLOAD, - expires_at=expires_at, - nonce=nonce, - ) - - @classmethod - def verify_download_signature(cls, asset_path: SignedAssetPath, expires_at: int, nonce: str, sign: str) -> bool: - return cls._verify_signature( - asset_path=asset_path, - operation=cls.OPERATION_DOWNLOAD, - expires_at=expires_at, - nonce=nonce, - sign=sign, - ) - - @classmethod - def verify_upload_signature(cls, asset_path: SignedAssetPath, expires_at: int, nonce: str, sign: str) -> bool: - return cls._verify_signature( - asset_path=asset_path, - operation=cls.OPERATION_UPLOAD, - expires_at=expires_at, - nonce=nonce, - sign=sign, - ) - - @classmethod - def _verify_signature( - cls, - *, - asset_path: SignedAssetPath, - operation: str, - expires_at: int, - nonce: str, - sign: str, - ) -> bool: - if expires_at <= 0: - return False - - expected_sign = cls._create_signature( - asset_path=asset_path, - operation=operation, - expires_at=expires_at, - nonce=nonce, - ) - if not hmac.compare_digest(sign, expected_sign): - return False - - current_time = int(time.time()) - if expires_at < current_time: - return False - - if expires_at - current_time > dify_config.FILES_ACCESS_TIMEOUT: - return False - - return True - - @classmethod - def _create_signature(cls, *, asset_path: SignedAssetPath, operation: str, expires_at: int, nonce: str) -> str: - key = cls._tenant_key(asset_path.tenant_id) - message = cls._signature_message( - asset_path=asset_path, - operation=operation, - expires_at=expires_at, - nonce=nonce, - ) - sign = hmac.new(key, message.encode(), hashlib.sha256).digest() - return base64.urlsafe_b64encode(sign).decode() - - @classmethod - def _signature_message(cls, *, asset_path: SignedAssetPath, operation: str, expires_at: int, nonce: str) -> str: - resource_id, sub_resource_id = asset_path.signature_parts() - return ( - f"{cls.SIGNATURE_PREFIX}|{cls.SIGNATURE_VERSION}|{operation}|" - f"{asset_path.asset_type}|{asset_path.tenant_id}|{asset_path.app_id}|" - f"{resource_id}|{sub_resource_id or ''}|{expires_at}|{nonce}" - ) - - @classmethod - def _tenant_key(cls, tenant_id: str) -> bytes: - try: - rsa_key, _ = rsa.get_decrypt_decoding(tenant_id) - except rsa.PrivkeyNotFoundError as exc: - raise ValueError(f"Tenant private key missing for tenant_id={tenant_id}") from exc - private_key = rsa_key.export_key() - return hashlib.sha256(private_key).digest() - - class AppAssetStorage: - _base_storage: BaseStorage + """High-level storage operations for app assets. + + Wraps BaseStorage with: + - FilePresignStorage for presign fallback support + - CachedPresignStorage for URL caching + + Usage: + storage = AppAssetStorage(base_storage, redis_client=redis) + storage.save(asset_path, content) + url = storage.get_download_url(asset_path) + """ + _storage: CachedPresignStorage def __init__(self, storage: BaseStorage, *, redis_client: Any, cache_key_prefix: str = "app_assets") -> None: - self._base_storage = storage + # Wrap with FilePresignStorage for fallback support, then CachedPresignStorage for caching + presign_storage = FilePresignStorage(storage) self._storage = CachedPresignStorage( - storage=storage, + storage=presign_storage, redis_client=redis_client, cache_key_prefix=cache_key_prefix, ) @@ -329,87 +214,51 @@ class AppAssetStorage: return self._storage def save(self, asset_path: AssetPathBase, content: bytes) -> None: - self._storage.save(self.get_storage_key(asset_path), content) + self._storage.save(asset_path.get_storage_key(), content) def load(self, asset_path: AssetPathBase) -> bytes: - return self._storage.load_once(self.get_storage_key(asset_path)) + return self._storage.load_once(asset_path.get_storage_key()) + + def load_stream(self, asset_path: AssetPathBase) -> Generator[bytes, None, None]: + return self._storage.load_stream(asset_path.get_storage_key()) def load_or_none(self, asset_path: AssetPathBase) -> bytes | None: try: - data = self._storage.load_once(self.get_storage_key(asset_path)) + data = self._storage.load_once(asset_path.get_storage_key()) except FileNotFoundError: return None if data == _SILENT_STORAGE_NOT_FOUND: return None return data + def exists(self, asset_path: AssetPathBase) -> bool: + return self._storage.exists(asset_path.get_storage_key()) + def delete(self, asset_path: AssetPathBase) -> None: - self._storage.delete(self.get_storage_key(asset_path)) + self._storage.delete(asset_path.get_storage_key()) - def get_storage_key(self, asset_path: AssetPathBase) -> str: - return asset_path.get_storage_key() + def get_download_url(self, asset_path: AssetPathBase, expires_in: int = 3600) -> str: + return self._storage.get_download_url(asset_path.get_storage_key(), expires_in) - def get_download_url(self, asset_path: SignedAssetPath, expires_in: int = 3600) -> str: - storage_key = self.get_storage_key(asset_path) + def get_download_urls(self, asset_paths: Iterable[AssetPathBase], expires_in: int = 3600) -> list[str]: + storage_keys = [p.get_storage_key() for p in asset_paths] + return self._storage.get_download_urls(storage_keys, expires_in) + + def get_upload_url(self, asset_path: AssetPathBase, expires_in: int = 3600) -> str: + return self._storage.get_upload_url(asset_path.get_storage_key(), expires_in) + + # Bundle import convenience methods + def get_import_upload_url(self, path: BundleImportZipPath, expires_in: int = 3600) -> str: + return self._storage.get_upload_url(path.get_storage_key(), expires_in) + + def get_import_download_url(self, path: BundleImportZipPath, expires_in: int = 3600) -> str: + return self._storage.get_download_url(path.get_storage_key(), expires_in) + + def delete_import_zip(self, path: BundleImportZipPath) -> None: + """Delete import zip file. Errors are logged but not raised.""" try: - return self._storage.get_download_url(storage_key, expires_in) - except NotImplementedError: - pass + self._storage.delete(path.get_storage_key()) + except Exception: + import logging - return self._generate_signed_proxy_download_url(asset_path, expires_in) - - def get_download_urls( - self, - asset_paths: Iterable[SignedAssetPath], - expires_in: int = 3600, - ) -> list[str]: - asset_paths_list = list(asset_paths) - storage_keys = [self.get_storage_key(asset_path) for asset_path in asset_paths_list] - - try: - return self._storage.get_download_urls(storage_keys, expires_in) - except NotImplementedError: - pass - - return [self._generate_signed_proxy_download_url(asset_path, expires_in) for asset_path in asset_paths_list] - - def get_upload_url( - self, - asset_path: SignedAssetPath, - expires_in: int = 3600, - ) -> str: - storage_key = self.get_storage_key(asset_path) - try: - return self._storage.get_upload_url(storage_key, expires_in) - except NotImplementedError: - pass - - return self._generate_signed_proxy_upload_url(asset_path, expires_in) - - def _generate_signed_proxy_download_url(self, asset_path: SignedAssetPath, expires_in: int) -> str: - expires_in = min(expires_in, dify_config.FILES_ACCESS_TIMEOUT) - expires_at = int(time.time()) + max(expires_in, 1) - nonce = os.urandom(16).hex() - sign = AppAssetSigner.create_download_signature(asset_path=asset_path, expires_at=expires_at, nonce=nonce) - - base_url = dify_config.FILES_URL - url = self._build_proxy_url(base_url=base_url, asset_path=asset_path, action="download") - query = urllib.parse.urlencode({"expires_at": expires_at, "nonce": nonce, "sign": sign}) - return f"{url}?{query}" - - def _generate_signed_proxy_upload_url(self, asset_path: SignedAssetPath, expires_in: int) -> str: - expires_in = min(expires_in, dify_config.FILES_ACCESS_TIMEOUT) - expires_at = int(time.time()) + max(expires_in, 1) - nonce = os.urandom(16).hex() - sign = AppAssetSigner.create_upload_signature(asset_path=asset_path, expires_at=expires_at, nonce=nonce) - - base_url = dify_config.FILES_URL - url = self._build_proxy_url(base_url=base_url, asset_path=asset_path, action="upload") - query = urllib.parse.urlencode({"expires_at": expires_at, "nonce": nonce, "sign": sign}) - return f"{url}?{query}" - - @staticmethod - def _build_proxy_url(*, base_url: str, asset_path: SignedAssetPath, action: str) -> str: - encoded_parts = [urllib.parse.quote(part, safe="") for part in asset_path.proxy_path_parts()] - path = "/".join(encoded_parts) - return f"{base_url}/files/app-assets/{path}/{action}" + logging.getLogger(__name__).debug("Failed to delete import zip: %s", path.get_storage_key()) diff --git a/api/core/app_bundle/__init__.py b/api/core/app_bundle/__init__.py index 5c1c22f206..7fb33b2b6d 100644 --- a/api/core/app_bundle/__init__.py +++ b/api/core/app_bundle/__init__.py @@ -1,5 +1 @@ -from .source_zip_extractor import SourceZipExtractor - -__all__ = [ - "SourceZipExtractor", -] +# App bundle utilities - manifest-driven import/export handled by AppBundleService diff --git a/api/core/app_bundle/source_zip_extractor.py b/api/core/app_bundle/source_zip_extractor.py deleted file mode 100644 index 7d489015b5..0000000000 --- a/api/core/app_bundle/source_zip_extractor.py +++ /dev/null @@ -1,98 +0,0 @@ -from __future__ import annotations - -import io -import zipfile -from typing import TYPE_CHECKING -from uuid import uuid4 - -from core.app.entities.app_asset_entities import AppAssetFileTree, AppAssetNode -from core.app.entities.app_bundle_entities import ExtractedFile, ExtractedFolder, ZipSecurityError -from core.app_assets.storage import AssetPath - -if TYPE_CHECKING: - from core.app_assets.storage import AppAssetStorage - - -class SourceZipExtractor: - def __init__(self, storage: AppAssetStorage) -> None: - self._storage = storage - - def extract_entries( - self, zip_bytes: bytes, *, expected_prefix: str - ) -> tuple[list[ExtractedFolder], list[ExtractedFile]]: - folders: list[ExtractedFolder] = [] - files: list[ExtractedFile] = [] - - with zipfile.ZipFile(io.BytesIO(zip_bytes), "r") as zf: - for info in zf.infolist(): - name = info.filename - self._validate_path(name) - - if not name.startswith(expected_prefix): - continue - - relative_path = name[len(expected_prefix) :].lstrip("/") - if not relative_path: - continue - - if info.is_dir(): - folders.append(ExtractedFolder(path=relative_path.rstrip("/"))) - else: - content = zf.read(info) - files.append(ExtractedFile(path=relative_path, content=content)) - - return folders, files - - def build_tree_and_save( - self, - folders: list[ExtractedFolder], - files: list[ExtractedFile], - tenant_id: str, - app_id: str, - ) -> AppAssetFileTree: - tree = AppAssetFileTree() - path_to_node_id: dict[str, str] = {} - - all_folder_paths = {f.path for f in folders} - for file in files: - self._ensure_parent_folders(file.path, all_folder_paths) - - sorted_folders = sorted(all_folder_paths, key=lambda p: p.count("/")) - for folder_path in sorted_folders: - node_id = str(uuid4()) - name = folder_path.rsplit("/", 1)[-1] - parent_path = folder_path.rsplit("/", 1)[0] if "/" in folder_path else None - parent_id = path_to_node_id.get(parent_path) if parent_path else None - - node = AppAssetNode.create_folder(node_id, name, parent_id) - tree.add(node) - path_to_node_id[folder_path] = node_id - - sorted_files = sorted(files, key=lambda f: f.path) - for file in sorted_files: - node_id = str(uuid4()) - name = file.path.rsplit("/", 1)[-1] - parent_path = file.path.rsplit("/", 1)[0] if "/" in file.path else None - parent_id = path_to_node_id.get(parent_path) if parent_path else None - - node = AppAssetNode.create_file(node_id, name, parent_id, len(file.content)) - tree.add(node) - - asset_path = AssetPath.draft(tenant_id, app_id, node_id) - self._storage.save(asset_path, file.content) - - return tree - - def _validate_path(self, path: str) -> None: - if ".." in path: - raise ZipSecurityError(f"Path traversal detected: {path}") - if path.startswith("/"): - raise ZipSecurityError(f"Absolute path detected: {path}") - if "\\" in path: - raise ZipSecurityError(f"Backslash in path: {path}") - - def _ensure_parent_folders(self, file_path: str, folder_set: set[str]) -> None: - parts = file_path.split("/")[:-1] - for i in range(1, len(parts) + 1): - parent = "/".join(parts[:i]) - folder_set.add(parent) diff --git a/api/core/entities/knowledge_entities.py b/api/core/entities/knowledge_entities.py index d4093b5245..b1ba3c3e2a 100644 --- a/api/core/entities/knowledge_entities.py +++ b/api/core/entities/knowledge_entities.py @@ -3,6 +3,7 @@ from pydantic import BaseModel, Field, field_validator class PreviewDetail(BaseModel): content: str + summary: str | None = None child_chunks: list[str] | None = None diff --git a/api/core/file/file_manager.py b/api/core/file/file_manager.py index 93c1a9be99..0184ff0f82 100644 --- a/api/core/file/file_manager.py +++ b/api/core/file/file_manager.py @@ -123,6 +123,8 @@ def download(f: File, /): ): return _download_file_content(f.storage_key) elif f.transfer_method == FileTransferMethod.REMOTE_URL: + if f.remote_url is None: + raise ValueError("Missing file remote_url") response = ssrf_proxy.get(f.remote_url, follow_redirects=True) response.raise_for_status() return response.content @@ -153,6 +155,8 @@ def _download_file_content(path: str, /): def _get_encoded_string(f: File, /): match f.transfer_method: case FileTransferMethod.REMOTE_URL: + if f.remote_url is None: + raise ValueError("Missing file remote_url") response = ssrf_proxy.get(f.remote_url, follow_redirects=True) response.raise_for_status() data = response.content diff --git a/api/core/helper/ssrf_proxy.py b/api/core/helper/ssrf_proxy.py index 128c64ff2c..ddccfbaf45 100644 --- a/api/core/helper/ssrf_proxy.py +++ b/api/core/helper/ssrf_proxy.py @@ -4,8 +4,10 @@ Proxy requests to avoid SSRF import logging import time +from typing import Any, TypeAlias import httpx +from pydantic import TypeAdapter, ValidationError from configs import dify_config from core.helper.http_client_pooling import get_pooled_http_client @@ -18,6 +20,9 @@ SSRF_DEFAULT_MAX_RETRIES = dify_config.SSRF_DEFAULT_MAX_RETRIES BACKOFF_FACTOR = 0.5 STATUS_FORCELIST = [429, 500, 502, 503, 504] +Headers: TypeAlias = dict[str, str] +_HEADERS_ADAPTER = TypeAdapter(Headers) + _SSL_VERIFIED_POOL_KEY = "ssrf:verified" _SSL_UNVERIFIED_POOL_KEY = "ssrf:unverified" _SSRF_CLIENT_LIMITS = httpx.Limits( @@ -76,7 +81,7 @@ def _get_ssrf_client(ssl_verify_enabled: bool) -> httpx.Client: ) -def _get_user_provided_host_header(headers: dict | None) -> str | None: +def _get_user_provided_host_header(headers: Headers | None) -> str | None: """ Extract the user-provided Host header from the headers dict. @@ -92,7 +97,7 @@ def _get_user_provided_host_header(headers: dict | None) -> str | None: return None -def _inject_trace_headers(headers: dict | None) -> dict: +def _inject_trace_headers(headers: Headers | None) -> Headers: """ Inject W3C traceparent header for distributed tracing. @@ -125,7 +130,7 @@ def _inject_trace_headers(headers: dict | None) -> dict: return headers -def make_request(method, url, max_retries=SSRF_DEFAULT_MAX_RETRIES, **kwargs): +def make_request(method: str, url: str, max_retries: int = SSRF_DEFAULT_MAX_RETRIES, **kwargs: Any) -> httpx.Response: # Convert requests-style allow_redirects to httpx-style follow_redirects if "allow_redirects" in kwargs: allow_redirects = kwargs.pop("allow_redirects") @@ -142,10 +147,15 @@ def make_request(method, url, max_retries=SSRF_DEFAULT_MAX_RETRIES, **kwargs): # prioritize per-call option, which can be switched on and off inside the HTTP node on the web UI verify_option = kwargs.pop("ssl_verify", dify_config.HTTP_REQUEST_NODE_SSL_VERIFY) + if not isinstance(verify_option, bool): + raise ValueError("ssl_verify must be a boolean") client = _get_ssrf_client(verify_option) # Inject traceparent header for distributed tracing (when OTEL is not enabled) - headers = kwargs.get("headers") or {} + try: + headers: Headers = _HEADERS_ADAPTER.validate_python(kwargs.get("headers") or {}) + except ValidationError as e: + raise ValueError("headers must be a mapping of string keys to string values") from e headers = _inject_trace_headers(headers) kwargs["headers"] = headers @@ -198,25 +208,25 @@ def make_request(method, url, max_retries=SSRF_DEFAULT_MAX_RETRIES, **kwargs): raise MaxRetriesExceededError(f"Reached maximum retries ({max_retries}) for URL {url}") -def get(url, max_retries=SSRF_DEFAULT_MAX_RETRIES, **kwargs): +def get(url: str, max_retries: int = SSRF_DEFAULT_MAX_RETRIES, **kwargs: Any) -> httpx.Response: return make_request("GET", url, max_retries=max_retries, **kwargs) -def post(url, max_retries=SSRF_DEFAULT_MAX_RETRIES, **kwargs): +def post(url: str, max_retries: int = SSRF_DEFAULT_MAX_RETRIES, **kwargs: Any) -> httpx.Response: return make_request("POST", url, max_retries=max_retries, **kwargs) -def put(url, max_retries=SSRF_DEFAULT_MAX_RETRIES, **kwargs): +def put(url: str, max_retries: int = SSRF_DEFAULT_MAX_RETRIES, **kwargs: Any) -> httpx.Response: return make_request("PUT", url, max_retries=max_retries, **kwargs) -def patch(url, max_retries=SSRF_DEFAULT_MAX_RETRIES, **kwargs): +def patch(url: str, max_retries: int = SSRF_DEFAULT_MAX_RETRIES, **kwargs: Any) -> httpx.Response: return make_request("PATCH", url, max_retries=max_retries, **kwargs) -def delete(url, max_retries=SSRF_DEFAULT_MAX_RETRIES, **kwargs): +def delete(url: str, max_retries: int = SSRF_DEFAULT_MAX_RETRIES, **kwargs: Any) -> httpx.Response: return make_request("DELETE", url, max_retries=max_retries, **kwargs) -def head(url, max_retries=SSRF_DEFAULT_MAX_RETRIES, **kwargs): +def head(url: str, max_retries: int = SSRF_DEFAULT_MAX_RETRIES, **kwargs: Any) -> httpx.Response: return make_request("HEAD", url, max_retries=max_retries, **kwargs) diff --git a/api/core/indexing_runner.py b/api/core/indexing_runner.py index f1b50f360b..e172e88298 100644 --- a/api/core/indexing_runner.py +++ b/api/core/indexing_runner.py @@ -311,14 +311,18 @@ class IndexingRunner: qa_preview_texts: list[QAPreviewDetail] = [] total_segments = 0 + # doc_form represents the segmentation method (general, parent-child, QA) index_type = doc_form index_processor = IndexProcessorFactory(index_type).init_index_processor() + # one extract_setting is one source document for extract_setting in extract_settings: # extract processing_rule = DatasetProcessRule( mode=tmp_processing_rule["mode"], rules=json.dumps(tmp_processing_rule["rules"]) ) + # Extract document content text_docs = index_processor.extract(extract_setting, process_rule_mode=tmp_processing_rule["mode"]) + # Cleaning and segmentation documents = index_processor.transform( text_docs, current_user=None, @@ -361,6 +365,12 @@ class IndexingRunner: if doc_form and doc_form == "qa_model": return IndexingEstimate(total_segments=total_segments * 20, qa_preview=qa_preview_texts, preview=[]) + + # Generate summary preview + summary_index_setting = tmp_processing_rule.get("summary_index_setting") + if summary_index_setting and summary_index_setting.get("enable") and preview_texts: + preview_texts = index_processor.generate_summary_preview(tenant_id, preview_texts, summary_index_setting) + return IndexingEstimate(total_segments=total_segments, preview=preview_texts) def _extract( diff --git a/api/core/llm_generator/llm_generator.py b/api/core/llm_generator/llm_generator.py index 880c0142c2..23ba87c031 100644 --- a/api/core/llm_generator/llm_generator.py +++ b/api/core/llm_generator/llm_generator.py @@ -471,7 +471,6 @@ class LLMGenerator: prompt_messages=complete_messages, output_model=CodeNodeStructuredOutput, model_parameters=model_parameters, - stream=True, tenant_id=tenant_id, ) @@ -553,16 +552,10 @@ class LLMGenerator: completion_params = model_config.get("completion_params", {}) if model_config else {} try: - response = invoke_llm_with_pydantic_model( - provider=model_instance.provider, - model_schema=model_schema, - model_instance=model_instance, - prompt_messages=prompt_messages, - output_model=SuggestedQuestionsOutput, - model_parameters=completion_params, - stream=True, - tenant_id=tenant_id, - ) + response = invoke_llm_with_pydantic_model(provider=model_instance.provider, model_schema=model_schema, + model_instance=model_instance, prompt_messages=prompt_messages, + output_model=SuggestedQuestionsOutput, + model_parameters=completion_params, tenant_id=tenant_id) return {"questions": response.questions, "error": ""} @@ -842,15 +835,11 @@ Generate {language} code to extract/transform available variables for the target try: from core.llm_generator.output_parser.structured_output import invoke_llm_with_pydantic_model - response = invoke_llm_with_pydantic_model( - provider=model_instance.provider, - model_schema=model_schema, - model_instance=model_instance, - prompt_messages=list(prompt_messages), - output_model=InstructionModifyOutput, - model_parameters=model_parameters, - stream=True, - ) + response = invoke_llm_with_pydantic_model(provider=model_instance.provider, model_schema=model_schema, + model_instance=model_instance, + prompt_messages=list(prompt_messages), + output_model=InstructionModifyOutput, + model_parameters=model_parameters) return response.model_dump(mode="python") except InvokeError as e: error = str(e) diff --git a/api/core/llm_generator/output_parser/structured_output.py b/api/core/llm_generator/output_parser/structured_output.py index 9122519854..8f166a5757 100644 --- a/api/core/llm_generator/output_parser/structured_output.py +++ b/api/core/llm_generator/output_parser/structured_output.py @@ -1,8 +1,8 @@ import json -from collections.abc import Generator, Mapping, Sequence +from collections.abc import Mapping, Sequence from copy import deepcopy from enum import StrEnum -from typing import Any, Literal, TypeVar, cast, overload +from typing import Any, TypeVar, cast import json_repair from pydantic import BaseModel, TypeAdapter, ValidationError @@ -14,13 +14,9 @@ from core.model_manager import ModelInstance from core.model_runtime.callbacks.base_callback import Callback from core.model_runtime.entities.llm_entities import ( LLMResult, - LLMResultChunk, - LLMResultChunkDelta, - LLMResultChunkWithStructuredOutput, LLMResultWithStructuredOutput, ) from core.model_runtime.entities.message_entities import ( - AssistantPromptMessage, PromptMessage, PromptMessageTool, SystemPromptMessage, @@ -52,7 +48,6 @@ TOOL_CALL_FEATURES = {ModelFeature.TOOL_CALL, ModelFeature.MULTI_TOOL_CALL, Mode T = TypeVar("T", bound=BaseModel) -@overload def invoke_llm_with_structured_output( *, provider: str, @@ -63,58 +58,10 @@ def invoke_llm_with_structured_output( model_parameters: Mapping | None = None, tools: Sequence[PromptMessageTool] | None = None, stop: list[str] | None = None, - stream: Literal[True], user: str | None = None, callbacks: list[Callback] | None = None, tenant_id: str | None = None, -) -> Generator[LLMResultChunkWithStructuredOutput, None, None]: ... -@overload -def invoke_llm_with_structured_output( - *, - provider: str, - model_schema: AIModelEntity, - model_instance: ModelInstance, - prompt_messages: Sequence[PromptMessage], - json_schema: Mapping[str, Any], - model_parameters: Mapping | None = None, - tools: Sequence[PromptMessageTool] | None = None, - stop: list[str] | None = None, - stream: Literal[False], - user: str | None = None, - callbacks: list[Callback] | None = None, - tenant_id: str | None = None, -) -> LLMResultWithStructuredOutput: ... -@overload -def invoke_llm_with_structured_output( - *, - provider: str, - model_schema: AIModelEntity, - model_instance: ModelInstance, - prompt_messages: Sequence[PromptMessage], - json_schema: Mapping[str, Any], - model_parameters: Mapping | None = None, - tools: Sequence[PromptMessageTool] | None = None, - stop: list[str] | None = None, - stream: bool = True, - user: str | None = None, - callbacks: list[Callback] | None = None, - tenant_id: str | None = None, -) -> LLMResultWithStructuredOutput | Generator[LLMResultChunkWithStructuredOutput, None, None]: ... -def invoke_llm_with_structured_output( - *, - provider: str, - model_schema: AIModelEntity, - model_instance: ModelInstance, - prompt_messages: Sequence[PromptMessage], - json_schema: Mapping[str, Any], - model_parameters: Mapping | None = None, - tools: Sequence[PromptMessageTool] | None = None, - stop: list[str] | None = None, - stream: bool = True, - user: str | None = None, - callbacks: list[Callback] | None = None, - tenant_id: str | None = None, -) -> LLMResultWithStructuredOutput | Generator[LLMResultChunkWithStructuredOutput, None, None]: +) -> LLMResultWithStructuredOutput: """ Invoke large language model with structured output. @@ -129,7 +76,6 @@ def invoke_llm_with_structured_output( :param model_parameters: model parameters :param tools: tools for tool calling :param stop: stop words - :param stream: is stream response :param user: unique user id :param callbacks: callbacks :param tenant_id: tenant ID for file reference conversion. When provided and @@ -165,91 +111,33 @@ def invoke_llm_with_structured_output( model_parameters=model_parameters_with_json_schema, tools=tools, stop=stop, - stream=stream, + stream=False, user=user, callbacks=callbacks, ) - if isinstance(llm_result, LLMResult): - # Non-streaming result - structured_output = _extract_structured_output(llm_result) + # Non-streaming result + structured_output = _extract_structured_output(llm_result) - # Fill missing fields with default values - structured_output = fill_defaults_from_schema(structured_output, json_schema) + # Fill missing fields with default values + structured_output = fill_defaults_from_schema(structured_output, json_schema) - # Convert file references if tenant_id is provided - if tenant_id is not None: - structured_output = convert_file_refs_in_output( - output=structured_output, - json_schema=json_schema, - tenant_id=tenant_id, - ) - - return LLMResultWithStructuredOutput( - structured_output=structured_output, - model=llm_result.model, - message=llm_result.message, - usage=llm_result.usage, - system_fingerprint=llm_result.system_fingerprint, - prompt_messages=llm_result.prompt_messages, + # Convert file references if tenant_id is provided + if tenant_id is not None: + structured_output = convert_file_refs_in_output( + output=structured_output, + json_schema=json_schema, + tenant_id=tenant_id, ) - else: - def generator() -> Generator[LLMResultChunkWithStructuredOutput, None, None]: - result_text: str = "" - tool_call_args: dict[str, str] = {} # tool_call_id -> arguments - prompt_messages: Sequence[PromptMessage] = [] - system_fingerprint: str | None = None - - for event in llm_result: - if isinstance(event, LLMResultChunk): - prompt_messages = event.prompt_messages - system_fingerprint = event.system_fingerprint - - # Collect text content - result_text += event.delta.message.get_text_content() - # Collect tool call arguments - if event.delta.message.tool_calls: - for tool_call in event.delta.message.tool_calls: - call_id = tool_call.id or "" - if tool_call.function.arguments: - tool_call_args[call_id] = tool_call_args.get(call_id, "") + tool_call.function.arguments - - yield LLMResultChunkWithStructuredOutput( - model=model_schema.model, - prompt_messages=prompt_messages, - system_fingerprint=system_fingerprint, - delta=event.delta, - ) - - # Extract structured output: prefer tool call, fallback to text - structured_output = _extract_structured_output_from_stream(result_text, tool_call_args) - - # Fill missing fields with default values - structured_output = fill_defaults_from_schema(structured_output, json_schema) - - # Convert file references if tenant_id is provided - if tenant_id is not None: - structured_output = convert_file_refs_in_output( - output=structured_output, - json_schema=json_schema, - tenant_id=tenant_id, - ) - - yield LLMResultChunkWithStructuredOutput( - structured_output=structured_output, - model=model_schema.model, - prompt_messages=prompt_messages, - system_fingerprint=system_fingerprint, - delta=LLMResultChunkDelta( - index=0, - message=AssistantPromptMessage(content=""), - usage=None, - finish_reason=None, - ), - ) - - return generator() + return LLMResultWithStructuredOutput( + structured_output=structured_output, + model=llm_result.model, + message=llm_result.message, + usage=llm_result.usage, + system_fingerprint=llm_result.system_fingerprint, + prompt_messages=llm_result.prompt_messages, + ) def invoke_llm_with_pydantic_model( @@ -262,7 +150,6 @@ def invoke_llm_with_pydantic_model( model_parameters: Mapping | None = None, tools: Sequence[PromptMessageTool] | None = None, stop: list[str] | None = None, - stream: bool = True, # Some model plugin implementations don't support stream=False user: str | None = None, callbacks: list[Callback] | None = None, tenant_id: str | None = None, @@ -281,36 +168,6 @@ def invoke_llm_with_pydantic_model( """ json_schema = _schema_from_pydantic(output_model) - if stream: - result_generator = invoke_llm_with_structured_output( - provider=provider, - model_schema=model_schema, - model_instance=model_instance, - prompt_messages=prompt_messages, - json_schema=json_schema, - model_parameters=model_parameters, - tools=tools, - stop=stop, - stream=True, - user=user, - callbacks=callbacks, - tenant_id=tenant_id, - ) - - # Consume the generator to get the final chunk with structured_output - last_chunk: LLMResultChunkWithStructuredOutput | None = None - for chunk in result_generator: - last_chunk = chunk - - if last_chunk is None: - raise OutputParserError("No chunks received from LLM") - - structured_output = last_chunk.structured_output - if structured_output is None: - raise OutputParserError("Structured output is empty") - - return _validate_structured_output(output_model, structured_output) - result = invoke_llm_with_structured_output( provider=provider, model_schema=model_schema, @@ -320,7 +177,6 @@ def invoke_llm_with_pydantic_model( model_parameters=model_parameters, tools=tools, stop=stop, - stream=False, user=user, callbacks=callbacks, tenant_id=tenant_id, @@ -416,7 +272,7 @@ def _parse_tool_call_arguments(arguments: str) -> Mapping[str, Any]: repaired = json_repair.loads(arguments) if not isinstance(repaired, dict): raise OutputParserError(f"Failed to parse tool call arguments: {arguments}") - return cast(dict, repaired) + return repaired def _get_default_value_for_type(type_name: str | list[str] | None) -> Any: diff --git a/api/core/llm_generator/prompts.py b/api/core/llm_generator/prompts.py index 5db3733bd9..a15d0a7840 100644 --- a/api/core/llm_generator/prompts.py +++ b/api/core/llm_generator/prompts.py @@ -435,3 +435,20 @@ INSTRUCTION_GENERATE_TEMPLATE_PROMPT = """The output of this prompt is not as ex You should edit the prompt according to the IDEAL OUTPUT.""" INSTRUCTION_GENERATE_TEMPLATE_CODE = """Please fix the errors in the {{#error_message#}}.""" + +DEFAULT_GENERATOR_SUMMARY_PROMPT = ( + """Summarize the following content. Extract only the key information and main points. """ + """Remove redundant details. + +Requirements: +1. Write a concise summary in plain text +2. Use the same language as the input content +3. Focus on important facts, concepts, and details +4. If images are included, describe their key information +5. Do not use words like "好的", "ok", "I understand", "This text discusses", "The content mentions" +6. Write directly without extra words + +Output only the summary text. Start summarizing now: + +""" +) diff --git a/api/core/model_runtime/model_providers/__base/ai_model.py b/api/core/model_runtime/model_providers/__base/ai_model.py index 45f0335c2e..c3e50eaddd 100644 --- a/api/core/model_runtime/model_providers/__base/ai_model.py +++ b/api/core/model_runtime/model_providers/__base/ai_model.py @@ -1,10 +1,11 @@ import decimal import hashlib -from threading import Lock +import logging -from pydantic import BaseModel, ConfigDict, Field +from pydantic import BaseModel, ConfigDict, Field, ValidationError +from redis import RedisError -import contexts +from configs import dify_config from core.model_runtime.entities.common_entities import I18nObject from core.model_runtime.entities.defaults import PARAMETER_RULE_TEMPLATE from core.model_runtime.entities.model_entities import ( @@ -24,6 +25,9 @@ from core.model_runtime.errors.invoke import ( InvokeServerUnavailableError, ) from core.plugin.entities.plugin_daemon import PluginModelProviderEntity +from extensions.ext_redis import redis_client + +logger = logging.getLogger(__name__) class AIModel(BaseModel): @@ -144,34 +148,60 @@ class AIModel(BaseModel): plugin_model_manager = PluginModelClient() cache_key = f"{self.tenant_id}:{self.plugin_id}:{self.provider_name}:{self.model_type.value}:{model}" - # sort credentials sorted_credentials = sorted(credentials.items()) if credentials else [] cache_key += ":".join([hashlib.md5(f"{k}:{v}".encode()).hexdigest() for k, v in sorted_credentials]) + cached_schema_json = None try: - contexts.plugin_model_schemas.get() - except LookupError: - contexts.plugin_model_schemas.set({}) - contexts.plugin_model_schema_lock.set(Lock()) - - with contexts.plugin_model_schema_lock.get(): - if cache_key in contexts.plugin_model_schemas.get(): - return contexts.plugin_model_schemas.get()[cache_key] - - schema = plugin_model_manager.get_model_schema( - tenant_id=self.tenant_id, - user_id="unknown", - plugin_id=self.plugin_id, - provider=self.provider_name, - model_type=self.model_type.value, - model=model, - credentials=credentials or {}, + cached_schema_json = redis_client.get(cache_key) + except (RedisError, RuntimeError) as exc: + logger.warning( + "Failed to read plugin model schema cache for model %s: %s", + model, + str(exc), + exc_info=True, ) + if cached_schema_json: + try: + return AIModelEntity.model_validate_json(cached_schema_json) + except ValidationError: + logger.warning( + "Failed to validate cached plugin model schema for model %s", + model, + exc_info=True, + ) + try: + redis_client.delete(cache_key) + except (RedisError, RuntimeError) as exc: + logger.warning( + "Failed to delete invalid plugin model schema cache for model %s: %s", + model, + str(exc), + exc_info=True, + ) - if schema: - contexts.plugin_model_schemas.get()[cache_key] = schema + schema = plugin_model_manager.get_model_schema( + tenant_id=self.tenant_id, + user_id="unknown", + plugin_id=self.plugin_id, + provider=self.provider_name, + model_type=self.model_type.value, + model=model, + credentials=credentials or {}, + ) - return schema + if schema: + try: + redis_client.setex(cache_key, dify_config.PLUGIN_MODEL_SCHEMA_CACHE_TTL, schema.model_dump_json()) + except (RedisError, RuntimeError) as exc: + logger.warning( + "Failed to write plugin model schema cache for model %s: %s", + model, + str(exc), + exc_info=True, + ) + + return schema def get_customizable_model_schema_from_credentials(self, model: str, credentials: dict) -> AIModelEntity | None: """ diff --git a/api/core/model_runtime/model_providers/model_provider_factory.py b/api/core/model_runtime/model_providers/model_provider_factory.py index 28f162a928..64538a6779 100644 --- a/api/core/model_runtime/model_providers/model_provider_factory.py +++ b/api/core/model_runtime/model_providers/model_provider_factory.py @@ -5,7 +5,11 @@ import logging from collections.abc import Sequence from threading import Lock +from pydantic import ValidationError +from redis import RedisError + import contexts +from configs import dify_config from core.model_runtime.entities.model_entities import AIModelEntity, ModelType from core.model_runtime.entities.provider_entities import ProviderConfig, ProviderEntity, SimpleProviderEntity from core.model_runtime.model_providers.__base.ai_model import AIModel @@ -18,6 +22,7 @@ from core.model_runtime.model_providers.__base.tts_model import TTSModel from core.model_runtime.schema_validators.model_credential_schema_validator import ModelCredentialSchemaValidator from core.model_runtime.schema_validators.provider_credential_schema_validator import ProviderCredentialSchemaValidator from core.plugin.entities.plugin_daemon import PluginModelProviderEntity +from extensions.ext_redis import redis_client from models.provider_ids import ModelProviderID logger = logging.getLogger(__name__) @@ -175,34 +180,60 @@ class ModelProviderFactory: """ plugin_id, provider_name = self.get_plugin_id_and_provider_name_from_provider(provider) cache_key = f"{self.tenant_id}:{plugin_id}:{provider_name}:{model_type.value}:{model}" - # sort credentials sorted_credentials = sorted(credentials.items()) if credentials else [] cache_key += ":".join([hashlib.md5(f"{k}:{v}".encode()).hexdigest() for k, v in sorted_credentials]) + cached_schema_json = None try: - contexts.plugin_model_schemas.get() - except LookupError: - contexts.plugin_model_schemas.set({}) - contexts.plugin_model_schema_lock.set(Lock()) - - with contexts.plugin_model_schema_lock.get(): - if cache_key in contexts.plugin_model_schemas.get(): - return contexts.plugin_model_schemas.get()[cache_key] - - schema = self.plugin_model_manager.get_model_schema( - tenant_id=self.tenant_id, - user_id="unknown", - plugin_id=plugin_id, - provider=provider_name, - model_type=model_type.value, - model=model, - credentials=credentials or {}, + cached_schema_json = redis_client.get(cache_key) + except (RedisError, RuntimeError) as exc: + logger.warning( + "Failed to read plugin model schema cache for model %s: %s", + model, + str(exc), + exc_info=True, ) + if cached_schema_json: + try: + return AIModelEntity.model_validate_json(cached_schema_json) + except ValidationError: + logger.warning( + "Failed to validate cached plugin model schema for model %s", + model, + exc_info=True, + ) + try: + redis_client.delete(cache_key) + except (RedisError, RuntimeError) as exc: + logger.warning( + "Failed to delete invalid plugin model schema cache for model %s: %s", + model, + str(exc), + exc_info=True, + ) - if schema: - contexts.plugin_model_schemas.get()[cache_key] = schema + schema = self.plugin_model_manager.get_model_schema( + tenant_id=self.tenant_id, + user_id="unknown", + plugin_id=plugin_id, + provider=provider_name, + model_type=model_type.value, + model=model, + credentials=credentials or {}, + ) - return schema + if schema: + try: + redis_client.setex(cache_key, dify_config.PLUGIN_MODEL_SCHEMA_CACHE_TTL, schema.model_dump_json()) + except (RedisError, RuntimeError) as exc: + logger.warning( + "Failed to write plugin model schema cache for model %s: %s", + model, + str(exc), + exc_info=True, + ) + + return schema def get_models( self, diff --git a/api/core/plugin/backwards_invocation/model.py b/api/core/plugin/backwards_invocation/model.py index 6cdc047a64..1abd9fabc7 100644 --- a/api/core/plugin/backwards_invocation/model.py +++ b/api/core/plugin/backwards_invocation/model.py @@ -114,46 +114,32 @@ class PluginModelBackwardsInvocation(BaseBackwardsInvocation): model_instance=model_instance, prompt_messages=payload.prompt_messages, json_schema=payload.structured_output_schema, + model_parameters=payload.completion_params, tools=payload.tools, stop=payload.stop, - stream=True if payload.stream is None else payload.stream, - user=user_id, - model_parameters=payload.completion_params, + user=user_id ) - if isinstance(response, Generator): + if response.usage: + llm_utils.deduct_llm_quota(tenant_id=tenant.id, model_instance=model_instance, usage=response.usage) - def handle() -> Generator[LLMResultChunkWithStructuredOutput, None, None]: - for chunk in response: - if chunk.delta.usage: - llm_utils.deduct_llm_quota( - tenant_id=tenant.id, model_instance=model_instance, usage=chunk.delta.usage - ) - chunk.prompt_messages = [] - yield chunk + def handle_non_streaming( + response: LLMResultWithStructuredOutput, + ) -> Generator[LLMResultChunkWithStructuredOutput, None, None]: + yield LLMResultChunkWithStructuredOutput( + model=response.model, + prompt_messages=[], + system_fingerprint=response.system_fingerprint, + structured_output=response.structured_output, + delta=LLMResultChunkDelta( + index=0, + message=response.message, + usage=response.usage, + finish_reason="", + ), + ) - return handle() - else: - if response.usage: - llm_utils.deduct_llm_quota(tenant_id=tenant.id, model_instance=model_instance, usage=response.usage) - - def handle_non_streaming( - response: LLMResultWithStructuredOutput, - ) -> Generator[LLMResultChunkWithStructuredOutput, None, None]: - yield LLMResultChunkWithStructuredOutput( - model=response.model, - prompt_messages=[], - system_fingerprint=response.system_fingerprint, - structured_output=response.structured_output, - delta=LLMResultChunkDelta( - index=0, - message=response.message, - usage=response.usage, - finish_reason="", - ), - ) - - return handle_non_streaming(response) + return handle_non_streaming(response) @classmethod def invoke_text_embedding(cls, user_id: str, tenant: Tenant, payload: RequestInvokeTextEmbedding): diff --git a/api/core/rag/datasource/retrieval_service.py b/api/core/rag/datasource/retrieval_service.py index 8ec1ce6242..91c16ce079 100644 --- a/api/core/rag/datasource/retrieval_service.py +++ b/api/core/rag/datasource/retrieval_service.py @@ -24,7 +24,13 @@ from core.rag.rerank.rerank_type import RerankMode from core.rag.retrieval.retrieval_methods import RetrievalMethod from core.tools.signature import sign_upload_file from extensions.ext_database import db -from models.dataset import ChildChunk, Dataset, DocumentSegment, SegmentAttachmentBinding +from models.dataset import ( + ChildChunk, + Dataset, + DocumentSegment, + DocumentSegmentSummary, + SegmentAttachmentBinding, +) from models.dataset import Document as DatasetDocument from models.model import UploadFile from services.external_knowledge_service import ExternalDatasetService @@ -389,15 +395,15 @@ class RetrievalService: .all() } - records = [] - include_segment_ids = set() - segment_child_map = {} - valid_dataset_documents = {} image_doc_ids: list[Any] = [] child_index_node_ids = [] index_node_ids = [] doc_to_document_map = {} + summary_segment_ids = set() # Track segments retrieved via summary + summary_score_map: dict[str, float] = {} # Map original_chunk_id to summary score + + # First pass: collect all document IDs and identify summary documents for document in documents: document_id = document.metadata.get("document_id") if document_id not in dataset_documents: @@ -408,16 +414,39 @@ class RetrievalService: continue valid_dataset_documents[document_id] = dataset_document + doc_id = document.metadata.get("doc_id") or "" + doc_to_document_map[doc_id] = document + + # Check if this is a summary document + is_summary = document.metadata.get("is_summary", False) + if is_summary: + # For summary documents, find the original chunk via original_chunk_id + original_chunk_id = document.metadata.get("original_chunk_id") + if original_chunk_id: + summary_segment_ids.add(original_chunk_id) + # Save summary's score for later use + summary_score = document.metadata.get("score") + if summary_score is not None: + try: + summary_score_float = float(summary_score) + # If the same segment has multiple summary hits, take the highest score + if original_chunk_id not in summary_score_map: + summary_score_map[original_chunk_id] = summary_score_float + else: + summary_score_map[original_chunk_id] = max( + summary_score_map[original_chunk_id], summary_score_float + ) + except (ValueError, TypeError): + # Skip invalid score values + pass + continue # Skip adding to other lists for summary documents + if dataset_document.doc_form == IndexStructureType.PARENT_CHILD_INDEX: - doc_id = document.metadata.get("doc_id") or "" - doc_to_document_map[doc_id] = document if document.metadata.get("doc_type") == DocType.IMAGE: image_doc_ids.append(doc_id) else: child_index_node_ids.append(doc_id) else: - doc_id = document.metadata.get("doc_id") or "" - doc_to_document_map[doc_id] = document if document.metadata.get("doc_type") == DocType.IMAGE: image_doc_ids.append(doc_id) else: @@ -433,6 +462,7 @@ class RetrievalService: attachment_map: dict[str, list[dict[str, Any]]] = {} child_chunk_map: dict[str, list[ChildChunk]] = {} doc_segment_map: dict[str, list[str]] = {} + segment_summary_map: dict[str, str] = {} # Map segment_id to summary content with session_factory.create_session() as session: attachments = cls.get_segment_attachment_infos(image_doc_ids, session) @@ -447,6 +477,7 @@ class RetrievalService: doc_segment_map[attachment["segment_id"]].append(attachment["attachment_id"]) else: doc_segment_map[attachment["segment_id"]] = [attachment["attachment_id"]] + child_chunk_stmt = select(ChildChunk).where(ChildChunk.index_node_id.in_(child_index_node_ids)) child_index_nodes = session.execute(child_chunk_stmt).scalars().all() @@ -470,6 +501,7 @@ class RetrievalService: index_node_segments = session.execute(document_segment_stmt).scalars().all() # type: ignore for index_node_segment in index_node_segments: doc_segment_map[index_node_segment.id] = [index_node_segment.index_node_id] + if segment_ids: document_segment_stmt = select(DocumentSegment).where( DocumentSegment.enabled == True, @@ -481,6 +513,40 @@ class RetrievalService: if index_node_segments: segments.extend(index_node_segments) + # Handle summary documents: query segments by original_chunk_id + if summary_segment_ids: + summary_segment_ids_list = list(summary_segment_ids) + summary_segment_stmt = select(DocumentSegment).where( + DocumentSegment.enabled == True, + DocumentSegment.status == "completed", + DocumentSegment.id.in_(summary_segment_ids_list), + ) + summary_segments = session.execute(summary_segment_stmt).scalars().all() # type: ignore + segments.extend(summary_segments) + # Add summary segment IDs to segment_ids for summary query + for seg in summary_segments: + if seg.id not in segment_ids: + segment_ids.append(seg.id) + + # Batch query summaries for segments retrieved via summary (only enabled summaries) + if summary_segment_ids: + summaries = ( + session.query(DocumentSegmentSummary) + .filter( + DocumentSegmentSummary.chunk_id.in_(list(summary_segment_ids)), + DocumentSegmentSummary.status == "completed", + DocumentSegmentSummary.enabled == True, # Only retrieve enabled summaries + ) + .all() + ) + for summary in summaries: + if summary.summary_content: + segment_summary_map[summary.chunk_id] = summary.summary_content + + include_segment_ids = set() + segment_child_map: dict[str, dict[str, Any]] = {} + records: list[dict[str, Any]] = [] + for segment in segments: child_chunks: list[ChildChunk] = child_chunk_map.get(segment.id, []) attachment_infos: list[dict[str, Any]] = attachment_map.get(segment.id, []) @@ -489,30 +555,44 @@ class RetrievalService: if ds_dataset_document and ds_dataset_document.doc_form == IndexStructureType.PARENT_CHILD_INDEX: if segment.id not in include_segment_ids: include_segment_ids.add(segment.id) + # Check if this segment was retrieved via summary + # Use summary score as base score if available, otherwise 0.0 + max_score = summary_score_map.get(segment.id, 0.0) + if child_chunks or attachment_infos: child_chunk_details = [] - max_score = 0.0 for child_chunk in child_chunks: - document = doc_to_document_map[child_chunk.index_node_id] + child_document: Document | None = doc_to_document_map.get(child_chunk.index_node_id) + if child_document: + child_score = child_document.metadata.get("score", 0.0) + else: + child_score = 0.0 child_chunk_detail = { "id": child_chunk.id, "content": child_chunk.content, "position": child_chunk.position, - "score": document.metadata.get("score", 0.0) if document else 0.0, + "score": child_score, } child_chunk_details.append(child_chunk_detail) - max_score = max(max_score, document.metadata.get("score", 0.0) if document else 0.0) + max_score = max(max_score, child_score) for attachment_info in attachment_infos: - file_document = doc_to_document_map[attachment_info["id"]] - max_score = max( - max_score, file_document.metadata.get("score", 0.0) if file_document else 0.0 - ) + file_document = doc_to_document_map.get(attachment_info["id"]) + if file_document: + max_score = max(max_score, file_document.metadata.get("score", 0.0)) map_detail = { "max_score": max_score, "child_chunks": child_chunk_details, } segment_child_map[segment.id] = map_detail + else: + # No child chunks or attachments, use summary score if available + summary_score = summary_score_map.get(segment.id) + if summary_score is not None: + segment_child_map[segment.id] = { + "max_score": summary_score, + "child_chunks": [], + } record: dict[str, Any] = { "segment": segment, } @@ -520,14 +600,23 @@ class RetrievalService: else: if segment.id not in include_segment_ids: include_segment_ids.add(segment.id) - max_score = 0.0 - segment_document = doc_to_document_map.get(segment.index_node_id) - if segment_document: - max_score = max(max_score, segment_document.metadata.get("score", 0.0)) + + # Check if this segment was retrieved via summary + # Use summary score if available (summary retrieval takes priority) + max_score = summary_score_map.get(segment.id, 0.0) + + # If not retrieved via summary, use original segment's score + if segment.id not in summary_score_map: + segment_document = doc_to_document_map.get(segment.index_node_id) + if segment_document: + max_score = max(max_score, segment_document.metadata.get("score", 0.0)) + + # Also consider attachment scores for attachment_info in attachment_infos: file_doc = doc_to_document_map.get(attachment_info["id"]) if file_doc: max_score = max(max_score, file_doc.metadata.get("score", 0.0)) + record = { "segment": segment, "score": max_score, @@ -576,9 +665,16 @@ class RetrievalService: else None ) + # Extract summary if this segment was retrieved via summary + summary_content = segment_summary_map.get(segment.id) + # Create RetrievalSegments object retrieval_segment = RetrievalSegments( - segment=segment, child_chunks=child_chunks_list, score=score, files=files + segment=segment, + child_chunks=child_chunks_list, + score=score, + files=files, + summary=summary_content, ) result.append(retrieval_segment) diff --git a/api/core/rag/embedding/retrieval.py b/api/core/rag/embedding/retrieval.py index b54a37b49e..f6834ab87b 100644 --- a/api/core/rag/embedding/retrieval.py +++ b/api/core/rag/embedding/retrieval.py @@ -20,3 +20,4 @@ class RetrievalSegments(BaseModel): child_chunks: list[RetrievalChildChunk] | None = None score: float | None = None files: list[dict[str, str | int]] | None = None + summary: str | None = None # Summary content if retrieved via summary index diff --git a/api/core/rag/entities/citation_metadata.py b/api/core/rag/entities/citation_metadata.py index 9f66cd9a03..aec5c353f8 100644 --- a/api/core/rag/entities/citation_metadata.py +++ b/api/core/rag/entities/citation_metadata.py @@ -22,3 +22,4 @@ class RetrievalSourceMetadata(BaseModel): doc_metadata: dict[str, Any] | None = None title: str | None = None files: list[dict[str, Any]] | None = None + summary: str | None = None diff --git a/api/core/rag/extractor/word_extractor.py b/api/core/rag/extractor/word_extractor.py index 511f5a698d..1ddbfc5864 100644 --- a/api/core/rag/extractor/word_extractor.py +++ b/api/core/rag/extractor/word_extractor.py @@ -1,4 +1,7 @@ -"""Abstract interface for document loader implementations.""" +"""Word (.docx) document extractor used for RAG ingestion. + +Supports local file paths and remote URLs (downloaded via `core.helper.ssrf_proxy`). +""" import logging import mimetypes @@ -8,7 +11,6 @@ import tempfile import uuid from urllib.parse import urlparse -import httpx from docx import Document as DocxDocument from docx.oxml.ns import qn from docx.text.run import Run @@ -44,7 +46,7 @@ class WordExtractor(BaseExtractor): # If the file is a web path, download it to a temporary file, and use that if not os.path.isfile(self.file_path) and self._is_valid_url(self.file_path): - response = httpx.get(self.file_path, timeout=None) + response = ssrf_proxy.get(self.file_path) if response.status_code != 200: response.close() @@ -55,6 +57,7 @@ class WordExtractor(BaseExtractor): self.temp_file = tempfile.NamedTemporaryFile() # noqa SIM115 try: self.temp_file.write(response.content) + self.temp_file.flush() finally: response.close() self.file_path = self.temp_file.name diff --git a/api/core/rag/index_processor/index_processor_base.py b/api/core/rag/index_processor/index_processor_base.py index e36b54eedd..151a3de7d9 100644 --- a/api/core/rag/index_processor/index_processor_base.py +++ b/api/core/rag/index_processor/index_processor_base.py @@ -13,6 +13,7 @@ from urllib.parse import unquote, urlparse import httpx from configs import dify_config +from core.entities.knowledge_entities import PreviewDetail from core.helper import ssrf_proxy from core.rag.extractor.entity.extract_setting import ExtractSetting from core.rag.index_processor.constant.doc_type import DocType @@ -45,6 +46,17 @@ class BaseIndexProcessor(ABC): def transform(self, documents: list[Document], current_user: Account | None = None, **kwargs) -> list[Document]: raise NotImplementedError + @abstractmethod + def generate_summary_preview( + self, tenant_id: str, preview_texts: list[PreviewDetail], summary_index_setting: dict + ) -> list[PreviewDetail]: + """ + For each segment in preview_texts, generate a summary using LLM and attach it to the segment. + The summary can be stored in a new attribute, e.g., summary. + This method should be implemented by subclasses. + """ + raise NotImplementedError + @abstractmethod def load( self, diff --git a/api/core/rag/index_processor/processor/paragraph_index_processor.py b/api/core/rag/index_processor/processor/paragraph_index_processor.py index cf68cff7dc..ab91e29145 100644 --- a/api/core/rag/index_processor/processor/paragraph_index_processor.py +++ b/api/core/rag/index_processor/processor/paragraph_index_processor.py @@ -1,9 +1,27 @@ """Paragraph index processor.""" +import logging +import re import uuid from collections.abc import Mapping -from typing import Any +from typing import Any, cast +logger = logging.getLogger(__name__) + +from core.entities.knowledge_entities import PreviewDetail +from core.file import File, FileTransferMethod, FileType, file_manager +from core.llm_generator.prompts import DEFAULT_GENERATOR_SUMMARY_PROMPT +from core.model_manager import ModelInstance +from core.model_runtime.entities.llm_entities import LLMResult, LLMUsage +from core.model_runtime.entities.message_entities import ( + ImagePromptMessageContent, + PromptMessage, + PromptMessageContentUnionTypes, + TextPromptMessageContent, + UserPromptMessage, +) +from core.model_runtime.entities.model_entities import ModelFeature, ModelType +from core.provider_manager import ProviderManager from core.rag.cleaner.clean_processor import CleanProcessor from core.rag.datasource.keyword.keyword_factory import Keyword from core.rag.datasource.retrieval_service import RetrievalService @@ -17,12 +35,17 @@ from core.rag.index_processor.index_processor_base import BaseIndexProcessor from core.rag.models.document import AttachmentDocument, Document, MultimodalGeneralStructureChunk from core.rag.retrieval.retrieval_methods import RetrievalMethod from core.tools.utils.text_processing_utils import remove_leading_symbols +from core.workflow.nodes.llm import llm_utils +from extensions.ext_database import db +from factories.file_factory import build_from_mapping from libs import helper +from models import UploadFile from models.account import Account -from models.dataset import Dataset, DatasetProcessRule +from models.dataset import Dataset, DatasetProcessRule, DocumentSegment, SegmentAttachmentBinding from models.dataset import Document as DatasetDocument from services.account_service import AccountService from services.entities.knowledge_entities.knowledge_entities import Rule +from services.summary_index_service import SummaryIndexService class ParagraphIndexProcessor(BaseIndexProcessor): @@ -108,6 +131,29 @@ class ParagraphIndexProcessor(BaseIndexProcessor): keyword.add_texts(documents) def clean(self, dataset: Dataset, node_ids: list[str] | None, with_keywords: bool = True, **kwargs): + # Note: Summary indexes are now disabled (not deleted) when segments are disabled. + # This method is called for actual deletion scenarios (e.g., when segment is deleted). + # For disable operations, disable_summaries_for_segments is called directly in the task. + # Only delete summaries if explicitly requested (e.g., when segment is actually deleted) + delete_summaries = kwargs.get("delete_summaries", False) + if delete_summaries: + if node_ids: + # Find segments by index_node_id + segments = ( + db.session.query(DocumentSegment) + .filter( + DocumentSegment.dataset_id == dataset.id, + DocumentSegment.index_node_id.in_(node_ids), + ) + .all() + ) + segment_ids = [segment.id for segment in segments] + if segment_ids: + SummaryIndexService.delete_summaries_for_segments(dataset, segment_ids) + else: + # Delete all summaries for the dataset + SummaryIndexService.delete_summaries_for_segments(dataset, None) + if dataset.indexing_technique == "high_quality": vector = Vector(dataset) if node_ids: @@ -227,3 +273,322 @@ class ParagraphIndexProcessor(BaseIndexProcessor): } else: raise ValueError("Chunks is not a list") + + def generate_summary_preview( + self, tenant_id: str, preview_texts: list[PreviewDetail], summary_index_setting: dict + ) -> list[PreviewDetail]: + """ + For each segment, concurrently call generate_summary to generate a summary + and write it to the summary attribute of PreviewDetail. + In preview mode (indexing-estimate), if any summary generation fails, the method will raise an exception. + """ + import concurrent.futures + + from flask import current_app + + # Capture Flask app context for worker threads + flask_app = None + try: + flask_app = current_app._get_current_object() # type: ignore + except RuntimeError: + logger.warning("No Flask application context available, summary generation may fail") + + def process(preview: PreviewDetail) -> None: + """Generate summary for a single preview item.""" + if flask_app: + # Ensure Flask app context in worker thread + with flask_app.app_context(): + summary, _ = self.generate_summary(tenant_id, preview.content, summary_index_setting) + preview.summary = summary + else: + # Fallback: try without app context (may fail) + summary, _ = self.generate_summary(tenant_id, preview.content, summary_index_setting) + preview.summary = summary + + # Generate summaries concurrently using ThreadPoolExecutor + # Set a reasonable timeout to prevent hanging (60 seconds per chunk, max 5 minutes total) + timeout_seconds = min(300, 60 * len(preview_texts)) + errors: list[Exception] = [] + + with concurrent.futures.ThreadPoolExecutor(max_workers=min(10, len(preview_texts))) as executor: + futures = [executor.submit(process, preview) for preview in preview_texts] + # Wait for all tasks to complete with timeout + done, not_done = concurrent.futures.wait(futures, timeout=timeout_seconds) + + # Cancel tasks that didn't complete in time + if not_done: + timeout_error_msg = ( + f"Summary generation timeout: {len(not_done)} chunks did not complete within {timeout_seconds}s" + ) + logger.warning("%s. Cancelling remaining tasks...", timeout_error_msg) + # In preview mode, timeout is also an error + errors.append(TimeoutError(timeout_error_msg)) + for future in not_done: + future.cancel() + # Wait a bit for cancellation to take effect + concurrent.futures.wait(not_done, timeout=5) + + # Collect exceptions from completed futures + for future in done: + try: + future.result() # This will raise any exception that occurred + except Exception as e: + logger.exception("Error in summary generation future") + errors.append(e) + + # In preview mode (indexing-estimate), if there are any errors, fail the request + if errors: + error_messages = [str(e) for e in errors] + error_summary = ( + f"Failed to generate summaries for {len(errors)} chunk(s). " + f"Errors: {'; '.join(error_messages[:3])}" # Show first 3 errors + ) + if len(errors) > 3: + error_summary += f" (and {len(errors) - 3} more)" + logger.error("Summary generation failed in preview mode: %s", error_summary) + raise ValueError(error_summary) + + return preview_texts + + @staticmethod + def generate_summary( + tenant_id: str, + text: str, + summary_index_setting: dict | None = None, + segment_id: str | None = None, + ) -> tuple[str, LLMUsage]: + """ + Generate summary for the given text using ModelInstance.invoke_llm and the default or custom summary prompt, + and supports vision models by including images from the segment attachments or text content. + + Args: + tenant_id: Tenant ID + text: Text content to summarize + summary_index_setting: Summary index configuration + segment_id: Optional segment ID to fetch attachments from SegmentAttachmentBinding table + + Returns: + Tuple of (summary_content, llm_usage) where llm_usage is LLMUsage object + """ + if not summary_index_setting or not summary_index_setting.get("enable"): + raise ValueError("summary_index_setting is required and must be enabled to generate summary.") + + model_name = summary_index_setting.get("model_name") + model_provider_name = summary_index_setting.get("model_provider_name") + summary_prompt = summary_index_setting.get("summary_prompt") + + if not model_name or not model_provider_name: + raise ValueError("model_name and model_provider_name are required in summary_index_setting") + + # Import default summary prompt + if not summary_prompt: + summary_prompt = DEFAULT_GENERATOR_SUMMARY_PROMPT + + provider_manager = ProviderManager() + provider_model_bundle = provider_manager.get_provider_model_bundle( + tenant_id, model_provider_name, ModelType.LLM + ) + model_instance = ModelInstance(provider_model_bundle, model_name) + + # Get model schema to check if vision is supported + model_schema = model_instance.model_type_instance.get_model_schema(model_name, model_instance.credentials) + supports_vision = model_schema and model_schema.features and ModelFeature.VISION in model_schema.features + + # Extract images if model supports vision + image_files = [] + if supports_vision: + # First, try to get images from SegmentAttachmentBinding (preferred method) + if segment_id: + image_files = ParagraphIndexProcessor._extract_images_from_segment_attachments(tenant_id, segment_id) + + # If no images from attachments, fall back to extracting from text + if not image_files: + image_files = ParagraphIndexProcessor._extract_images_from_text(tenant_id, text) + + # Build prompt messages + prompt_messages = [] + + if image_files: + # If we have images, create a UserPromptMessage with both text and images + prompt_message_contents: list[PromptMessageContentUnionTypes] = [] + + # Add images first + for file in image_files: + try: + file_content = file_manager.to_prompt_message_content( + file, image_detail_config=ImagePromptMessageContent.DETAIL.LOW + ) + prompt_message_contents.append(file_content) + except Exception as e: + logger.warning("Failed to convert image file to prompt message content: %s", str(e)) + continue + + # Add text content + if prompt_message_contents: # Only add text if we successfully added images + prompt_message_contents.append(TextPromptMessageContent(data=f"{summary_prompt}\n{text}")) + prompt_messages.append(UserPromptMessage(content=prompt_message_contents)) + else: + # If image conversion failed, fall back to text-only + prompt = f"{summary_prompt}\n{text}" + prompt_messages.append(UserPromptMessage(content=prompt)) + else: + # No images, use simple text prompt + prompt = f"{summary_prompt}\n{text}" + prompt_messages.append(UserPromptMessage(content=prompt)) + + result = model_instance.invoke_llm( + prompt_messages=cast(list[PromptMessage], prompt_messages), model_parameters={}, stream=False + ) + + # Type assertion: when stream=False, invoke_llm returns LLMResult, not Generator + if not isinstance(result, LLMResult): + raise ValueError("Expected LLMResult when stream=False") + + summary_content = getattr(result.message, "content", "") + usage = result.usage + + # Deduct quota for summary generation (same as workflow nodes) + try: + llm_utils.deduct_llm_quota(tenant_id=tenant_id, model_instance=model_instance, usage=usage) + except Exception as e: + # Log but don't fail summary generation if quota deduction fails + logger.warning("Failed to deduct quota for summary generation: %s", str(e)) + + return summary_content, usage + + @staticmethod + def _extract_images_from_text(tenant_id: str, text: str) -> list[File]: + """ + Extract images from markdown text and convert them to File objects. + + Args: + tenant_id: Tenant ID + text: Text content that may contain markdown image links + + Returns: + List of File objects representing images found in the text + """ + # Extract markdown images using regex pattern + pattern = r"!\[.*?\]\((.*?)\)" + images = re.findall(pattern, text) + + if not images: + return [] + + upload_file_id_list = [] + + for image in images: + # For data before v0.10.0 + pattern = r"/files/([a-f0-9\-]+)/image-preview(?:\?.*?)?" + match = re.search(pattern, image) + if match: + upload_file_id = match.group(1) + upload_file_id_list.append(upload_file_id) + continue + + # For data after v0.10.0 + pattern = r"/files/([a-f0-9\-]+)/file-preview(?:\?.*?)?" + match = re.search(pattern, image) + if match: + upload_file_id = match.group(1) + upload_file_id_list.append(upload_file_id) + continue + + # For tools directory - direct file formats (e.g., .png, .jpg, etc.) + pattern = r"/files/tools/([a-f0-9\-]+)\.([a-zA-Z0-9]+)(?:\?[^\s\)\"\']*)?" + match = re.search(pattern, image) + if match: + # Tool files are handled differently, skip for now + continue + + if not upload_file_id_list: + return [] + + # Get unique IDs for database query + unique_upload_file_ids = list(set(upload_file_id_list)) + upload_files = ( + db.session.query(UploadFile) + .where(UploadFile.id.in_(unique_upload_file_ids), UploadFile.tenant_id == tenant_id) + .all() + ) + + # Create File objects from UploadFile records + file_objects = [] + for upload_file in upload_files: + # Only process image files + if not upload_file.mime_type or "image" not in upload_file.mime_type: + continue + + mapping = { + "upload_file_id": upload_file.id, + "transfer_method": FileTransferMethod.LOCAL_FILE.value, + "type": FileType.IMAGE.value, + } + + try: + file_obj = build_from_mapping( + mapping=mapping, + tenant_id=tenant_id, + ) + file_objects.append(file_obj) + except Exception as e: + logger.warning("Failed to create File object from UploadFile %s: %s", upload_file.id, str(e)) + continue + + return file_objects + + @staticmethod + def _extract_images_from_segment_attachments(tenant_id: str, segment_id: str) -> list[File]: + """ + Extract images from SegmentAttachmentBinding table (preferred method). + This matches how DatasetRetrieval gets segment attachments. + + Args: + tenant_id: Tenant ID + segment_id: Segment ID to fetch attachments for + + Returns: + List of File objects representing images found in segment attachments + """ + from sqlalchemy import select + + # Query attachments from SegmentAttachmentBinding table + attachments_with_bindings = db.session.execute( + select(SegmentAttachmentBinding, UploadFile) + .join(UploadFile, UploadFile.id == SegmentAttachmentBinding.attachment_id) + .where( + SegmentAttachmentBinding.segment_id == segment_id, + SegmentAttachmentBinding.tenant_id == tenant_id, + ) + ).all() + + if not attachments_with_bindings: + return [] + + file_objects = [] + for _, upload_file in attachments_with_bindings: + # Only process image files + if not upload_file.mime_type or "image" not in upload_file.mime_type: + continue + + try: + # Create File object directly (similar to DatasetRetrieval) + file_obj = File( + id=upload_file.id, + filename=upload_file.name, + extension="." + upload_file.extension, + mime_type=upload_file.mime_type, + tenant_id=tenant_id, + type=FileType.IMAGE, + transfer_method=FileTransferMethod.LOCAL_FILE, + remote_url=upload_file.source_url, + related_id=upload_file.id, + size=upload_file.size, + storage_key=upload_file.key, + ) + file_objects.append(file_obj) + except Exception as e: + logger.warning("Failed to create File object from UploadFile %s: %s", upload_file.id, str(e)) + continue + + return file_objects diff --git a/api/core/rag/index_processor/processor/parent_child_index_processor.py b/api/core/rag/index_processor/processor/parent_child_index_processor.py index 0366f3259f..961df2e50c 100644 --- a/api/core/rag/index_processor/processor/parent_child_index_processor.py +++ b/api/core/rag/index_processor/processor/parent_child_index_processor.py @@ -1,11 +1,14 @@ """Paragraph index processor.""" import json +import logging import uuid from collections.abc import Mapping from typing import Any from configs import dify_config +from core.db.session_factory import session_factory +from core.entities.knowledge_entities import PreviewDetail from core.model_manager import ModelInstance from core.rag.cleaner.clean_processor import CleanProcessor from core.rag.datasource.retrieval_service import RetrievalService @@ -25,6 +28,9 @@ from models.dataset import ChildChunk, Dataset, DatasetProcessRule, DocumentSegm from models.dataset import Document as DatasetDocument from services.account_service import AccountService from services.entities.knowledge_entities.knowledge_entities import ParentMode, Rule +from services.summary_index_service import SummaryIndexService + +logger = logging.getLogger(__name__) class ParentChildIndexProcessor(BaseIndexProcessor): @@ -135,6 +141,30 @@ class ParentChildIndexProcessor(BaseIndexProcessor): def clean(self, dataset: Dataset, node_ids: list[str] | None, with_keywords: bool = True, **kwargs): # node_ids is segment's node_ids + # Note: Summary indexes are now disabled (not deleted) when segments are disabled. + # This method is called for actual deletion scenarios (e.g., when segment is deleted). + # For disable operations, disable_summaries_for_segments is called directly in the task. + # Only delete summaries if explicitly requested (e.g., when segment is actually deleted) + delete_summaries = kwargs.get("delete_summaries", False) + if delete_summaries: + if node_ids: + # Find segments by index_node_id + with session_factory.create_session() as session: + segments = ( + session.query(DocumentSegment) + .filter( + DocumentSegment.dataset_id == dataset.id, + DocumentSegment.index_node_id.in_(node_ids), + ) + .all() + ) + segment_ids = [segment.id for segment in segments] + if segment_ids: + SummaryIndexService.delete_summaries_for_segments(dataset, segment_ids) + else: + # Delete all summaries for the dataset + SummaryIndexService.delete_summaries_for_segments(dataset, None) + if dataset.indexing_technique == "high_quality": delete_child_chunks = kwargs.get("delete_child_chunks") or False precomputed_child_node_ids = kwargs.get("precomputed_child_node_ids") @@ -326,3 +356,91 @@ class ParentChildIndexProcessor(BaseIndexProcessor): "preview": preview, "total_segments": len(parent_childs.parent_child_chunks), } + + def generate_summary_preview( + self, tenant_id: str, preview_texts: list[PreviewDetail], summary_index_setting: dict + ) -> list[PreviewDetail]: + """ + For each parent chunk in preview_texts, concurrently call generate_summary to generate a summary + and write it to the summary attribute of PreviewDetail. + In preview mode (indexing-estimate), if any summary generation fails, the method will raise an exception. + + Note: For parent-child structure, we only generate summaries for parent chunks. + """ + import concurrent.futures + + from flask import current_app + + # Capture Flask app context for worker threads + flask_app = None + try: + flask_app = current_app._get_current_object() # type: ignore + except RuntimeError: + logger.warning("No Flask application context available, summary generation may fail") + + def process(preview: PreviewDetail) -> None: + """Generate summary for a single preview item (parent chunk).""" + from core.rag.index_processor.processor.paragraph_index_processor import ParagraphIndexProcessor + + if flask_app: + # Ensure Flask app context in worker thread + with flask_app.app_context(): + summary, _ = ParagraphIndexProcessor.generate_summary( + tenant_id=tenant_id, + text=preview.content, + summary_index_setting=summary_index_setting, + ) + preview.summary = summary + else: + # Fallback: try without app context (may fail) + summary, _ = ParagraphIndexProcessor.generate_summary( + tenant_id=tenant_id, + text=preview.content, + summary_index_setting=summary_index_setting, + ) + preview.summary = summary + + # Generate summaries concurrently using ThreadPoolExecutor + # Set a reasonable timeout to prevent hanging (60 seconds per chunk, max 5 minutes total) + timeout_seconds = min(300, 60 * len(preview_texts)) + errors: list[Exception] = [] + + with concurrent.futures.ThreadPoolExecutor(max_workers=min(10, len(preview_texts))) as executor: + futures = [executor.submit(process, preview) for preview in preview_texts] + # Wait for all tasks to complete with timeout + done, not_done = concurrent.futures.wait(futures, timeout=timeout_seconds) + + # Cancel tasks that didn't complete in time + if not_done: + timeout_error_msg = ( + f"Summary generation timeout: {len(not_done)} chunks did not complete within {timeout_seconds}s" + ) + logger.warning("%s. Cancelling remaining tasks...", timeout_error_msg) + # In preview mode, timeout is also an error + errors.append(TimeoutError(timeout_error_msg)) + for future in not_done: + future.cancel() + # Wait a bit for cancellation to take effect + concurrent.futures.wait(not_done, timeout=5) + + # Collect exceptions from completed futures + for future in done: + try: + future.result() # This will raise any exception that occurred + except Exception as e: + logger.exception("Error in summary generation future") + errors.append(e) + + # In preview mode (indexing-estimate), if there are any errors, fail the request + if errors: + error_messages = [str(e) for e in errors] + error_summary = ( + f"Failed to generate summaries for {len(errors)} chunk(s). " + f"Errors: {'; '.join(error_messages[:3])}" # Show first 3 errors + ) + if len(errors) > 3: + error_summary += f" (and {len(errors) - 3} more)" + logger.error("Summary generation failed in preview mode: %s", error_summary) + raise ValueError(error_summary) + + return preview_texts diff --git a/api/core/rag/index_processor/processor/qa_index_processor.py b/api/core/rag/index_processor/processor/qa_index_processor.py index 1183d5fbd7..272d2ed351 100644 --- a/api/core/rag/index_processor/processor/qa_index_processor.py +++ b/api/core/rag/index_processor/processor/qa_index_processor.py @@ -11,6 +11,8 @@ import pandas as pd from flask import Flask, current_app from werkzeug.datastructures import FileStorage +from core.db.session_factory import session_factory +from core.entities.knowledge_entities import PreviewDetail from core.llm_generator.llm_generator import LLMGenerator from core.rag.cleaner.clean_processor import CleanProcessor from core.rag.datasource.retrieval_service import RetrievalService @@ -25,9 +27,10 @@ from core.rag.retrieval.retrieval_methods import RetrievalMethod from core.tools.utils.text_processing_utils import remove_leading_symbols from libs import helper from models.account import Account -from models.dataset import Dataset +from models.dataset import Dataset, DocumentSegment from models.dataset import Document as DatasetDocument from services.entities.knowledge_entities.knowledge_entities import Rule +from services.summary_index_service import SummaryIndexService logger = logging.getLogger(__name__) @@ -144,6 +147,31 @@ class QAIndexProcessor(BaseIndexProcessor): vector.create_multimodal(multimodal_documents) def clean(self, dataset: Dataset, node_ids: list[str] | None, with_keywords: bool = True, **kwargs): + # Note: Summary indexes are now disabled (not deleted) when segments are disabled. + # This method is called for actual deletion scenarios (e.g., when segment is deleted). + # For disable operations, disable_summaries_for_segments is called directly in the task. + # Note: qa_model doesn't generate summaries, but we clean them for completeness + # Only delete summaries if explicitly requested (e.g., when segment is actually deleted) + delete_summaries = kwargs.get("delete_summaries", False) + if delete_summaries: + if node_ids: + # Find segments by index_node_id + with session_factory.create_session() as session: + segments = ( + session.query(DocumentSegment) + .filter( + DocumentSegment.dataset_id == dataset.id, + DocumentSegment.index_node_id.in_(node_ids), + ) + .all() + ) + segment_ids = [segment.id for segment in segments] + if segment_ids: + SummaryIndexService.delete_summaries_for_segments(dataset, segment_ids) + else: + # Delete all summaries for the dataset + SummaryIndexService.delete_summaries_for_segments(dataset, None) + vector = Vector(dataset) if node_ids: vector.delete_by_ids(node_ids) @@ -212,6 +240,17 @@ class QAIndexProcessor(BaseIndexProcessor): "total_segments": len(qa_chunks.qa_chunks), } + def generate_summary_preview( + self, tenant_id: str, preview_texts: list[PreviewDetail], summary_index_setting: dict + ) -> list[PreviewDetail]: + """ + QA model doesn't generate summaries, so this method returns preview_texts unchanged. + + Note: QA model uses question-answer pairs, which don't require summary generation. + """ + # QA model doesn't generate summaries, return as-is + return preview_texts + def _format_qa_document(self, flask_app: Flask, tenant_id: str, document_node, all_qa_documents, document_language): format_documents = [] if document_node.page_content is None or not document_node.page_content.strip(): diff --git a/api/core/rag/retrieval/dataset_retrieval.py b/api/core/rag/retrieval/dataset_retrieval.py index f8f85d141a..541c241ae5 100644 --- a/api/core/rag/retrieval/dataset_retrieval.py +++ b/api/core/rag/retrieval/dataset_retrieval.py @@ -236,20 +236,24 @@ class DatasetRetrieval: if records: for record in records: segment = record.segment + # Build content: if summary exists, add it before the segment content if segment.answer: - document_context_list.append( - DocumentContext( - content=f"question:{segment.get_sign_content()} answer:{segment.answer}", - score=record.score, - ) - ) + segment_content = f"question:{segment.get_sign_content()} answer:{segment.answer}" else: - document_context_list.append( - DocumentContext( - content=segment.get_sign_content(), - score=record.score, - ) + segment_content = segment.get_sign_content() + + # If summary exists, prepend it to the content + if record.summary: + final_content = f"{record.summary}\n{segment_content}" + else: + final_content = segment_content + + document_context_list.append( + DocumentContext( + content=final_content, + score=record.score, ) + ) if vision_enabled: attachments_with_bindings = db.session.execute( select(SegmentAttachmentBinding, UploadFile) @@ -316,6 +320,9 @@ class DatasetRetrieval: source.content = f"question:{segment.content} \nanswer:{segment.answer}" else: source.content = segment.content + # Add summary if this segment was retrieved via summary + if hasattr(record, "summary") and record.summary: + source.summary = record.summary retrieval_resource_list.append(source) if hit_callback and retrieval_resource_list: retrieval_resource_list = sorted(retrieval_resource_list, key=lambda x: x.score or 0.0, reverse=True) diff --git a/api/core/sandbox/inspector/archive_source.py b/api/core/sandbox/inspector/archive_source.py index 8876d878cf..a9b0b360b4 100644 --- a/api/core/sandbox/inspector/archive_source.py +++ b/api/core/sandbox/inspector/archive_source.py @@ -7,9 +7,9 @@ from uuid import UUID, uuid4 from core.sandbox.entities.files import SandboxFileDownloadTicket, SandboxFileNode from core.sandbox.inspector.base import SandboxFileSource -from core.sandbox.security.archive_signer import SandboxArchivePath, SandboxArchiveSigner -from core.sandbox.security.sandbox_file_signer import SandboxFileDownloadPath from core.sandbox.storage import sandbox_file_storage +from core.sandbox.storage.archive_storage import SandboxArchivePath +from core.sandbox.storage.sandbox_file_storage import SandboxFileDownloadPath from core.virtual_environment.__base.exec import CommandExecutionError from core.virtual_environment.__base.helpers import execute from extensions.ext_storage import storage @@ -68,15 +68,14 @@ print(json.dumps(entries)) def _get_archive_download_url(self) -> str: """Get a pre-signed download URL for the sandbox archive.""" + from extensions.storage.file_presign_storage import FilePresignStorage + archive_path = SandboxArchivePath(tenant_id=UUID(self._tenant_id), sandbox_id=UUID(self._sandbox_id)) storage_key = archive_path.get_storage_key() if not storage.exists(storage_key): raise ValueError("Sandbox archive not found") - return SandboxArchiveSigner.build_signed_url( - archive_path=archive_path, - expires_in=self._EXPORT_EXPIRES_IN_SECONDS, - action=SandboxArchiveSigner.OPERATION_DOWNLOAD, - ) + presign_storage = FilePresignStorage(storage.storage_runner) + return presign_storage.get_download_url(storage_key, self._EXPORT_EXPIRES_IN_SECONDS) def _create_zip_sandbox(self) -> ZipSandbox: """Create a ZipSandbox instance for archive operations.""" diff --git a/api/core/sandbox/inspector/runtime_source.py b/api/core/sandbox/inspector/runtime_source.py index 7481169212..052092e00d 100644 --- a/api/core/sandbox/inspector/runtime_source.py +++ b/api/core/sandbox/inspector/runtime_source.py @@ -7,8 +7,8 @@ from uuid import UUID, uuid4 from core.sandbox.entities.files import SandboxFileDownloadTicket, SandboxFileNode from core.sandbox.inspector.base import SandboxFileSource -from core.sandbox.security.sandbox_file_signer import SandboxFileDownloadPath from core.sandbox.storage import sandbox_file_storage +from core.sandbox.storage.sandbox_file_storage import SandboxFileDownloadPath from core.virtual_environment.__base.exec import CommandExecutionError from core.virtual_environment.__base.helpers import execute from core.virtual_environment.__base.virtual_environment import VirtualEnvironment diff --git a/api/core/sandbox/security/__init__.py b/api/core/sandbox/security/__init__.py deleted file mode 100644 index c8c4ebefae..0000000000 --- a/api/core/sandbox/security/__init__.py +++ /dev/null @@ -1 +0,0 @@ -"""Sandbox security helpers.""" diff --git a/api/core/sandbox/security/archive_signer.py b/api/core/sandbox/security/archive_signer.py deleted file mode 100644 index 5fd48b73db..0000000000 --- a/api/core/sandbox/security/archive_signer.py +++ /dev/null @@ -1,152 +0,0 @@ -from __future__ import annotations - -import base64 -import hashlib -import hmac -import os -import time -import urllib.parse -from dataclasses import dataclass -from uuid import UUID - -from configs import dify_config -from libs import rsa - - -@dataclass(frozen=True) -class SandboxArchivePath: - tenant_id: UUID - sandbox_id: UUID - - def get_storage_key(self) -> str: - return f"sandbox/{self.tenant_id}/{self.sandbox_id}.tar.gz" - - def proxy_path(self) -> str: - return f"{self.tenant_id}/{self.sandbox_id}" - - -class SandboxArchiveSigner: - SIGNATURE_PREFIX = "sandbox-archive" - SIGNATURE_VERSION = "v1" - OPERATION_DOWNLOAD = "download" - OPERATION_UPLOAD = "upload" - - @classmethod - def create_download_signature(cls, archive_path: SandboxArchivePath, expires_at: int, nonce: str) -> str: - return cls._create_signature( - archive_path=archive_path, - operation=cls.OPERATION_DOWNLOAD, - expires_at=expires_at, - nonce=nonce, - ) - - @classmethod - def create_upload_signature(cls, archive_path: SandboxArchivePath, expires_at: int, nonce: str) -> str: - return cls._create_signature( - archive_path=archive_path, - operation=cls.OPERATION_UPLOAD, - expires_at=expires_at, - nonce=nonce, - ) - - @classmethod - def verify_download_signature( - cls, archive_path: SandboxArchivePath, expires_at: int, nonce: str, sign: str - ) -> bool: - return cls._verify_signature( - archive_path=archive_path, - operation=cls.OPERATION_DOWNLOAD, - expires_at=expires_at, - nonce=nonce, - sign=sign, - ) - - @classmethod - def verify_upload_signature(cls, archive_path: SandboxArchivePath, expires_at: int, nonce: str, sign: str) -> bool: - return cls._verify_signature( - archive_path=archive_path, - operation=cls.OPERATION_UPLOAD, - expires_at=expires_at, - nonce=nonce, - sign=sign, - ) - - @classmethod - def _verify_signature( - cls, - *, - archive_path: SandboxArchivePath, - operation: str, - expires_at: int, - nonce: str, - sign: str, - ) -> bool: - if expires_at <= 0: - return False - - expected_sign = cls._create_signature( - archive_path=archive_path, - operation=operation, - expires_at=expires_at, - nonce=nonce, - ) - if not hmac.compare_digest(sign, expected_sign): - return False - - current_time = int(time.time()) - if expires_at < current_time: - return False - - if expires_at - current_time > dify_config.FILES_ACCESS_TIMEOUT: - return False - - return True - - @classmethod - def build_signed_url( - cls, - *, - archive_path: SandboxArchivePath, - expires_in: int, - action: str, - ) -> str: - expires_in = min(expires_in, dify_config.FILES_ACCESS_TIMEOUT) - expires_at = int(time.time()) + max(expires_in, 1) - nonce = os.urandom(16).hex() - sign = cls._create_signature( - archive_path=archive_path, - operation=action, - expires_at=expires_at, - nonce=nonce, - ) - - base_url = dify_config.FILES_URL - url = f"{base_url}/files/sandbox-archives/{archive_path.proxy_path()}/{action}" - query = urllib.parse.urlencode({"expires_at": expires_at, "nonce": nonce, "sign": sign}) - return f"{url}?{query}" - - @classmethod - def _create_signature( - cls, - *, - archive_path: SandboxArchivePath, - operation: str, - expires_at: int, - nonce: str, - ) -> str: - key = cls._tenant_key(str(archive_path.tenant_id)) - message = ( - f"{cls.SIGNATURE_PREFIX}|{cls.SIGNATURE_VERSION}|{operation}|" - f"{archive_path.tenant_id}|{archive_path.sandbox_id}|{expires_at}|{nonce}" - ) - sign = hmac.new(key, message.encode(), hashlib.sha256).digest() - return base64.urlsafe_b64encode(sign).decode() - - @classmethod - def _tenant_key(cls, tenant_id: str) -> bytes: - try: - rsa_key, _ = rsa.get_decrypt_decoding(tenant_id) - except rsa.PrivkeyNotFoundError as exc: - raise ValueError(f"Tenant private key missing for tenant_id={tenant_id}") from exc - private_key = rsa_key.export_key() - return hashlib.sha256(private_key).digest() diff --git a/api/core/sandbox/security/sandbox_file_signer.py b/api/core/sandbox/security/sandbox_file_signer.py deleted file mode 100644 index dd59023ba9..0000000000 --- a/api/core/sandbox/security/sandbox_file_signer.py +++ /dev/null @@ -1,155 +0,0 @@ -from __future__ import annotations - -import base64 -import hashlib -import hmac -import os -import time -import urllib.parse -from dataclasses import dataclass -from uuid import UUID - -from configs import dify_config -from libs import rsa - - -@dataclass(frozen=True) -class SandboxFileDownloadPath: - tenant_id: UUID - sandbox_id: UUID - export_id: str - filename: str - - def get_storage_key(self) -> str: - return f"sandbox_file_downloads/{self.tenant_id}/{self.sandbox_id}/{self.export_id}/{self.filename}" - - def proxy_path(self) -> str: - encoded_parts = [ - urllib.parse.quote(str(self.tenant_id), safe=""), - urllib.parse.quote(str(self.sandbox_id), safe=""), - urllib.parse.quote(self.export_id, safe=""), - urllib.parse.quote(self.filename, safe=""), - ] - return "/".join(encoded_parts) - - -class SandboxFileSigner: - SIGNATURE_PREFIX = "sandbox-file-download" - SIGNATURE_VERSION = "v1" - OPERATION_DOWNLOAD = "download" - OPERATION_UPLOAD = "upload" - - @classmethod - def build_signed_url( - cls, - *, - export_path: SandboxFileDownloadPath, - expires_in: int, - action: str, - ) -> str: - expires_in = min(expires_in, dify_config.FILES_ACCESS_TIMEOUT) - expires_at = int(time.time()) + max(expires_in, 1) - nonce = os.urandom(16).hex() - sign = cls._create_signature( - export_path=export_path, - operation=action, - expires_at=expires_at, - nonce=nonce, - ) - - base_url = dify_config.FILES_URL - url = f"{base_url}/files/sandbox-file-downloads/{export_path.proxy_path()}/{action}" - query = urllib.parse.urlencode({"expires_at": expires_at, "nonce": nonce, "sign": sign}) - return f"{url}?{query}" - - @classmethod - def verify_download_signature( - cls, - *, - export_path: SandboxFileDownloadPath, - expires_at: int, - nonce: str, - sign: str, - ) -> bool: - return cls._verify_signature( - export_path=export_path, - operation=cls.OPERATION_DOWNLOAD, - expires_at=expires_at, - nonce=nonce, - sign=sign, - ) - - @classmethod - def verify_upload_signature( - cls, - *, - export_path: SandboxFileDownloadPath, - expires_at: int, - nonce: str, - sign: str, - ) -> bool: - return cls._verify_signature( - export_path=export_path, - operation=cls.OPERATION_UPLOAD, - expires_at=expires_at, - nonce=nonce, - sign=sign, - ) - - @classmethod - def _verify_signature( - cls, - *, - export_path: SandboxFileDownloadPath, - operation: str, - expires_at: int, - nonce: str, - sign: str, - ) -> bool: - if expires_at <= 0: - return False - - expected_sign = cls._create_signature( - export_path=export_path, - operation=operation, - expires_at=expires_at, - nonce=nonce, - ) - if not hmac.compare_digest(sign, expected_sign): - return False - - current_time = int(time.time()) - if expires_at < current_time: - return False - - if expires_at - current_time > dify_config.FILES_ACCESS_TIMEOUT: - return False - - return True - - @classmethod - def _create_signature( - cls, - *, - export_path: SandboxFileDownloadPath, - operation: str, - expires_at: int, - nonce: str, - ) -> str: - key = cls._tenant_key(str(export_path.tenant_id)) - message = ( - f"{cls.SIGNATURE_PREFIX}|{cls.SIGNATURE_VERSION}|{operation}|" - f"{export_path.tenant_id}|{export_path.sandbox_id}|{export_path.export_id}|{export_path.filename}|" - f"{expires_at}|{nonce}" - ) - digest = hmac.new(key, message.encode(), hashlib.sha256).digest() - return base64.urlsafe_b64encode(digest).decode() - - @classmethod - def _tenant_key(cls, tenant_id: str) -> bytes: - try: - rsa_key, _ = rsa.get_decrypt_decoding(tenant_id) - except rsa.PrivkeyNotFoundError as exc: - raise ValueError(f"Tenant private key missing for tenant_id={tenant_id}") from exc - private_key = rsa_key.export_key() - return hashlib.sha256(private_key).digest() diff --git a/api/core/sandbox/storage/__init__.py b/api/core/sandbox/storage/__init__.py index c7c405204e..b0eeb8940a 100644 --- a/api/core/sandbox/storage/__init__.py +++ b/api/core/sandbox/storage/__init__.py @@ -1,11 +1,13 @@ -from .archive_storage import ArchiveSandboxStorage +from .archive_storage import ArchiveSandboxStorage, SandboxArchivePath from .noop_storage import NoopSandboxStorage -from .sandbox_file_storage import SandboxFileStorage, sandbox_file_storage +from .sandbox_file_storage import SandboxFileDownloadPath, SandboxFileStorage, sandbox_file_storage from .sandbox_storage import SandboxStorage __all__ = [ "ArchiveSandboxStorage", "NoopSandboxStorage", + "SandboxArchivePath", + "SandboxFileDownloadPath", "SandboxFileStorage", "SandboxStorage", "sandbox_file_storage", diff --git a/api/core/sandbox/storage/archive_storage.py b/api/core/sandbox/storage/archive_storage.py index 3b44da5fc9..b67b1b8825 100644 --- a/api/core/sandbox/storage/archive_storage.py +++ b/api/core/sandbox/storage/archive_storage.py @@ -1,18 +1,31 @@ +"""Archive-based sandbox storage for persisting sandbox state. + +This module provides storage operations for sandbox workspace archives (tar.gz), +enabling state persistence across sandbox sessions. + +Storage key format: sandbox/{tenant_id}/{sandbox_id}.tar.gz + +All presign operations use the unified FilePresignStorage wrapper, which automatically +falls back to Dify's file proxy when the underlying storage doesn't support presigned URLs. +""" + +from __future__ import annotations + import logging +from dataclasses import dataclass from uuid import UUID -from core.sandbox.security.archive_signer import SandboxArchivePath, SandboxArchiveSigner from core.virtual_environment.__base.exec import PipelineExecutionError from core.virtual_environment.__base.helpers import pipeline from core.virtual_environment.__base.virtual_environment import VirtualEnvironment -from extensions.ext_storage import storage +from extensions.storage.base_storage import BaseStorage +from extensions.storage.file_presign_storage import FilePresignStorage from .sandbox_storage import SandboxStorage logger = logging.getLogger(__name__) WORKSPACE_DIR = "." - ARCHIVE_DOWNLOAD_TIMEOUT = 60 * 5 ARCHIVE_UPLOAD_TIMEOUT = 60 * 5 @@ -21,40 +34,67 @@ def build_tar_exclude_args(patterns: list[str]) -> list[str]: return [f"--exclude={p}" for p in patterns] +@dataclass(frozen=True) +class SandboxArchivePath: + """Path for sandbox workspace archives.""" + + tenant_id: UUID + sandbox_id: UUID + + def get_storage_key(self) -> str: + return f"sandbox/{self.tenant_id}/{self.sandbox_id}.tar.gz" + + class ArchiveSandboxStorage(SandboxStorage): + """Archive-based storage for sandbox workspace persistence. + + Uses tar.gz archives to save and restore sandbox workspace state. + Requires a presign-capable storage wrapper for generating download/upload URLs. + """ + _tenant_id: str _sandbox_id: str _exclude_patterns: list[str] + _storage: FilePresignStorage - def __init__(self, tenant_id: str, sandbox_id: str, exclude_patterns: list[str] | None = None): + def __init__( + self, + tenant_id: str, + sandbox_id: str, + storage: BaseStorage, + exclude_patterns: list[str] | None = None, + ): self._tenant_id = tenant_id self._sandbox_id = sandbox_id self._exclude_patterns = exclude_patterns or [] + # Wrap with FilePresignStorage for presign fallback support + self._storage = FilePresignStorage(storage) + + @property + def _archive_path(self) -> SandboxArchivePath: + return SandboxArchivePath(UUID(self._tenant_id), UUID(self._sandbox_id)) @property def _storage_key(self) -> str: - return SandboxArchivePath(UUID(self._tenant_id), UUID(self._sandbox_id)).get_storage_key() + return self._archive_path.get_storage_key() @property def _archive_name(self) -> str: return f"{self._sandbox_id}.tar.gz" @property - def _archive_path(self) -> str: + def _archive_tmp_path(self) -> str: return f"/tmp/{self._archive_name}" def mount(self, sandbox: VirtualEnvironment) -> bool: + """Load archive from storage into sandbox workspace.""" if not self.exists(): logger.debug("No archive found for sandbox %s, skipping mount", self._sandbox_id) return False - archive_path = SandboxArchivePath(UUID(self._tenant_id), UUID(self._sandbox_id)) - download_url = SandboxArchiveSigner.build_signed_url( - archive_path=archive_path, - expires_in=ARCHIVE_DOWNLOAD_TIMEOUT, - action=SandboxArchiveSigner.OPERATION_DOWNLOAD, - ) + download_url = self._storage.get_download_url(self._storage_key, ARCHIVE_DOWNLOAD_TIMEOUT) archive_name = self._archive_name + try: ( pipeline(sandbox) @@ -74,13 +114,10 @@ class ArchiveSandboxStorage(SandboxStorage): return True def unmount(self, sandbox: VirtualEnvironment) -> bool: - archive_path = SandboxArchivePath(UUID(self._tenant_id), UUID(self._sandbox_id)) - upload_url = SandboxArchiveSigner.build_signed_url( - archive_path=archive_path, - expires_in=ARCHIVE_UPLOAD_TIMEOUT, - action=SandboxArchiveSigner.OPERATION_UPLOAD, - ) - archive_path = self._archive_path + """Save sandbox workspace to storage as archive.""" + upload_url = self._storage.get_upload_url(self._storage_key, ARCHIVE_UPLOAD_TIMEOUT) + archive_path = self._archive_tmp_path + ( pipeline(sandbox) .add( @@ -105,11 +142,13 @@ class ArchiveSandboxStorage(SandboxStorage): return True def exists(self) -> bool: - return storage.exists(self._storage_key) + """Check if archive exists in storage.""" + return self._storage.exists(self._storage_key) def delete(self) -> None: + """Delete archive from storage.""" try: - storage.delete(self._storage_key) + self._storage.delete(self._storage_key) logger.info("Deleted archive for sandbox %s", self._sandbox_id) except Exception: logger.exception("Failed to delete archive for sandbox %s", self._sandbox_id) diff --git a/api/core/sandbox/storage/sandbox_file_storage.py b/api/core/sandbox/storage/sandbox_file_storage.py index da7c17b402..2d0b5482fc 100644 --- a/api/core/sandbox/storage/sandbox_file_storage.py +++ b/api/core/sandbox/storage/sandbox_file_storage.py @@ -1,23 +1,58 @@ +"""Sandbox file storage for exporting files from sandbox environments. + +This module provides storage operations for files exported from sandbox environments, +including download tickets for both runtime and archive-based file sources. + +Storage key format: sandbox_file_downloads/{tenant_id}/{sandbox_id}/{export_id}/{filename} + +All presign operations use the unified FilePresignStorage wrapper, which automatically +falls back to Dify's file proxy when the underlying storage doesn't support presigned URLs. +""" + from __future__ import annotations +from dataclasses import dataclass from typing import Any +from uuid import UUID -from core.sandbox.security.sandbox_file_signer import SandboxFileDownloadPath, SandboxFileSigner -from extensions.ext_redis import redis_client -from extensions.ext_storage import storage from extensions.storage.base_storage import BaseStorage from extensions.storage.cached_presign_storage import CachedPresignStorage -from extensions.storage.silent_storage import SilentStorage +from extensions.storage.file_presign_storage import FilePresignStorage + + +@dataclass(frozen=True) +class SandboxFileDownloadPath: + """Path for sandbox file exports.""" + + tenant_id: UUID + sandbox_id: UUID + export_id: str + filename: str + + def get_storage_key(self) -> str: + return f"sandbox_file_downloads/{self.tenant_id}/{self.sandbox_id}/{self.export_id}/{self.filename}" class SandboxFileStorage: - _base_storage: BaseStorage + """Storage operations for sandbox file exports. + + Wraps BaseStorage with: + - FilePresignStorage for presign fallback support + - CachedPresignStorage for URL caching + + Usage: + storage = SandboxFileStorage(base_storage, redis_client=redis) + storage.save(download_path, content) + url = storage.get_download_url(download_path) + """ + _storage: CachedPresignStorage def __init__(self, storage: BaseStorage, *, redis_client: Any) -> None: - self._base_storage = storage + # Wrap with FilePresignStorage for fallback support, then CachedPresignStorage for caching + presign_storage = FilePresignStorage(storage) self._storage = CachedPresignStorage( - storage=storage, + storage=presign_storage, redis_client=redis_client, cache_key_prefix="sandbox_file_downloads", ) @@ -26,29 +61,19 @@ class SandboxFileStorage: self._storage.save(download_path.get_storage_key(), content) def get_download_url(self, download_path: SandboxFileDownloadPath, expires_in: int = 3600) -> str: - storage_key = download_path.get_storage_key() - try: - return self._storage.get_download_url(storage_key, expires_in) - except NotImplementedError: - return SandboxFileSigner.build_signed_url( - export_path=download_path, - expires_in=expires_in, - action=SandboxFileSigner.OPERATION_DOWNLOAD, - ) + return self._storage.get_download_url(download_path.get_storage_key(), expires_in) def get_upload_url(self, download_path: SandboxFileDownloadPath, expires_in: int = 3600) -> str: - storage_key = download_path.get_storage_key() - try: - return self._storage.get_upload_url(storage_key, expires_in) - except NotImplementedError: - return SandboxFileSigner.build_signed_url( - export_path=download_path, - expires_in=expires_in, - action=SandboxFileSigner.OPERATION_UPLOAD, - ) + return self._storage.get_upload_url(download_path.get_storage_key(), expires_in) class _LazySandboxFileStorage: + """Lazy initializer for singleton SandboxFileStorage. + + Delays storage initialization until first access, ensuring Flask app + context is available. + """ + _instance: SandboxFileStorage | None def __init__(self) -> None: @@ -56,12 +81,16 @@ class _LazySandboxFileStorage: def _get_instance(self) -> SandboxFileStorage: if self._instance is None: + from extensions.ext_redis import redis_client + from extensions.ext_storage import storage + if not hasattr(storage, "storage_runner"): raise RuntimeError( "Storage is not initialized; call storage.init_app before using sandbox_file_storage" ) self._instance = SandboxFileStorage( - storage=SilentStorage(storage.storage_runner), redis_client=redis_client + storage=storage.storage_runner, + redis_client=redis_client, ) return self._instance @@ -69,4 +98,4 @@ class _LazySandboxFileStorage: return getattr(self._get_instance(), name) -sandbox_file_storage = _LazySandboxFileStorage() +sandbox_file_storage: SandboxFileStorage = _LazySandboxFileStorage() # type: ignore[assignment] diff --git a/api/core/skill/skill_manager.py b/api/core/skill/skill_manager.py index 29be138501..e5d3880ac2 100644 --- a/api/core/skill/skill_manager.py +++ b/api/core/skill/skill_manager.py @@ -2,21 +2,40 @@ import logging from core.app_assets.storage import AssetPath from core.skill.entities.skill_bundle import SkillBundle +from extensions.ext_redis import redis_client from services.app_asset_service import AppAssetService logger = logging.getLogger(__name__) class SkillManager: + _CACHE_KEY_PREFIX = "skill_bundle" + _CACHE_TTL_SECONDS = 60 * 60 * 24 + + @staticmethod + def get_cache_key( + tenant_id: str, + app_id: str, + assets_id: str, + ) -> str: + return f"{SkillManager._CACHE_KEY_PREFIX}:{tenant_id}:{app_id}:{assets_id}" + @staticmethod def load_bundle( tenant_id: str, app_id: str, assets_id: str, ) -> SkillBundle: + cache_key = SkillManager.get_cache_key(tenant_id, app_id, assets_id) + data = redis_client.get(cache_key) + if data: + return SkillBundle.model_validate_json(data) + asset_path = AssetPath.skill_bundle(tenant_id, app_id, assets_id) data = AppAssetService.get_storage().load(asset_path) - return SkillBundle.model_validate_json(data) + bundle = SkillBundle.model_validate_json(data) + redis_client.setex(cache_key, SkillManager._CACHE_TTL_SECONDS, bundle.model_dump_json(indent=2).encode("utf-8")) + return bundle @staticmethod def save_bundle( @@ -30,3 +49,5 @@ class SkillManager: asset_path, bundle.model_dump_json(indent=2).encode("utf-8"), ) + cache_key = SkillManager.get_cache_key(tenant_id, app_id, assets_id) + redis_client.delete(cache_key) diff --git a/api/core/tools/utils/dataset_retriever/dataset_retriever_tool.py b/api/core/tools/utils/dataset_retriever/dataset_retriever_tool.py index f96510fb45..057ec41f65 100644 --- a/api/core/tools/utils/dataset_retriever/dataset_retriever_tool.py +++ b/api/core/tools/utils/dataset_retriever/dataset_retriever_tool.py @@ -169,20 +169,24 @@ class DatasetRetrieverTool(DatasetRetrieverBaseTool): if records: for record in records: segment = record.segment + # Build content: if summary exists, add it before the segment content if segment.answer: - document_context_list.append( - DocumentContext( - content=f"question:{segment.get_sign_content()} answer:{segment.answer}", - score=record.score, - ) - ) + segment_content = f"question:{segment.get_sign_content()} answer:{segment.answer}" else: - document_context_list.append( - DocumentContext( - content=segment.get_sign_content(), - score=record.score, - ) + segment_content = segment.get_sign_content() + + # If summary exists, prepend it to the content + if record.summary: + final_content = f"{record.summary}\n{segment_content}" + else: + final_content = segment_content + + document_context_list.append( + DocumentContext( + content=final_content, + score=record.score, ) + ) if self.return_resource: for record in records: @@ -216,6 +220,9 @@ class DatasetRetrieverTool(DatasetRetrieverBaseTool): source.content = f"question:{segment.content} \nanswer:{segment.answer}" else: source.content = segment.content + # Add summary if this segment was retrieved via summary + if hasattr(record, "summary") and record.summary: + source.summary = record.summary retrieval_resource_list.append(source) if self.return_resource and retrieval_resource_list: diff --git a/api/core/virtual_environment/providers/docker_daemon_sandbox.py b/api/core/virtual_environment/providers/docker_daemon_sandbox.py index 2824856d09..52007591c0 100644 --- a/api/core/virtual_environment/providers/docker_daemon_sandbox.py +++ b/api/core/virtual_environment/providers/docker_daemon_sandbox.py @@ -148,7 +148,8 @@ class DockerDemuxer: to periodically check for errors and closed state instead of blocking forever. """ if self._error: - raise TransportEOFError(f"Demuxer error: {self._error}") from self._error + error = cast(BaseException, self._error) + raise TransportEOFError(f"Demuxer error: {error}") from error while True: try: @@ -163,7 +164,8 @@ class DockerDemuxer: if self._closed: raise TransportEOFError("Demuxer closed") if self._error: - raise TransportEOFError(f"Demuxer error: {self._error}") from self._error + error = cast(BaseException, self._error) + raise TransportEOFError(f"Demuxer error: {error}") from error # No error, continue waiting def close(self) -> None: @@ -292,6 +294,8 @@ class DockerDaemonEnvironment(VirtualEnvironment): @classmethod def validate(cls, options: Mapping[str, Any]) -> None: # Import Docker SDK lazily so it is loaded after gevent monkey-patching. + import docker.errors + import docker docker_sock = options.get(cls.OptionsKey.DOCKER_SOCK, cls._DEFAULT_DOCKER_SOCK) @@ -364,6 +368,7 @@ class DockerDaemonEnvironment(VirtualEnvironment): NOTE: I guess nobody will use more than 5 different docker sockets in practice.... """ import docker + return docker.DockerClient(base_url=docker_sock) @classmethod @@ -373,6 +378,7 @@ class DockerDaemonEnvironment(VirtualEnvironment): Get the Docker low-level API client. """ import docker + return docker.APIClient(base_url=docker_sock) def get_docker_sock(self) -> str: @@ -431,6 +437,12 @@ class DockerDaemonEnvironment(VirtualEnvironment): return self._container_path(path) def upload_file(self, path: str, content: BytesIO) -> None: + """Upload a file to the container. + + Files and intermediate directories are created with world-writable permissions + (0o777 for directories, 0o666 for files) to avoid permission issues when the container + runs as a non-root user but Docker's put_archive creates files as root. + """ container = self._get_container() normalized = PurePosixPath(path) @@ -442,6 +454,7 @@ class DockerDaemonEnvironment(VirtualEnvironment): with tarfile.open(fileobj=tar_stream, mode="w") as tar: tar_info = tarfile.TarInfo(name=file_name) tar_info.size = len(payload) + tar_info.mode = 0o666 tar.addfile(tar_info, BytesIO(payload)) tar_stream.seek(0) container.put_archive(parent_dir, tar_stream.read()) # pyright: ignore[reportUnknownMemberType] # @@ -454,8 +467,18 @@ class DockerDaemonEnvironment(VirtualEnvironment): payload = content.getvalue() tar_stream = BytesIO() with tarfile.open(fileobj=tar_stream, mode="w") as tar: + # Add intermediate directories with proper permissions + for i in range(len(relative_path.parts) - 1): + dir_path = PurePosixPath(*relative_path.parts[: i + 1]) + dir_info = tarfile.TarInfo(name=dir_path.as_posix() + "/") + dir_info.type = tarfile.DIRTYPE + dir_info.mode = 0o777 + tar.addfile(dir_info) + + # Add the file tar_info = tarfile.TarInfo(name=relative_path.as_posix()) tar_info.size = len(payload) + tar_info.mode = 0o666 tar.addfile(tar_info, BytesIO(payload)) tar_stream.seek(0) container.put_archive(self._working_dir, tar_stream.read()) # pyright: ignore[reportUnknownMemberType] # @@ -479,7 +502,7 @@ class DockerDaemonEnvironment(VirtualEnvironment): return BytesIO(extracted.read()) def list_files(self, directory_path: str, limit: int) -> Sequence[FileState]: - import docker + import docker.errors container = self._get_container() container_path = self._container_path(directory_path) @@ -525,7 +548,7 @@ class DockerDaemonEnvironment(VirtualEnvironment): pass def release_environment(self) -> None: - import docker + import docker.errors try: container = self._get_container() diff --git a/api/core/workflow/nodes/knowledge_index/entities.py b/api/core/workflow/nodes/knowledge_index/entities.py index 3daca90b9b..bfeb9b5b79 100644 --- a/api/core/workflow/nodes/knowledge_index/entities.py +++ b/api/core/workflow/nodes/knowledge_index/entities.py @@ -158,3 +158,5 @@ class KnowledgeIndexNodeData(BaseNodeData): type: str = "knowledge-index" chunk_structure: str index_chunk_variable_selector: list[str] + indexing_technique: str | None = None + summary_index_setting: dict | None = None diff --git a/api/core/workflow/nodes/knowledge_index/knowledge_index_node.py b/api/core/workflow/nodes/knowledge_index/knowledge_index_node.py index 17ca4bef7b..b88c2d510f 100644 --- a/api/core/workflow/nodes/knowledge_index/knowledge_index_node.py +++ b/api/core/workflow/nodes/knowledge_index/knowledge_index_node.py @@ -1,9 +1,11 @@ +import concurrent.futures import datetime import logging import time from collections.abc import Mapping from typing import Any +from flask import current_app from sqlalchemy import func, select from core.app.entities.app_invoke_entities import InvokeFrom @@ -16,7 +18,9 @@ from core.workflow.nodes.base.node import Node from core.workflow.nodes.base.template import Template from core.workflow.runtime import VariablePool from extensions.ext_database import db -from models.dataset import Dataset, Document, DocumentSegment +from models.dataset import Dataset, Document, DocumentSegment, DocumentSegmentSummary +from services.summary_index_service import SummaryIndexService +from tasks.generate_summary_index_task import generate_summary_index_task from .entities import KnowledgeIndexNodeData from .exc import ( @@ -67,7 +71,20 @@ class KnowledgeIndexNode(Node[KnowledgeIndexNodeData]): # index knowledge try: if is_preview: - outputs = self._get_preview_output(node_data.chunk_structure, chunks) + # Preview mode: generate summaries for chunks directly without saving to database + # Format preview and generate summaries on-the-fly + # Get indexing_technique and summary_index_setting from node_data (workflow graph config) + # or fallback to dataset if not available in node_data + indexing_technique = node_data.indexing_technique or dataset.indexing_technique + summary_index_setting = node_data.summary_index_setting or dataset.summary_index_setting + + outputs = self._get_preview_output_with_summaries( + node_data.chunk_structure, + chunks, + dataset=dataset, + indexing_technique=indexing_technique, + summary_index_setting=summary_index_setting, + ) return NodeRunResult( status=WorkflowNodeExecutionStatus.SUCCEEDED, inputs=variables, @@ -148,6 +165,11 @@ class KnowledgeIndexNode(Node[KnowledgeIndexNodeData]): ) .scalar() ) + # Update need_summary based on dataset's summary_index_setting + if dataset.summary_index_setting and dataset.summary_index_setting.get("enable") is True: + document.need_summary = True + else: + document.need_summary = False db.session.add(document) # update document segment status db.session.query(DocumentSegment).where( @@ -163,6 +185,9 @@ class KnowledgeIndexNode(Node[KnowledgeIndexNodeData]): db.session.commit() + # Generate summary index if enabled + self._handle_summary_index_generation(dataset, document, variable_pool) + return { "dataset_id": ds_id_value, "dataset_name": dataset_name_value, @@ -173,9 +198,304 @@ class KnowledgeIndexNode(Node[KnowledgeIndexNodeData]): "display_status": "completed", } - def _get_preview_output(self, chunk_structure: str, chunks: Any) -> Mapping[str, Any]: + def _handle_summary_index_generation( + self, + dataset: Dataset, + document: Document, + variable_pool: VariablePool, + ) -> None: + """ + Handle summary index generation based on mode (debug/preview or production). + + Args: + dataset: Dataset containing the document + document: Document to generate summaries for + variable_pool: Variable pool to check invoke_from + """ + # Only generate summary index for high_quality indexing technique + if dataset.indexing_technique != "high_quality": + return + + # Check if summary index is enabled + summary_index_setting = dataset.summary_index_setting + if not summary_index_setting or not summary_index_setting.get("enable"): + return + + # Skip qa_model documents + if document.doc_form == "qa_model": + return + + # Determine if in preview/debug mode + invoke_from = variable_pool.get(["sys", SystemVariableKey.INVOKE_FROM]) + is_preview = invoke_from and invoke_from.value == InvokeFrom.DEBUGGER + + if is_preview: + try: + # Query segments that need summary generation + query = db.session.query(DocumentSegment).filter_by( + dataset_id=dataset.id, + document_id=document.id, + status="completed", + enabled=True, + ) + segments = query.all() + + if not segments: + logger.info("No segments found for document %s", document.id) + return + + # Filter segments based on mode + segments_to_process = [] + for segment in segments: + # Skip if summary already exists + existing_summary = ( + db.session.query(DocumentSegmentSummary) + .filter_by(chunk_id=segment.id, dataset_id=dataset.id, status="completed") + .first() + ) + if existing_summary: + continue + + # For parent-child mode, all segments are parent chunks, so process all + segments_to_process.append(segment) + + if not segments_to_process: + logger.info("No segments need summary generation for document %s", document.id) + return + + # Use ThreadPoolExecutor for concurrent generation + flask_app = current_app._get_current_object() # type: ignore + max_workers = min(10, len(segments_to_process)) # Limit to 10 workers + + def process_segment(segment: DocumentSegment) -> None: + """Process a single segment in a thread with Flask app context.""" + with flask_app.app_context(): + try: + SummaryIndexService.generate_and_vectorize_summary(segment, dataset, summary_index_setting) + except Exception: + logger.exception( + "Failed to generate summary for segment %s", + segment.id, + ) + # Continue processing other segments + + with concurrent.futures.ThreadPoolExecutor(max_workers=max_workers) as executor: + futures = [executor.submit(process_segment, segment) for segment in segments_to_process] + # Wait for all tasks to complete + concurrent.futures.wait(futures) + + logger.info( + "Successfully generated summary index for %s segments in document %s", + len(segments_to_process), + document.id, + ) + except Exception: + logger.exception("Failed to generate summary index for document %s", document.id) + # Don't fail the entire indexing process if summary generation fails + else: + # Production mode: asynchronous generation + logger.info( + "Queuing summary index generation task for document %s (production mode)", + document.id, + ) + try: + generate_summary_index_task.delay(dataset.id, document.id, None) + logger.info("Summary index generation task queued for document %s", document.id) + except Exception: + logger.exception( + "Failed to queue summary index generation task for document %s", + document.id, + ) + # Don't fail the entire indexing process if task queuing fails + + def _get_preview_output_with_summaries( + self, + chunk_structure: str, + chunks: Any, + dataset: Dataset, + indexing_technique: str | None = None, + summary_index_setting: dict | None = None, + ) -> Mapping[str, Any]: + """ + Generate preview output with summaries for chunks in preview mode. + This method generates summaries on-the-fly without saving to database. + + Args: + chunk_structure: Chunk structure type + chunks: Chunks to generate preview for + dataset: Dataset object (for tenant_id) + indexing_technique: Indexing technique from node config or dataset + summary_index_setting: Summary index setting from node config or dataset + """ index_processor = IndexProcessorFactory(chunk_structure).init_index_processor() - return index_processor.format_preview(chunks) + preview_output = index_processor.format_preview(chunks) + + # Check if summary index is enabled + if indexing_technique != "high_quality": + return preview_output + + if not summary_index_setting or not summary_index_setting.get("enable"): + return preview_output + + # Generate summaries for chunks + if "preview" in preview_output and isinstance(preview_output["preview"], list): + chunk_count = len(preview_output["preview"]) + logger.info( + "Generating summaries for %s chunks in preview mode (dataset: %s)", + chunk_count, + dataset.id, + ) + # Use ParagraphIndexProcessor's generate_summary method + from core.rag.index_processor.processor.paragraph_index_processor import ParagraphIndexProcessor + + # Get Flask app for application context in worker threads + flask_app = None + try: + flask_app = current_app._get_current_object() # type: ignore + except RuntimeError: + logger.warning("No Flask application context available, summary generation may fail") + + def generate_summary_for_chunk(preview_item: dict) -> None: + """Generate summary for a single chunk.""" + if "content" in preview_item: + # Set Flask application context in worker thread + if flask_app: + with flask_app.app_context(): + summary, _ = ParagraphIndexProcessor.generate_summary( + tenant_id=dataset.tenant_id, + text=preview_item["content"], + summary_index_setting=summary_index_setting, + ) + if summary: + preview_item["summary"] = summary + else: + # Fallback: try without app context (may fail) + summary, _ = ParagraphIndexProcessor.generate_summary( + tenant_id=dataset.tenant_id, + text=preview_item["content"], + summary_index_setting=summary_index_setting, + ) + if summary: + preview_item["summary"] = summary + + # Generate summaries concurrently using ThreadPoolExecutor + # Set a reasonable timeout to prevent hanging (60 seconds per chunk, max 5 minutes total) + timeout_seconds = min(300, 60 * len(preview_output["preview"])) + errors: list[Exception] = [] + + with concurrent.futures.ThreadPoolExecutor(max_workers=min(10, len(preview_output["preview"]))) as executor: + futures = [ + executor.submit(generate_summary_for_chunk, preview_item) + for preview_item in preview_output["preview"] + ] + # Wait for all tasks to complete with timeout + done, not_done = concurrent.futures.wait(futures, timeout=timeout_seconds) + + # Cancel tasks that didn't complete in time + if not_done: + timeout_error_msg = ( + f"Summary generation timeout: {len(not_done)} chunks did not complete within {timeout_seconds}s" + ) + logger.warning("%s. Cancelling remaining tasks...", timeout_error_msg) + # In preview mode, timeout is also an error + errors.append(TimeoutError(timeout_error_msg)) + for future in not_done: + future.cancel() + # Wait a bit for cancellation to take effect + concurrent.futures.wait(not_done, timeout=5) + + # Collect exceptions from completed futures + for future in done: + try: + future.result() # This will raise any exception that occurred + except Exception as e: + logger.exception("Error in summary generation future") + errors.append(e) + + # In preview mode, if there are any errors, fail the request + if errors: + error_messages = [str(e) for e in errors] + error_summary = ( + f"Failed to generate summaries for {len(errors)} chunk(s). " + f"Errors: {'; '.join(error_messages[:3])}" # Show first 3 errors + ) + if len(errors) > 3: + error_summary += f" (and {len(errors) - 3} more)" + logger.error("Summary generation failed in preview mode: %s", error_summary) + raise KnowledgeIndexNodeError(error_summary) + + completed_count = sum(1 for item in preview_output["preview"] if item.get("summary") is not None) + logger.info( + "Completed summary generation for preview chunks: %s/%s succeeded", + completed_count, + len(preview_output["preview"]), + ) + + return preview_output + + def _get_preview_output( + self, + chunk_structure: str, + chunks: Any, + dataset: Dataset | None = None, + variable_pool: VariablePool | None = None, + ) -> Mapping[str, Any]: + index_processor = IndexProcessorFactory(chunk_structure).init_index_processor() + preview_output = index_processor.format_preview(chunks) + + # If dataset is provided, try to enrich preview with summaries + if dataset and variable_pool: + document_id = variable_pool.get(["sys", SystemVariableKey.DOCUMENT_ID]) + if document_id: + document = db.session.query(Document).filter_by(id=document_id.value).first() + if document: + # Query summaries for this document + summaries = ( + db.session.query(DocumentSegmentSummary) + .filter_by( + dataset_id=dataset.id, + document_id=document.id, + status="completed", + enabled=True, + ) + .all() + ) + + if summaries: + # Create a map of segment content to summary for matching + # Use content matching as chunks in preview might not be indexed yet + summary_by_content = {} + for summary in summaries: + segment = ( + db.session.query(DocumentSegment) + .filter_by(id=summary.chunk_id, dataset_id=dataset.id) + .first() + ) + if segment: + # Normalize content for matching (strip whitespace) + normalized_content = segment.content.strip() + summary_by_content[normalized_content] = summary.summary_content + + # Enrich preview with summaries by content matching + if "preview" in preview_output and isinstance(preview_output["preview"], list): + matched_count = 0 + for preview_item in preview_output["preview"]: + if "content" in preview_item: + # Normalize content for matching + normalized_chunk_content = preview_item["content"].strip() + if normalized_chunk_content in summary_by_content: + preview_item["summary"] = summary_by_content[normalized_chunk_content] + matched_count += 1 + + if matched_count > 0: + logger.info( + "Enriched preview with %s existing summaries (dataset: %s, document: %s)", + matched_count, + dataset.id, + document.id, + ) + + return preview_output @classmethod def version(cls) -> str: diff --git a/api/core/workflow/nodes/knowledge_retrieval/knowledge_retrieval_node.py b/api/core/workflow/nodes/knowledge_retrieval/knowledge_retrieval_node.py index 8670a71aa3..3c4850ebac 100644 --- a/api/core/workflow/nodes/knowledge_retrieval/knowledge_retrieval_node.py +++ b/api/core/workflow/nodes/knowledge_retrieval/knowledge_retrieval_node.py @@ -419,6 +419,9 @@ class KnowledgeRetrievalNode(LLMUsageTrackingMixin, Node[KnowledgeRetrievalNodeD source["content"] = f"question:{segment.get_sign_content()} \nanswer:{segment.answer}" else: source["content"] = segment.get_sign_content() + # Add summary if available + if record.summary: + source["summary"] = record.summary retrieval_resource_list.append(source) if retrieval_resource_list: retrieval_resource_list = sorted( diff --git a/api/core/workflow/nodes/llm/node.py b/api/core/workflow/nodes/llm/node.py index 93916cd3c7..feb76e6510 100644 --- a/api/core/workflow/nodes/llm/node.py +++ b/api/core/workflow/nodes/llm/node.py @@ -522,7 +522,6 @@ class LLMNode(Node[LLMNodeData]): json_schema=output_schema, model_parameters=node_data_model.completion_params, stop=list(stop or []), - stream=False, user=user_id, tenant_id=tenant_id, ) @@ -1093,6 +1092,8 @@ class LLMNode(Node[LLMNodeData]): if "content" not in item: raise InvalidContextStructureError(f"Invalid context structure: {item}") + if item.get("summary"): + context_str += item["summary"] + "\n" context_str += item["content"] + "\n" retriever_resource = self._convert_to_original_retriever_resource(item) @@ -1154,6 +1155,7 @@ class LLMNode(Node[LLMNodeData]): page=metadata.get("page"), doc_metadata=metadata.get("doc_metadata"), files=context_dict.get("files"), + summary=context_dict.get("summary"), ) return source @@ -1915,6 +1917,7 @@ class LLMNode(Node[LLMNodeData]): ) -> Generator[NodeEventBase, None, LLMGenerationData]: result: LLMGenerationData | None = None + # FIXME(Mairuis): Async processing for bash session. with SandboxBashSession(sandbox=sandbox, node_id=self.id, tools=tool_dependencies) as session: prompt_files = self._extract_prompt_files(variable_pool) model_features = self._get_model_features(model_instance) diff --git a/api/core/zip_sandbox/__init__.py b/api/core/zip_sandbox/__init__.py index 746bb99a79..71d36e6ee9 100644 --- a/api/core/zip_sandbox/__init__.py +++ b/api/core/zip_sandbox/__init__.py @@ -1,7 +1,8 @@ -from .zip_sandbox import SandboxDownloadItem, SandboxFile, ZipSandbox +from .zip_sandbox import SandboxDownloadItem, SandboxFile, SandboxUploadItem, ZipSandbox __all__ = [ "SandboxDownloadItem", "SandboxFile", + "SandboxUploadItem", "ZipSandbox", ] diff --git a/api/core/zip_sandbox/zip_sandbox.py b/api/core/zip_sandbox/zip_sandbox.py index bd71096d47..d58814781a 100644 --- a/api/core/zip_sandbox/zip_sandbox.py +++ b/api/core/zip_sandbox/zip_sandbox.py @@ -27,10 +27,20 @@ from .strategy import ZipStrategy @dataclass(frozen=True) class SandboxDownloadItem: + """Item for downloading: URL -> sandbox path.""" + url: str path: str +@dataclass(frozen=True) +class SandboxUploadItem: + """Item for uploading: sandbox path -> URL.""" + + path: str + url: str + + @dataclass(frozen=True) class SandboxFile: """A handle to a file in the sandbox.""" @@ -210,25 +220,6 @@ class ZipSandbox: # ========== Download operations ========== - def download(self, urls: list[str], *, dest_dir: str = ".") -> list[str]: - if not urls: - return [] - - dest_dir = self._normalize_path(dest_dir) - paths = [self._dest_path_for_url(dest_dir, u) for u in urls] - - p = pipeline(self.vm) - p.add(["mkdir", "-p", dest_dir], error_message="Failed to create download directory") - for url, out_path in zip(urls, paths, strict=True): - p.add(["curl", "-fsSL", url, "-o", out_path], error_message="Failed to download file") - - try: - p.execute(timeout=self._DEFAULT_TIMEOUT_SECONDS, raise_on_error=True) - except Exception as exc: - raise RuntimeError(str(exc)) from exc - - return paths - def download_items(self, items: list[SandboxDownloadItem], *, dest_dir: str = ".") -> list[str]: if not items: return [] @@ -286,6 +277,32 @@ class ZipSandbox: except CommandExecutionError as exc: raise RuntimeError(str(exc)) from exc + def upload_items(self, items: list[SandboxUploadItem], *, src_dir: str = ".") -> None: + """Upload multiple files from sandbox to target URLs. + + Args: + items: List of SandboxUploadItem(path, url) + src_dir: Base directory containing the files + """ + if not items: + return + + src_dir = self._normalize_path(src_dir) + p = pipeline(self.vm) + + for item in items: + rel = self._normalize_path(item.path) + src_path = posixpath.join(src_dir, rel) if src_dir not in ("", ".") else rel + p.add( + ["curl", "-fsSL", "-X", "PUT", "-T", src_path, item.url], + error_message=f"Failed to upload {item.path}", + ) + + try: + p.execute(timeout=self._DEFAULT_TIMEOUT_SECONDS, raise_on_error=True) + except Exception as exc: + raise RuntimeError(str(exc)) from exc + # ========== Archive operations ========== def zip(self, src: str = ".", *, include_base: bool = True) -> SandboxFile: diff --git a/api/extensions/ext_celery.py b/api/extensions/ext_celery.py index 08cf96c1c1..af983f6d87 100644 --- a/api/extensions/ext_celery.py +++ b/api/extensions/ext_celery.py @@ -102,6 +102,8 @@ def init_app(app: DifyApp) -> Celery: imports = [ "tasks.async_workflow_tasks", # trigger workers "tasks.trigger_processing_tasks", # async trigger processing + "tasks.generate_summary_index_task", # summary index generation + "tasks.regenerate_summary_index_task", # summary index regeneration ] day = dify_config.CELERY_BEAT_SCHEDULER_TIME diff --git a/api/extensions/storage/file_presign_storage.py b/api/extensions/storage/file_presign_storage.py index ae7dee3033..27cc2ea5e2 100644 --- a/api/extensions/storage/file_presign_storage.py +++ b/api/extensions/storage/file_presign_storage.py @@ -1,71 +1,56 @@ -"""Storage wrapper that provides presigned URL support with fallback to signed proxy URLs.""" +"""Storage wrapper that provides presigned URL support with fallback to ticket-based URLs. -import base64 -import hashlib -import hmac -import os -import time -import urllib.parse +This is the unified presign wrapper for all storage operations. When the underlying +storage backend doesn't support presigned URLs (raises NotImplementedError), it falls +back to generating ticket-based URLs that route through Dify's file proxy endpoints. + +Usage: + from extensions.storage.file_presign_storage import FilePresignStorage + + # Wrap any BaseStorage to add presign support + presign_storage = FilePresignStorage(base_storage) + download_url = presign_storage.get_download_url("path/to/file.txt", expires_in=3600) + upload_url = presign_storage.get_upload_url("path/to/file.txt", expires_in=3600) + +When the underlying storage doesn't support presigned URLs, the fallback URLs follow the format: + {FILES_URL}/files/storage-tickets/{token} + +The token is a UUID that maps to the real storage key in Redis. +""" -from configs import dify_config from extensions.storage.storage_wrapper import StorageWrapper class FilePresignStorage(StorageWrapper): - """Storage wrapper that provides presigned URL support. + """Storage wrapper that provides presigned URL support with ticket fallback. If the wrapped storage supports presigned URLs, delegates to it. - Otherwise, generates signed proxy URLs for download. + Otherwise, generates ticket-based URLs for both download and upload operations. """ - SIGNATURE_PREFIX = "storage-download" - def get_download_url(self, filename: str, expires_in: int = 3600) -> str: + """Get a presigned download URL, falling back to ticket URL if not supported.""" try: - return super().get_download_url(filename, expires_in) + return self._storage.get_download_url(filename, expires_in) except NotImplementedError: - return self._generate_signed_proxy_url(filename, expires_in) + from services.storage_ticket_service import StorageTicketService - def get_upload_url(self, filename: str, expires_in: int = 3600) -> str: - try: - return super().get_upload_url(filename, expires_in) - except NotImplementedError: - return self._generate_signed_upload_url(filename) + return StorageTicketService.create_download_url(filename, expires_in=expires_in) def get_download_urls(self, filenames: list[str], expires_in: int = 3600) -> list[str]: + """Get presigned download URLs for multiple files.""" try: - return super().get_download_urls(filenames, expires_in) + return self._storage.get_download_urls(filenames, expires_in) except NotImplementedError: - return [self._generate_signed_proxy_url(filename, expires_in) for filename in filenames] + from services.storage_ticket_service import StorageTicketService - def _generate_signed_upload_url(self, filename: str) -> str: - # TODO: Implement this - raise NotImplementedError("This storage backend doesn't support pre-signed URLs") + return [StorageTicketService.create_download_url(f, expires_in=expires_in) for f in filenames] - def _generate_signed_proxy_url(self, filename: str, expires_in: int = 3600) -> str: - base_url = dify_config.FILES_URL - encoded_filename = urllib.parse.quote(filename, safe="") - url = f"{base_url}/files/storage/{encoded_filename}/download" + def get_upload_url(self, filename: str, expires_in: int = 3600) -> str: + """Get a presigned upload URL, falling back to ticket URL if not supported.""" + try: + return self._storage.get_upload_url(filename, expires_in) + except NotImplementedError: + from services.storage_ticket_service import StorageTicketService - timestamp = str(int(time.time())) - nonce = os.urandom(16).hex() - sign = self._create_signature(filename, timestamp, nonce) - - query = urllib.parse.urlencode({"timestamp": timestamp, "nonce": nonce, "sign": sign}) - return f"{url}?{query}" - - @classmethod - def _create_signature(cls, filename: str, timestamp: str, nonce: str) -> str: - key = dify_config.SECRET_KEY.encode() - msg = f"{cls.SIGNATURE_PREFIX}|{filename}|{timestamp}|{nonce}" - sign = hmac.new(key, msg.encode(), hashlib.sha256).digest() - return base64.urlsafe_b64encode(sign).decode() - - @classmethod - def verify_signature(cls, *, filename: str, timestamp: str, nonce: str, sign: str) -> bool: - expected_sign = cls._create_signature(filename, timestamp, nonce) - if sign != expected_sign: - return False - - current_time = int(time.time()) - return current_time - int(timestamp) <= dify_config.FILES_ACCESS_TIMEOUT + return StorageTicketService.create_upload_url(filename, expires_in=expires_in) diff --git a/api/fields/dataset_fields.py b/api/fields/dataset_fields.py index 1e5ec7d200..ff6578098b 100644 --- a/api/fields/dataset_fields.py +++ b/api/fields/dataset_fields.py @@ -39,6 +39,14 @@ dataset_retrieval_model_fields = { "score_threshold_enabled": fields.Boolean, "score_threshold": fields.Float, } + +dataset_summary_index_fields = { + "enable": fields.Boolean, + "model_name": fields.String, + "model_provider_name": fields.String, + "summary_prompt": fields.String, +} + external_retrieval_model_fields = { "top_k": fields.Integer, "score_threshold": fields.Float, @@ -83,6 +91,7 @@ dataset_detail_fields = { "embedding_model_provider": fields.String, "embedding_available": fields.Boolean, "retrieval_model_dict": fields.Nested(dataset_retrieval_model_fields), + "summary_index_setting": fields.Nested(dataset_summary_index_fields), "tags": fields.List(fields.Nested(tag_fields)), "doc_form": fields.String, "external_knowledge_info": fields.Nested(external_knowledge_info_fields), diff --git a/api/fields/document_fields.py b/api/fields/document_fields.py index 9be59f7454..35a2a04f3e 100644 --- a/api/fields/document_fields.py +++ b/api/fields/document_fields.py @@ -33,6 +33,11 @@ document_fields = { "hit_count": fields.Integer, "doc_form": fields.String, "doc_metadata": fields.List(fields.Nested(document_metadata_fields), attribute="doc_metadata_details"), + # Summary index generation status: + # "SUMMARIZING" (when task is queued and generating) + "summary_index_status": fields.String, + # Whether this document needs summary index generation + "need_summary": fields.Boolean, } document_with_segments_fields = { @@ -60,6 +65,10 @@ document_with_segments_fields = { "completed_segments": fields.Integer, "total_segments": fields.Integer, "doc_metadata": fields.List(fields.Nested(document_metadata_fields), attribute="doc_metadata_details"), + # Summary index generation status: + # "SUMMARIZING" (when task is queued and generating) + "summary_index_status": fields.String, + "need_summary": fields.Boolean, # Whether this document needs summary index generation } dataset_and_document_fields = { diff --git a/api/fields/hit_testing_fields.py b/api/fields/hit_testing_fields.py index e70f9fa722..0b54992835 100644 --- a/api/fields/hit_testing_fields.py +++ b/api/fields/hit_testing_fields.py @@ -58,4 +58,5 @@ hit_testing_record_fields = { "score": fields.Float, "tsne_position": fields.Raw, "files": fields.List(fields.Nested(files_fields)), + "summary": fields.String, # Summary content if retrieved via summary index } diff --git a/api/fields/message_fields.py b/api/fields/message_fields.py index f052a9f1ab..c8b0d0802d 100644 --- a/api/fields/message_fields.py +++ b/api/fields/message_fields.py @@ -36,6 +36,7 @@ class RetrieverResource(ResponseModel): segment_position: int | None = None index_node_hash: str | None = None content: str | None = None + summary: str | None = None created_at: int | None = None @field_validator("created_at", mode="before") diff --git a/api/fields/segment_fields.py b/api/fields/segment_fields.py index 56d6b68378..2ce9fb154c 100644 --- a/api/fields/segment_fields.py +++ b/api/fields/segment_fields.py @@ -49,4 +49,5 @@ segment_fields = { "stopped_at": TimestampField, "child_chunks": fields.List(fields.Nested(child_chunk_fields)), "attachments": fields.List(fields.Nested(attachment_fields)), + "summary": fields.String, # Summary content for the segment } diff --git a/api/migrations/versions/2026_01_08_1031-aab323465866_sandbox_providers.py b/api/migrations/versions/2026_01_08_1031-aab323465866_sandbox_providers.py index 7f099f7147..f6146ead67 100644 --- a/api/migrations/versions/2026_01_08_1031-aab323465866_sandbox_providers.py +++ b/api/migrations/versions/2026_01_08_1031-aab323465866_sandbox_providers.py @@ -1,7 +1,7 @@ """sandbox_providers Revision ID: aab323465866 -Revises: 9d77545f524e +Revises: 788d3099ae3a Create Date: 2026-01-08 10:31:05.062722 """ @@ -11,7 +11,7 @@ import sqlalchemy as sa # revision identifiers, used by Alembic. revision = 'aab323465866' -down_revision = '9d77545f524e' +down_revision = '788d3099ae3a' branch_labels = None depends_on = None diff --git a/api/migrations/versions/2026_01_27_1815-788d3099ae3a_add_summary_index_feature.py b/api/migrations/versions/2026_01_27_1815-788d3099ae3a_add_summary_index_feature.py new file mode 100644 index 0000000000..3c2e0822e1 --- /dev/null +++ b/api/migrations/versions/2026_01_27_1815-788d3099ae3a_add_summary_index_feature.py @@ -0,0 +1,107 @@ +"""add summary index feature + +Revision ID: 788d3099ae3a +Revises: 9d77545f524e +Create Date: 2026-01-27 18:15:45.277928 + +""" +from alembic import op +import models as models +import sqlalchemy as sa + +def _is_pg(conn): + return conn.dialect.name == "postgresql" + +# revision identifiers, used by Alembic. +revision = '788d3099ae3a' +down_revision = '9d77545f524e' +branch_labels = None +depends_on = None + + +def upgrade(): + # ### commands auto generated by Alembic - please adjust! ### + conn = op.get_bind() + if _is_pg(conn): + op.create_table('document_segment_summaries', + sa.Column('id', models.types.StringUUID(), nullable=False), + sa.Column('dataset_id', models.types.StringUUID(), nullable=False), + sa.Column('document_id', models.types.StringUUID(), nullable=False), + sa.Column('chunk_id', models.types.StringUUID(), nullable=False), + sa.Column('summary_content', models.types.LongText(), nullable=True), + sa.Column('summary_index_node_id', sa.String(length=255), nullable=True), + sa.Column('summary_index_node_hash', sa.String(length=255), nullable=True), + sa.Column('tokens', sa.Integer(), nullable=True), + sa.Column('status', sa.String(length=32), server_default=sa.text("'generating'"), nullable=False), + sa.Column('error', models.types.LongText(), nullable=True), + sa.Column('enabled', sa.Boolean(), server_default=sa.text('true'), nullable=False), + sa.Column('disabled_at', sa.DateTime(), nullable=True), + sa.Column('disabled_by', models.types.StringUUID(), nullable=True), + sa.Column('created_at', sa.DateTime(), server_default=sa.text('CURRENT_TIMESTAMP'), nullable=False), + sa.Column('updated_at', sa.DateTime(), server_default=sa.text('CURRENT_TIMESTAMP'), nullable=False), + sa.PrimaryKeyConstraint('id', name='document_segment_summaries_pkey') + ) + with op.batch_alter_table('document_segment_summaries', schema=None) as batch_op: + batch_op.create_index('document_segment_summaries_chunk_id_idx', ['chunk_id'], unique=False) + batch_op.create_index('document_segment_summaries_dataset_id_idx', ['dataset_id'], unique=False) + batch_op.create_index('document_segment_summaries_document_id_idx', ['document_id'], unique=False) + batch_op.create_index('document_segment_summaries_status_idx', ['status'], unique=False) + + with op.batch_alter_table('datasets', schema=None) as batch_op: + batch_op.add_column(sa.Column('summary_index_setting', models.types.AdjustedJSON(), nullable=True)) + + with op.batch_alter_table('documents', schema=None) as batch_op: + batch_op.add_column(sa.Column('need_summary', sa.Boolean(), server_default=sa.text('false'), nullable=True)) + else: + # MySQL: Use compatible syntax + op.create_table( + 'document_segment_summaries', + sa.Column('id', models.types.StringUUID(), nullable=False), + sa.Column('dataset_id', models.types.StringUUID(), nullable=False), + sa.Column('document_id', models.types.StringUUID(), nullable=False), + sa.Column('chunk_id', models.types.StringUUID(), nullable=False), + sa.Column('summary_content', models.types.LongText(), nullable=True), + sa.Column('summary_index_node_id', sa.String(length=255), nullable=True), + sa.Column('summary_index_node_hash', sa.String(length=255), nullable=True), + sa.Column('tokens', sa.Integer(), nullable=True), + sa.Column('status', sa.String(length=32), server_default=sa.text("'generating'"), nullable=False), + sa.Column('error', models.types.LongText(), nullable=True), + sa.Column('enabled', sa.Boolean(), server_default=sa.text('true'), nullable=False), + sa.Column('disabled_at', sa.DateTime(), nullable=True), + sa.Column('disabled_by', models.types.StringUUID(), nullable=True), + sa.Column('created_at', sa.DateTime(), server_default=sa.text('CURRENT_TIMESTAMP'), nullable=False), + sa.Column('updated_at', sa.DateTime(), server_default=sa.text('CURRENT_TIMESTAMP'), nullable=False), + sa.PrimaryKeyConstraint('id', name='document_segment_summaries_pkey'), + ) + with op.batch_alter_table('document_segment_summaries', schema=None) as batch_op: + batch_op.create_index('document_segment_summaries_chunk_id_idx', ['chunk_id'], unique=False) + batch_op.create_index('document_segment_summaries_dataset_id_idx', ['dataset_id'], unique=False) + batch_op.create_index('document_segment_summaries_document_id_idx', ['document_id'], unique=False) + batch_op.create_index('document_segment_summaries_status_idx', ['status'], unique=False) + + with op.batch_alter_table('datasets', schema=None) as batch_op: + batch_op.add_column(sa.Column('summary_index_setting', models.types.AdjustedJSON(), nullable=True)) + + with op.batch_alter_table('documents', schema=None) as batch_op: + batch_op.add_column(sa.Column('need_summary', sa.Boolean(), server_default=sa.text('false'), nullable=True)) + + # ### end Alembic commands ### + + +def downgrade(): + # ### commands auto generated by Alembic - please adjust! ### + + with op.batch_alter_table('documents', schema=None) as batch_op: + batch_op.drop_column('need_summary') + + with op.batch_alter_table('datasets', schema=None) as batch_op: + batch_op.drop_column('summary_index_setting') + + with op.batch_alter_table('document_segment_summaries', schema=None) as batch_op: + batch_op.drop_index('document_segment_summaries_status_idx') + batch_op.drop_index('document_segment_summaries_document_id_idx') + batch_op.drop_index('document_segment_summaries_dataset_id_idx') + batch_op.drop_index('document_segment_summaries_chunk_id_idx') + + op.drop_table('document_segment_summaries') + # ### end Alembic commands ### diff --git a/api/models/dataset.py b/api/models/dataset.py index 62f11b8c72..6ab8f372bf 100644 --- a/api/models/dataset.py +++ b/api/models/dataset.py @@ -72,6 +72,7 @@ class Dataset(Base): keyword_number = mapped_column(sa.Integer, nullable=True, server_default=sa.text("10")) collection_binding_id = mapped_column(StringUUID, nullable=True) retrieval_model = mapped_column(AdjustedJSON, nullable=True) + summary_index_setting = mapped_column(AdjustedJSON, nullable=True) built_in_field_enabled = mapped_column(sa.Boolean, nullable=False, server_default=sa.text("false")) icon_info = mapped_column(AdjustedJSON, nullable=True) runtime_mode = mapped_column(sa.String(255), nullable=True, server_default=sa.text("'general'")) @@ -419,6 +420,7 @@ class Document(Base): doc_metadata = mapped_column(AdjustedJSON, nullable=True) doc_form = mapped_column(String(255), nullable=False, server_default=sa.text("'text_model'")) doc_language = mapped_column(String(255), nullable=True) + need_summary: Mapped[bool | None] = mapped_column(sa.Boolean, nullable=True, server_default=sa.text("false")) DATA_SOURCES = ["upload_file", "notion_import", "website_crawl"] @@ -1575,3 +1577,36 @@ class SegmentAttachmentBinding(Base): segment_id: Mapped[str] = mapped_column(StringUUID, nullable=False) attachment_id: Mapped[str] = mapped_column(StringUUID, nullable=False) created_at: Mapped[datetime] = mapped_column(sa.DateTime, nullable=False, server_default=func.current_timestamp()) + + +class DocumentSegmentSummary(Base): + __tablename__ = "document_segment_summaries" + __table_args__ = ( + sa.PrimaryKeyConstraint("id", name="document_segment_summaries_pkey"), + sa.Index("document_segment_summaries_dataset_id_idx", "dataset_id"), + sa.Index("document_segment_summaries_document_id_idx", "document_id"), + sa.Index("document_segment_summaries_chunk_id_idx", "chunk_id"), + sa.Index("document_segment_summaries_status_idx", "status"), + ) + + id: Mapped[str] = mapped_column(StringUUID, nullable=False, default=lambda: str(uuid4())) + dataset_id: Mapped[str] = mapped_column(StringUUID, nullable=False) + document_id: Mapped[str] = mapped_column(StringUUID, nullable=False) + # corresponds to DocumentSegment.id or parent chunk id + chunk_id: Mapped[str] = mapped_column(StringUUID, nullable=False) + summary_content: Mapped[str] = mapped_column(LongText, nullable=True) + summary_index_node_id: Mapped[str] = mapped_column(String(255), nullable=True) + summary_index_node_hash: Mapped[str] = mapped_column(String(255), nullable=True) + tokens: Mapped[int | None] = mapped_column(sa.Integer, nullable=True) + status: Mapped[str] = mapped_column(String(32), nullable=False, server_default=sa.text("'generating'")) + error: Mapped[str] = mapped_column(LongText, nullable=True) + enabled: Mapped[bool] = mapped_column(sa.Boolean, nullable=False, server_default=sa.text("true")) + disabled_at: Mapped[datetime | None] = mapped_column(DateTime, nullable=True) + disabled_by = mapped_column(StringUUID, nullable=True) + created_at: Mapped[datetime] = mapped_column(DateTime, nullable=False, server_default=func.current_timestamp()) + updated_at: Mapped[datetime] = mapped_column( + DateTime, nullable=False, server_default=func.current_timestamp(), onupdate=func.current_timestamp() + ) + + def __repr__(self): + return f"" diff --git a/api/models/model.py b/api/models/model.py index 5cef46dbc0..94661db9da 100644 --- a/api/models/model.py +++ b/api/models/model.py @@ -659,16 +659,22 @@ class AccountTrialAppRecord(Base): return user -class ExporleBanner(Base): +class ExporleBanner(TypeBase): __tablename__ = "exporle_banners" __table_args__ = (sa.PrimaryKeyConstraint("id", name="exporler_banner_pkey"),) - id = mapped_column(StringUUID, server_default=sa.text("uuid_generate_v4()")) - content = mapped_column(sa.JSON, nullable=False) - link = mapped_column(String(255), nullable=False) - sort = mapped_column(sa.Integer, nullable=False) - status = mapped_column(sa.String(255), nullable=False, server_default=sa.text("'enabled'::character varying")) - created_at = mapped_column(sa.DateTime, nullable=False, server_default=func.current_timestamp()) - language = mapped_column(String(255), nullable=False, server_default=sa.text("'en-US'::character varying")) + id: Mapped[str] = mapped_column(StringUUID, server_default=sa.text("uuid_generate_v4()"), init=False) + content: Mapped[dict[str, Any]] = mapped_column(sa.JSON, nullable=False) + link: Mapped[str] = mapped_column(String(255), nullable=False) + sort: Mapped[int] = mapped_column(sa.Integer, nullable=False) + status: Mapped[str] = mapped_column( + sa.String(255), nullable=False, server_default=sa.text("'enabled'::character varying"), default="enabled" + ) + created_at: Mapped[datetime] = mapped_column( + sa.DateTime, nullable=False, server_default=func.current_timestamp(), init=False + ) + language: Mapped[str] = mapped_column( + String(255), nullable=False, server_default=sa.text("'en-US'::character varying"), default="en-US" + ) class OAuthProviderApp(TypeBase): diff --git a/api/pyproject.toml b/api/pyproject.toml index 95589199fc..40a598629f 100644 --- a/api/pyproject.toml +++ b/api/pyproject.toml @@ -181,6 +181,7 @@ dev = [ # "locust>=2.40.4", # Temporarily removed due to compatibility issues. Uncomment when resolved. "sseclient-py>=1.8.0", "pytest-timeout>=2.4.0", + "pytest-xdist>=3.8.0", ] ############################################################ diff --git a/api/services/app_asset_service.py b/api/services/app_asset_service.py index c8aeca8605..3efa985620 100644 --- a/api/services/app_asset_service.py +++ b/api/services/app_asset_service.py @@ -18,7 +18,6 @@ from core.app_assets.storage import AppAssetStorage, AssetPath from extensions.ext_database import db from extensions.ext_redis import redis_client from extensions.ext_storage import storage -from extensions.storage.silent_storage import SilentStorage from models.app_asset import AppAssets from models.model import App @@ -43,9 +42,12 @@ class AppAssetService: This method creates an AppAssetStorage each time it's called, ensuring storage.storage_runner is only accessed after init_app. + + The storage is wrapped with FilePresignStorage for presign fallback support + and CachedPresignStorage for URL caching. """ return AppAssetStorage( - storage=SilentStorage(storage.storage_runner), + storage=storage.storage_runner, redis_client=redis_client, cache_key_prefix="app_assets", ) @@ -54,6 +56,22 @@ class AppAssetService: def _lock(app_id: str): return redis_client.lock(f"app_asset:lock:{app_id}", timeout=AppAssetService._LOCK_TIMEOUT_SECONDS) + @staticmethod + def get_assets_by_version(tenant_id: str, app_id: str, workflow_id: str | None = None) -> AppAssets: + """Get asset tree by workflow_id (published) or draft if workflow_id is None.""" + with Session(db.engine) as session: + version = workflow_id or AppAssets.VERSION_DRAFT + assets = ( + session.query(AppAssets) + .filter( + AppAssets.tenant_id == tenant_id, + AppAssets.app_id == app_id, + AppAssets.version == version, + ) + .first() + ) + return assets or AppAssets(tenant_id=tenant_id, app_id=app_id, version=version) + @staticmethod def get_draft_assets(tenant_id: str, app_id: str) -> list[AssetItem]: with Session(db.engine) as session: diff --git a/api/services/app_bundle_service.py b/api/services/app_bundle_service.py index 1a1102a66d..4fd8f1371f 100644 --- a/api/services/app_bundle_service.py +++ b/api/services/app_bundle_service.py @@ -1,26 +1,47 @@ +"""Service for exporting and importing App Bundles (DSL + assets). + +Bundle structure: + bundle.zip/ + {app_name}.yml # DSL file + manifest.json # Asset manifest (required for import) + {app_name}/ # Asset files + folder/file.txt + ... + +Import flow (sandbox-based): + 1. prepare_import: Frontend gets upload URL, stores import_id in Redis + 2. Frontend uploads zip to storage + 3. confirm_import: Sandbox downloads zip, extracts, uploads assets via presigned URLs + +Manifest format (schema_version 1.0): + - app_assets.tree: Full AppAssetFileTree for 100% ID restoration + - files: node_id -> path mapping for file nodes + - integrity.file_count: Basic validation +""" + from __future__ import annotations -import io +import json import logging import re -import zipfile +from dataclasses import dataclass from uuid import uuid4 -import yaml +from pydantic import ValidationError from sqlalchemy.orm import Session from core.app.entities.app_bundle_entities import ( - BUNDLE_DSL_FILENAME_PATTERN, - BUNDLE_MAX_SIZE, + MANIFEST_FILENAME, BundleExportResult, BundleFormatError, - ZipSecurityError, + BundleManifest, ) -from core.app_assets.storage import AssetPath -from core.app_bundle import SourceZipExtractor -from core.zip_sandbox import SandboxDownloadItem, ZipSandbox +from core.app_assets.storage import AppAssetStorage, AssetPath, BundleImportZipPath +from core.zip_sandbox import SandboxDownloadItem, SandboxUploadItem, ZipSandbox from extensions.ext_database import db -from models import Account, App +from extensions.ext_redis import redis_client +from models.account import Account +from models.model import App from .app_asset_package_service import AppAssetPackageService from .app_asset_service import AppAssetService @@ -28,6 +49,15 @@ from .app_dsl_service import AppDslService, Import logger = logging.getLogger(__name__) +_IMPORT_REDIS_PREFIX = "app_bundle:import:" +_IMPORT_TTL_SECONDS = 3600 # 1 hour + + +@dataclass +class ImportPrepareResult: + import_id: str + upload_url: str + class AppBundleService: @staticmethod @@ -38,14 +68,10 @@ class AppBundleService: marked_name: str = "", marked_comment: str = "", ): - """ - Publish App Bundle (workflow + assets). - Coordinates WorkflowService and AppAssetService publishing in a single transaction. - """ + """Publish App Bundle (workflow + assets) in a single transaction.""" from models.workflow import Workflow from services.workflow_service import WorkflowService - # 1. Publish workflow workflow: Workflow = WorkflowService().publish_workflow( session=session, app_model=app_model, @@ -53,17 +79,16 @@ class AppBundleService: marked_name=marked_name, marked_comment=marked_comment, ) - - # 2. Publish assets (bound to workflow_id) AppAssetPackageService.publish( session=session, app_model=app_model, account_id=account.id, workflow_id=workflow.id, ) - return workflow + # ========== Export ========== + @staticmethod def export_bundle( *, @@ -73,14 +98,14 @@ class AppBundleService: workflow_id: str | None = None, expires_in: int = 10 * 60, ) -> BundleExportResult: - """Export bundle and return a temporary download URL. - - Uses sandbox VM to build the ZIP, avoiding memory pressure in API process. - """ + """Export bundle with manifest.json and return a temporary download URL.""" tenant_id = app_model.tenant_id app_id = app_model.id safe_name = AppBundleService._sanitize_filename(app_model.name) - filename = f"{safe_name}.zip" + + dsl_filename = f"{safe_name}.yml" + app_assets = AppAssetService.get_assets_by_version(tenant_id, app_id, workflow_id) + manifest = BundleManifest.from_tree(app_assets.asset_tree, dsl_filename) export_id = uuid4().hex export_path = AssetPath.bundle_export_zip(tenant_id, app_id, export_id) @@ -95,147 +120,170 @@ class AppBundleService: with ZipSandbox(tenant_id=tenant_id, user_id=account_id, app_id="app-bundle-export") as zs: zs.write_file(f"bundle_root/{safe_name}.yml", dsl_content.encode("utf-8")) + zs.write_file(f"bundle_root/{MANIFEST_FILENAME}", manifest.model_dump_json(indent=2).encode("utf-8")) - # Published assets: use stored source zip and unzip into /... if workflow_id is not None: source_zip_path = AssetPath.source_zip(tenant_id, app_id, workflow_id) source_url = asset_storage.get_download_url(source_zip_path, expires_in) zs.download_archive(source_url, path="tmp/source_assets.zip") zs.unzip(archive_path="tmp/source_assets.zip", dest_dir=f"bundle_root/{safe_name}") else: - # Draft assets: download individual files and place under /... asset_items = AppAssetService.get_draft_assets(tenant_id, app_id) - asset_urls = asset_storage.get_download_urls( - [AssetPath.draft(tenant_id, app_id, a.asset_id) for a in asset_items], expires_in - ) - zs.download_items( - [ - SandboxDownloadItem(url=url, path=f"{safe_name}/{a.path}") - for a, url in zip(asset_items, asset_urls, strict=True) - ], - dest_dir="bundle_root", - ) + if asset_items: + asset_urls = asset_storage.get_download_urls( + [AssetPath.draft(tenant_id, app_id, a.asset_id) for a in asset_items], expires_in + ) + zs.download_items( + [ + SandboxDownloadItem(url=url, path=f"{safe_name}/{a.path}") + for a, url in zip(asset_items, asset_urls, strict=True) + ], + dest_dir="bundle_root", + ) archive = zs.zip(src="bundle_root", include_base=False) zs.upload(archive, upload_url) download_url = asset_storage.get_download_url(export_path, expires_in) - return BundleExportResult(download_url=download_url, filename=filename) + return BundleExportResult(download_url=download_url, filename=f"{safe_name}.zip") + + # ========== Import ========== @staticmethod - def import_bundle( + def prepare_import(tenant_id: str, account_id: str) -> ImportPrepareResult: + """Prepare import: generate import_id and upload URL.""" + import_id = uuid4().hex + import_path = AssetPath.bundle_import_zip(tenant_id, import_id) + asset_storage = AppAssetService.get_storage() + upload_url = asset_storage.get_import_upload_url(import_path, _IMPORT_TTL_SECONDS) + + redis_client.setex( + f"{_IMPORT_REDIS_PREFIX}{import_id}", + _IMPORT_TTL_SECONDS, + json.dumps({"tenant_id": tenant_id, "account_id": account_id}), + ) + + return ImportPrepareResult(import_id=import_id, upload_url=upload_url) + + @staticmethod + def confirm_import( + import_id: str, account: Account, - zip_bytes: bytes, + *, name: str | None = None, description: str | None = None, icon_type: str | None = None, icon: str | None = None, icon_background: str | None = None, ) -> Import: - if len(zip_bytes) > BUNDLE_MAX_SIZE: - raise BundleFormatError(f"Bundle size exceeds limit: {BUNDLE_MAX_SIZE} bytes") + """Confirm import: download zip in sandbox, extract, and upload assets.""" + redis_key = f"{_IMPORT_REDIS_PREFIX}{import_id}" + redis_data = redis_client.get(redis_key) + if not redis_data: + raise BundleFormatError("Import session expired or not found") - dsl_content, assets_prefix = AppBundleService._extract_dsl_from_bundle(zip_bytes) + import_meta = json.loads(redis_data) + tenant_id: str = import_meta["tenant_id"] - with Session(db.engine) as session: - dsl_service = AppDslService(session) - import_result = dsl_service.import_app( + if tenant_id != account.current_tenant_id: + raise BundleFormatError("Import session tenant mismatch") + + import_path = AssetPath.bundle_import_zip(tenant_id, import_id) + asset_storage = AppAssetService.get_storage() + + try: + result = AppBundleService.import_bundle( + tenant_id=tenant_id, account=account, - import_mode="yaml-content", - yaml_content=dsl_content, + import_path=import_path, + asset_storage=asset_storage, name=name, description=description, icon_type=icon_type, icon=icon, icon_background=icon_background, - app_id=None, ) - session.commit() + finally: + redis_client.delete(redis_key) + asset_storage.delete_import_zip(import_path) - if import_result.app_id and assets_prefix: - AppBundleService._import_assets_from_bundle( - zip_bytes=zip_bytes, - assets_prefix=assets_prefix, - app_id=import_result.app_id, - account_id=account.id, - ) + return result + + @staticmethod + def import_bundle( + *, + tenant_id: str, + account: Account, + import_path: BundleImportZipPath, + asset_storage: AppAssetStorage, + name: str | None, + description: str | None, + icon_type: str | None, + icon: str | None, + icon_background: str | None, + ) -> Import: + """Execute import in sandbox.""" + download_url = asset_storage.get_import_download_url(import_path, _IMPORT_TTL_SECONDS) + + with ZipSandbox(tenant_id=tenant_id, user_id=account.id, app_id="app-bundle-import") as zs: + zs.download_archive(download_url, path="import.zip") + zs.unzip(archive_path="import.zip", dest_dir="bundle") + + manifest_bytes = zs.read_file(f"bundle/{MANIFEST_FILENAME}") + try: + manifest = BundleManifest.model_validate_json(manifest_bytes) + except ValidationError as e: + raise BundleFormatError(f"Invalid manifest.json: {e}") from e + + dsl_content = zs.read_file(f"bundle/{manifest.dsl_filename}").decode("utf-8") + + with Session(db.engine) as session: + dsl_service = AppDslService(session) + import_result = dsl_service.import_app( + account=account, + import_mode="yaml-content", + yaml_content=dsl_content, + name=name, + description=description, + icon_type=icon_type, + icon=icon, + icon_background=icon_background, + app_id=None, + ) + session.commit() + + if not import_result.app_id: + return import_result + + app_id = import_result.app_id + tree = manifest.app_assets.tree + + upload_items: list[SandboxUploadItem] = [] + for file_entry in manifest.files: + asset_path = AssetPath.draft(tenant_id, app_id, file_entry.node_id) + file_upload_url = asset_storage.get_upload_url(asset_path, _IMPORT_TTL_SECONDS) + src_path = f"{manifest.assets_prefix}/{file_entry.path}" + upload_items.append(SandboxUploadItem(path=src_path, url=file_upload_url)) + + if upload_items: + zs.upload_items(upload_items, src_dir="bundle") + + # Tree sizes are already set from manifest; no need to update + app_model = db.session.query(App).filter(App.id == app_id).first() + if app_model: + AppAssetService.set_draft_assets( + app_model=app_model, + account_id=account.id, + new_tree=tree, + ) return import_result - @staticmethod - def _extract_dsl_from_bundle(zip_bytes: bytes) -> tuple[str, str | None]: - dsl_content: str | None = None - dsl_filename: str | None = None - - with zipfile.ZipFile(io.BytesIO(zip_bytes), "r") as zf: - for info in zf.infolist(): - if info.is_dir(): - continue - if BUNDLE_DSL_FILENAME_PATTERN.match(info.filename): - if dsl_content is not None: - raise BundleFormatError("Multiple DSL files found in bundle") - dsl_content = zf.read(info).decode("utf-8") - dsl_filename = info.filename - - if dsl_content is None or dsl_filename is None: - raise BundleFormatError("No DSL file (*.yml or *.yaml) found in bundle root") - - yaml.safe_load(dsl_content) - - assets_prefix = dsl_filename.rsplit(".", 1)[0] - has_assets = AppBundleService._check_assets_prefix_exists(zip_bytes, assets_prefix) - - return dsl_content, assets_prefix if has_assets else None - - @staticmethod - def _check_assets_prefix_exists(zip_bytes: bytes, prefix: str) -> bool: - with zipfile.ZipFile(io.BytesIO(zip_bytes), "r") as zf: - for info in zf.infolist(): - if info.filename.startswith(f"{prefix}/"): - return True - return False - - @staticmethod - def _import_assets_from_bundle( - zip_bytes: bytes, - assets_prefix: str, - app_id: str, - account_id: str, - ) -> None: - app_model = db.session.query(App).filter(App.id == app_id).first() - if not app_model: - logger.warning("App not found for asset import: %s", app_id) - return - - asset_storage = AppAssetService.get_storage() - extractor = SourceZipExtractor(asset_storage) - try: - folders, files = extractor.extract_entries( - zip_bytes, - expected_prefix=f"{assets_prefix}/", - ) - except ZipSecurityError as e: - logger.warning("Zip security error during asset import: %s", e) - return - - if not folders and not files: - return - - new_tree = extractor.build_tree_and_save( - folders=folders, - files=files, - tenant_id=app_model.tenant_id, - app_id=app_model.id, - ) - - AppAssetService.set_draft_assets( - app_model=app_model, - account_id=account_id, - new_tree=new_tree, - ) + # ========== Helpers ========== @staticmethod def _sanitize_filename(name: str) -> str: + """Sanitize app name for use as filename.""" safe = re.sub(r'[<>:"/\\|?*\x00-\x1f]', "_", name) safe = safe.strip(". ") return safe[:100] if safe else "app" diff --git a/api/services/dataset_service.py b/api/services/dataset_service.py index be9a0e9279..0b3fcbe4ae 100644 --- a/api/services/dataset_service.py +++ b/api/services/dataset_service.py @@ -89,6 +89,7 @@ from tasks.disable_segments_from_index_task import disable_segments_from_index_t from tasks.document_indexing_update_task import document_indexing_update_task from tasks.enable_segments_to_index_task import enable_segments_to_index_task from tasks.recover_document_indexing_task import recover_document_indexing_task +from tasks.regenerate_summary_index_task import regenerate_summary_index_task from tasks.remove_document_from_index_task import remove_document_from_index_task from tasks.retry_document_indexing_task import retry_document_indexing_task from tasks.sync_website_document_indexing_task import sync_website_document_indexing_task @@ -211,6 +212,7 @@ class DatasetService: embedding_model_provider: str | None = None, embedding_model_name: str | None = None, retrieval_model: RetrievalModel | None = None, + summary_index_setting: dict | None = None, ): # check if dataset name already exists if db.session.query(Dataset).filter_by(name=name, tenant_id=tenant_id).first(): @@ -253,6 +255,8 @@ class DatasetService: dataset.retrieval_model = retrieval_model.model_dump() if retrieval_model else None dataset.permission = permission or DatasetPermissionEnum.ONLY_ME dataset.provider = provider + if summary_index_setting is not None: + dataset.summary_index_setting = summary_index_setting db.session.add(dataset) db.session.flush() @@ -476,6 +480,11 @@ class DatasetService: if external_retrieval_model: dataset.retrieval_model = external_retrieval_model + # Update summary index setting if provided + summary_index_setting = data.get("summary_index_setting", None) + if summary_index_setting is not None: + dataset.summary_index_setting = summary_index_setting + # Update basic dataset properties dataset.name = data.get("name", dataset.name) dataset.description = data.get("description", dataset.description) @@ -564,6 +573,9 @@ class DatasetService: # update Retrieval model if data.get("retrieval_model"): filtered_data["retrieval_model"] = data["retrieval_model"] + # update summary index setting + if data.get("summary_index_setting"): + filtered_data["summary_index_setting"] = data.get("summary_index_setting") # update icon info if data.get("icon_info"): filtered_data["icon_info"] = data.get("icon_info") @@ -572,12 +584,27 @@ class DatasetService: db.session.query(Dataset).filter_by(id=dataset.id).update(filtered_data) db.session.commit() + # Reload dataset to get updated values + db.session.refresh(dataset) + # update pipeline knowledge base node data DatasetService._update_pipeline_knowledge_base_node_data(dataset, user.id) # Trigger vector index task if indexing technique changed if action: deal_dataset_vector_index_task.delay(dataset.id, action) + # If embedding_model changed, also regenerate summary vectors + if action == "update": + regenerate_summary_index_task.delay( + dataset.id, + regenerate_reason="embedding_model_changed", + regenerate_vectors_only=True, + ) + + # Note: summary_index_setting changes do not trigger automatic regeneration of existing summaries. + # The new setting will only apply to: + # 1. New documents added after the setting change + # 2. Manual summary generation requests return dataset @@ -616,6 +643,7 @@ class DatasetService: knowledge_index_node_data["chunk_structure"] = dataset.chunk_structure knowledge_index_node_data["indexing_technique"] = dataset.indexing_technique # pyright: ignore[reportAttributeAccessIssue] knowledge_index_node_data["keyword_number"] = dataset.keyword_number + knowledge_index_node_data["summary_index_setting"] = dataset.summary_index_setting node["data"] = knowledge_index_node_data updated = True except Exception: @@ -854,6 +882,54 @@ class DatasetService: ) filtered_data["collection_binding_id"] = dataset_collection_binding.id + @staticmethod + def _check_summary_index_setting_model_changed(dataset: Dataset, data: dict[str, Any]) -> bool: + """ + Check if summary_index_setting model (model_name or model_provider_name) has changed. + + Args: + dataset: Current dataset object + data: Update data dictionary + + Returns: + bool: True if summary model changed, False otherwise + """ + # Check if summary_index_setting is being updated + if "summary_index_setting" not in data or data.get("summary_index_setting") is None: + return False + + new_summary_setting = data.get("summary_index_setting") + old_summary_setting = dataset.summary_index_setting + + # If new setting is disabled, no need to regenerate + if not new_summary_setting or not new_summary_setting.get("enable"): + return False + + # If old setting doesn't exist, no need to regenerate (no existing summaries to regenerate) + # Note: This task only regenerates existing summaries, not generates new ones + if not old_summary_setting: + return False + + # Compare model_name and model_provider_name + old_model_name = old_summary_setting.get("model_name") + old_model_provider = old_summary_setting.get("model_provider_name") + new_model_name = new_summary_setting.get("model_name") + new_model_provider = new_summary_setting.get("model_provider_name") + + # Check if model changed + if old_model_name != new_model_name or old_model_provider != new_model_provider: + logger.info( + "Summary index setting model changed for dataset %s: old=%s/%s, new=%s/%s", + dataset.id, + old_model_provider, + old_model_name, + new_model_provider, + new_model_name, + ) + return True + + return False + @staticmethod def update_rag_pipeline_dataset_settings( session: Session, dataset: Dataset, knowledge_configuration: KnowledgeConfiguration, has_published: bool = False @@ -889,6 +965,9 @@ class DatasetService: else: raise ValueError("Invalid index method") dataset.retrieval_model = knowledge_configuration.retrieval_model.model_dump() + # Update summary_index_setting if provided + if knowledge_configuration.summary_index_setting is not None: + dataset.summary_index_setting = knowledge_configuration.summary_index_setting session.add(dataset) else: if dataset.chunk_structure and dataset.chunk_structure != knowledge_configuration.chunk_structure: @@ -994,6 +1073,9 @@ class DatasetService: if dataset.keyword_number != knowledge_configuration.keyword_number: dataset.keyword_number = knowledge_configuration.keyword_number dataset.retrieval_model = knowledge_configuration.retrieval_model.model_dump() + # Update summary_index_setting if provided + if knowledge_configuration.summary_index_setting is not None: + dataset.summary_index_setting = knowledge_configuration.summary_index_setting session.add(dataset) session.commit() if action: @@ -1314,6 +1396,50 @@ class DocumentService: upload_file = DocumentService._get_upload_file_for_upload_file_document(document) return file_helpers.get_signed_file_url(upload_file_id=upload_file.id, as_attachment=True) + @staticmethod + def enrich_documents_with_summary_index_status( + documents: Sequence[Document], + dataset: Dataset, + tenant_id: str, + ) -> None: + """ + Enrich documents with summary_index_status based on dataset summary index settings. + + This method calculates and sets the summary_index_status for each document that needs summary. + Documents that don't need summary or when summary index is disabled will have status set to None. + + Args: + documents: List of Document instances to enrich + dataset: Dataset instance containing summary_index_setting + tenant_id: Tenant ID for summary status lookup + """ + # Check if dataset has summary index enabled + has_summary_index = dataset.summary_index_setting and dataset.summary_index_setting.get("enable") is True + + # Filter documents that need summary calculation + documents_need_summary = [doc for doc in documents if doc.need_summary is True] + document_ids_need_summary = [str(doc.id) for doc in documents_need_summary] + + # Calculate summary_index_status for documents that need summary (only if dataset summary index is enabled) + summary_status_map: dict[str, str | None] = {} + if has_summary_index and document_ids_need_summary: + from services.summary_index_service import SummaryIndexService + + summary_status_map = SummaryIndexService.get_documents_summary_index_status( + document_ids=document_ids_need_summary, + dataset_id=dataset.id, + tenant_id=tenant_id, + ) + + # Add summary_index_status to each document + for document in documents: + if has_summary_index and document.need_summary is True: + # Get status from map, default to None (not queued yet) + document.summary_index_status = summary_status_map.get(str(document.id)) # type: ignore[attr-defined] + else: + # Return null if summary index is not enabled or document doesn't need summary + document.summary_index_status = None # type: ignore[attr-defined] + @staticmethod def prepare_document_batch_download_zip( *, @@ -1964,6 +2090,8 @@ class DocumentService: DuplicateDocumentIndexingTaskProxy( dataset.tenant_id, dataset.id, duplicate_document_ids ).delay() + # Note: Summary index generation is triggered in document_indexing_task after indexing completes + # to ensure segments are available. See tasks/document_indexing_task.py except LockNotOwnedError: pass @@ -2268,6 +2396,11 @@ class DocumentService: name: str, batch: str, ): + # Set need_summary based on dataset's summary_index_setting + need_summary = False + if dataset.summary_index_setting and dataset.summary_index_setting.get("enable") is True: + need_summary = True + document = Document( tenant_id=dataset.tenant_id, dataset_id=dataset.id, @@ -2281,6 +2414,7 @@ class DocumentService: created_by=account.id, doc_form=document_form, doc_language=document_language, + need_summary=need_summary, ) doc_metadata = {} if dataset.built_in_field_enabled: @@ -2505,6 +2639,7 @@ class DocumentService: embedding_model_provider=knowledge_config.embedding_model_provider, collection_binding_id=dataset_collection_binding_id, retrieval_model=retrieval_model.model_dump() if retrieval_model else None, + summary_index_setting=knowledge_config.summary_index_setting, is_multimodal=knowledge_config.is_multimodal, ) @@ -2686,6 +2821,14 @@ class DocumentService: if not isinstance(args["process_rule"]["rules"]["segmentation"]["max_tokens"], int): raise ValueError("Process rule segmentation max_tokens is invalid") + # valid summary index setting + summary_index_setting = args["process_rule"].get("summary_index_setting") + if summary_index_setting and summary_index_setting.get("enable"): + if "model_name" not in summary_index_setting or not summary_index_setting["model_name"]: + raise ValueError("Summary index model name is required") + if "model_provider_name" not in summary_index_setting or not summary_index_setting["model_provider_name"]: + raise ValueError("Summary index model provider name is required") + @staticmethod def batch_update_document_status( dataset: Dataset, document_ids: list[str], action: Literal["enable", "disable", "archive", "un_archive"], user @@ -3154,6 +3297,35 @@ class SegmentService: if args.enabled or keyword_changed: # update segment vector index VectorService.update_segment_vector(args.keywords, segment, dataset) + # update summary index if summary is provided and has changed + if args.summary is not None: + # When user manually provides summary, allow saving even if summary_index_setting doesn't exist + # summary_index_setting is only needed for LLM generation, not for manual summary vectorization + # Vectorization uses dataset.embedding_model, which doesn't require summary_index_setting + if dataset.indexing_technique == "high_quality": + # Query existing summary from database + from models.dataset import DocumentSegmentSummary + + existing_summary = ( + db.session.query(DocumentSegmentSummary) + .where( + DocumentSegmentSummary.chunk_id == segment.id, + DocumentSegmentSummary.dataset_id == dataset.id, + ) + .first() + ) + + # Check if summary has changed + existing_summary_content = existing_summary.summary_content if existing_summary else None + if existing_summary_content != args.summary: + # Summary has changed, update it + from services.summary_index_service import SummaryIndexService + + try: + SummaryIndexService.update_summary_for_segment(segment, dataset, args.summary) + except Exception: + logger.exception("Failed to update summary for segment %s", segment.id) + # Don't fail the entire update if summary update fails else: segment_hash = helper.generate_text_hash(content) tokens = 0 @@ -3228,6 +3400,73 @@ class SegmentService: elif document.doc_form in (IndexStructureType.PARAGRAPH_INDEX, IndexStructureType.QA_INDEX): # update segment vector index VectorService.update_segment_vector(args.keywords, segment, dataset) + # Handle summary index when content changed + if dataset.indexing_technique == "high_quality": + from models.dataset import DocumentSegmentSummary + + existing_summary = ( + db.session.query(DocumentSegmentSummary) + .where( + DocumentSegmentSummary.chunk_id == segment.id, + DocumentSegmentSummary.dataset_id == dataset.id, + ) + .first() + ) + + if args.summary is None: + # User didn't provide summary, auto-regenerate if segment previously had summary + # Auto-regeneration only happens if summary_index_setting exists and enable is True + if ( + existing_summary + and dataset.summary_index_setting + and dataset.summary_index_setting.get("enable") is True + ): + # Segment previously had summary, regenerate it with new content + from services.summary_index_service import SummaryIndexService + + try: + SummaryIndexService.generate_and_vectorize_summary( + segment, dataset, dataset.summary_index_setting + ) + logger.info("Auto-regenerated summary for segment %s after content change", segment.id) + except Exception: + logger.exception("Failed to auto-regenerate summary for segment %s", segment.id) + # Don't fail the entire update if summary regeneration fails + else: + # User provided summary, check if it has changed + # Manual summary updates are allowed even if summary_index_setting doesn't exist + existing_summary_content = existing_summary.summary_content if existing_summary else None + if existing_summary_content != args.summary: + # Summary has changed, use user-provided summary + from services.summary_index_service import SummaryIndexService + + try: + SummaryIndexService.update_summary_for_segment(segment, dataset, args.summary) + logger.info("Updated summary for segment %s with user-provided content", segment.id) + except Exception: + logger.exception("Failed to update summary for segment %s", segment.id) + # Don't fail the entire update if summary update fails + else: + # Summary hasn't changed, regenerate based on new content + # Auto-regeneration only happens if summary_index_setting exists and enable is True + if ( + existing_summary + and dataset.summary_index_setting + and dataset.summary_index_setting.get("enable") is True + ): + from services.summary_index_service import SummaryIndexService + + try: + SummaryIndexService.generate_and_vectorize_summary( + segment, dataset, dataset.summary_index_setting + ) + logger.info( + "Regenerated summary for segment %s after content change (summary unchanged)", + segment.id, + ) + except Exception: + logger.exception("Failed to regenerate summary for segment %s", segment.id) + # Don't fail the entire update if summary regeneration fails # update multimodel vector index VectorService.update_multimodel_vector(segment, args.attachment_ids or [], dataset) except Exception as e: @@ -3616,6 +3855,39 @@ class SegmentService: ) return result if isinstance(result, DocumentSegment) else None + @classmethod + def get_segments_by_document_and_dataset( + cls, + document_id: str, + dataset_id: str, + status: str | None = None, + enabled: bool | None = None, + ) -> Sequence[DocumentSegment]: + """ + Get segments for a document in a dataset with optional filtering. + + Args: + document_id: Document ID + dataset_id: Dataset ID + status: Optional status filter (e.g., "completed") + enabled: Optional enabled filter (True/False) + + Returns: + Sequence of DocumentSegment instances + """ + query = select(DocumentSegment).where( + DocumentSegment.document_id == document_id, + DocumentSegment.dataset_id == dataset_id, + ) + + if status is not None: + query = query.where(DocumentSegment.status == status) + + if enabled is not None: + query = query.where(DocumentSegment.enabled == enabled) + + return db.session.scalars(query).all() + class DatasetCollectionBindingService: @classmethod diff --git a/api/services/entities/knowledge_entities/knowledge_entities.py b/api/services/entities/knowledge_entities/knowledge_entities.py index 7959734e89..8dc5b93501 100644 --- a/api/services/entities/knowledge_entities/knowledge_entities.py +++ b/api/services/entities/knowledge_entities/knowledge_entities.py @@ -119,6 +119,7 @@ class KnowledgeConfig(BaseModel): data_source: DataSource | None = None process_rule: ProcessRule | None = None retrieval_model: RetrievalModel | None = None + summary_index_setting: dict | None = None doc_form: str = "text_model" doc_language: str = "English" embedding_model: str | None = None @@ -141,6 +142,7 @@ class SegmentUpdateArgs(BaseModel): regenerate_child_chunks: bool = False enabled: bool | None = None attachment_ids: list[str] | None = None + summary: str | None = None # Summary content for summary index class ChildChunkUpdateArgs(BaseModel): diff --git a/api/services/entities/knowledge_entities/rag_pipeline_entities.py b/api/services/entities/knowledge_entities/rag_pipeline_entities.py index cbb0efcc2a..041ae4edba 100644 --- a/api/services/entities/knowledge_entities/rag_pipeline_entities.py +++ b/api/services/entities/knowledge_entities/rag_pipeline_entities.py @@ -116,6 +116,8 @@ class KnowledgeConfiguration(BaseModel): embedding_model: str = "" keyword_number: int | None = 10 retrieval_model: RetrievalSetting + # add summary index setting + summary_index_setting: dict | None = None @field_validator("embedding_model_provider", mode="before") @classmethod diff --git a/api/services/rag_pipeline/rag_pipeline_dsl_service.py b/api/services/rag_pipeline/rag_pipeline_dsl_service.py index c1c6e204fb..be1ce834f6 100644 --- a/api/services/rag_pipeline/rag_pipeline_dsl_service.py +++ b/api/services/rag_pipeline/rag_pipeline_dsl_service.py @@ -343,6 +343,9 @@ class RagPipelineDslService: dataset.embedding_model_provider = knowledge_configuration.embedding_model_provider elif knowledge_configuration.indexing_technique == "economy": dataset.keyword_number = knowledge_configuration.keyword_number + # Update summary_index_setting if provided + if knowledge_configuration.summary_index_setting is not None: + dataset.summary_index_setting = knowledge_configuration.summary_index_setting dataset.pipeline_id = pipeline.id self._session.add(dataset) self._session.commit() @@ -477,6 +480,9 @@ class RagPipelineDslService: dataset.embedding_model_provider = knowledge_configuration.embedding_model_provider elif knowledge_configuration.indexing_technique == "economy": dataset.keyword_number = knowledge_configuration.keyword_number + # Update summary_index_setting if provided + if knowledge_configuration.summary_index_setting is not None: + dataset.summary_index_setting = knowledge_configuration.summary_index_setting dataset.pipeline_id = pipeline.id self._session.add(dataset) self._session.commit() diff --git a/api/services/sandbox/sandbox_service.py b/api/services/sandbox/sandbox_service.py index f462a239b2..6229f0ba14 100644 --- a/api/services/sandbox/sandbox_service.py +++ b/api/services/sandbox/sandbox_service.py @@ -10,6 +10,7 @@ from core.sandbox.initializer.draft_app_assets_initializer import DraftAppAssets from core.sandbox.initializer.skill_initializer import SkillInitializer from core.sandbox.sandbox import Sandbox from core.sandbox.storage.archive_storage import ArchiveSandboxStorage +from extensions.ext_storage import storage from services.app_asset_package_service import AppAssetPackageService from services.app_asset_service import AppAssetService @@ -30,7 +31,7 @@ class SandboxService: if not assets: raise ValueError(f"No assets found for tid={tenant_id}, app_id={app_id}") - storage = ArchiveSandboxStorage(tenant_id, workflow_execution_id) + archive_storage = ArchiveSandboxStorage(tenant_id, workflow_execution_id, storage.storage_runner) sandbox = ( SandboxBuilder(tenant_id, SandboxType(sandbox_provider.provider_type)) .options(sandbox_provider.config) @@ -40,7 +41,7 @@ class SandboxService: .initializer(AppAssetsInitializer(tenant_id, app_id, assets.id)) .initializer(DifyCliInitializer(tenant_id, user_id, app_id, assets.id)) .initializer(SkillInitializer(tenant_id, user_id, app_id, assets.id)) - .storage(storage, assets.id) + .storage(archive_storage, assets.id) .build() ) @@ -49,8 +50,8 @@ class SandboxService: @classmethod def delete_draft_storage(cls, tenant_id: str, user_id: str) -> None: - storage = ArchiveSandboxStorage(tenant_id, SandboxBuilder.draft_id(user_id)) - storage.delete() + archive_storage = ArchiveSandboxStorage(tenant_id, SandboxBuilder.draft_id(user_id), storage.storage_runner) + archive_storage.delete() @classmethod def create_draft( @@ -66,7 +67,9 @@ class SandboxService: AppAssetPackageService.build_assets(tenant_id, app_id, assets) sandbox_id = SandboxBuilder.draft_id(user_id) - storage = ArchiveSandboxStorage(tenant_id, sandbox_id, exclude_patterns=[AppAssets.PATH]) + archive_storage = ArchiveSandboxStorage( + tenant_id, sandbox_id, storage.storage_runner, exclude_patterns=[AppAssets.PATH] + ) sandbox = ( SandboxBuilder(tenant_id, SandboxType(sandbox_provider.provider_type)) @@ -77,7 +80,7 @@ class SandboxService: .initializer(DraftAppAssetsInitializer(tenant_id, app_id, assets.id)) .initializer(DifyCliInitializer(tenant_id, user_id, app_id, assets.id)) .initializer(SkillInitializer(tenant_id, user_id, app_id, assets.id)) - .storage(storage, assets.id) + .storage(archive_storage, assets.id) .build() ) @@ -98,7 +101,9 @@ class SandboxService: AppAssetPackageService.build_assets(tenant_id, app_id, assets) sandbox_id = SandboxBuilder.draft_id(user_id) - storage = ArchiveSandboxStorage(tenant_id, sandbox_id, exclude_patterns=[AppAssets.PATH]) + archive_storage = ArchiveSandboxStorage( + tenant_id, sandbox_id, storage.storage_runner, exclude_patterns=[AppAssets.PATH] + ) sandbox = ( SandboxBuilder(tenant_id, SandboxType(sandbox_provider.provider_type)) @@ -109,7 +114,7 @@ class SandboxService: .initializer(DraftAppAssetsInitializer(tenant_id, app_id, assets.id)) .initializer(DifyCliInitializer(tenant_id, user_id, app_id, assets.id)) .initializer(SkillInitializer(tenant_id, user_id, app_id, assets.id)) - .storage(storage, assets.id) + .storage(archive_storage, assets.id) .build() ) diff --git a/api/services/storage_ticket_service.py b/api/services/storage_ticket_service.py new file mode 100644 index 0000000000..e5242517fe --- /dev/null +++ b/api/services/storage_ticket_service.py @@ -0,0 +1,159 @@ +"""Storage ticket service for generating opaque download/upload URLs. + +This service provides a ticket-based approach for file access. Instead of exposing +the real storage key in URLs, it generates a random UUID token and stores the mapping +in Redis with a TTL. + +Usage: + from services.storage_ticket_service import StorageTicketService + + # Generate a download ticket + url = StorageTicketService.create_download_url("path/to/file.txt", expires_in=300) + + # Generate an upload ticket + url = StorageTicketService.create_upload_url("path/to/file.txt", expires_in=300, max_bytes=10*1024*1024) + +URL format: + {FILES_URL}/files/storage-tickets/{token} + +The token is validated by looking up the Redis key, which contains: + - op: "download" or "upload" + - storage_key: the real storage path + - max_bytes: (upload only) maximum allowed upload size + - filename: suggested filename for Content-Disposition header +""" + +import json +import logging +from dataclasses import dataclass +from uuid import uuid4 + +from configs import dify_config +from extensions.ext_redis import redis_client + +logger = logging.getLogger(__name__) + +TICKET_KEY_PREFIX = "storage_files" +DEFAULT_DOWNLOAD_TTL = 300 # 5 minutes +DEFAULT_UPLOAD_TTL = 300 # 5 minutes +DEFAULT_MAX_UPLOAD_BYTES = 100 * 1024 * 1024 # 100MB + + +@dataclass +class StorageTicket: + """Represents a storage access ticket.""" + + op: str # "download" or "upload" + storage_key: str + max_bytes: int | None = None # upload only + filename: str | None = None # suggested filename for download + + def to_dict(self) -> dict: + data = {"op": self.op, "storage_key": self.storage_key} + if self.max_bytes is not None: + data["max_bytes"] = str(self.max_bytes) + if self.filename is not None: + data["filename"] = self.filename + return data + + @classmethod + def from_dict(cls, data: dict) -> "StorageTicket": + return cls( + op=data["op"], + storage_key=data["storage_key"], + max_bytes=data.get("max_bytes"), + filename=data.get("filename"), + ) + + +class StorageTicketService: + """Service for creating and validating storage access tickets.""" + + @classmethod + def create_download_url( + cls, + storage_key: str, + *, + expires_in: int = DEFAULT_DOWNLOAD_TTL, + filename: str | None = None, + ) -> str: + """Create a download ticket and return the URL. + + Args: + storage_key: The real storage path + expires_in: TTL in seconds (default 300) + filename: Suggested filename for Content-Disposition header + + Returns: + Full URL with token + """ + if filename is None: + filename = storage_key.rsplit("/", 1)[-1] + + ticket = StorageTicket(op="download", storage_key=storage_key, filename=filename) + token = cls._store_ticket(ticket, expires_in) + return cls._build_url(token) + + @classmethod + def create_upload_url( + cls, + storage_key: str, + *, + expires_in: int = DEFAULT_UPLOAD_TTL, + max_bytes: int = DEFAULT_MAX_UPLOAD_BYTES, + ) -> str: + """Create an upload ticket and return the URL. + + Args: + storage_key: The real storage path + expires_in: TTL in seconds (default 300) + max_bytes: Maximum allowed upload size in bytes + + Returns: + Full URL with token + """ + ticket = StorageTicket(op="upload", storage_key=storage_key, max_bytes=max_bytes) + token = cls._store_ticket(ticket, expires_in) + return cls._build_url(token) + + @classmethod + def get_ticket(cls, token: str) -> StorageTicket | None: + """Retrieve a ticket by token. + + Args: + token: The UUID token from the URL + + Returns: + StorageTicket if found and valid, None otherwise + """ + key = cls._ticket_key(token) + try: + data = redis_client.get(key) + if data is None: + return None + if isinstance(data, bytes): + data = data.decode("utf-8") + return StorageTicket.from_dict(json.loads(data)) + except Exception: + logger.warning("Failed to retrieve storage ticket: %s", token, exc_info=True) + return None + + @classmethod + def _store_ticket(cls, ticket: StorageTicket, ttl: int) -> str: + """Store a ticket in Redis and return the token.""" + token = str(uuid4()) + key = cls._ticket_key(token) + value = json.dumps(ticket.to_dict()) + redis_client.setex(key, ttl, value) + return token + + @classmethod + def _ticket_key(cls, token: str) -> str: + """Generate Redis key for a token.""" + return f"{TICKET_KEY_PREFIX}:{token}" + + @classmethod + def _build_url(cls, token: str) -> str: + """Build the full URL for a token.""" + base_url = dify_config.FILES_URL + return f"{base_url}/files/storage-files/{token}" diff --git a/api/services/summary_index_service.py b/api/services/summary_index_service.py new file mode 100644 index 0000000000..b8e1f8bc3f --- /dev/null +++ b/api/services/summary_index_service.py @@ -0,0 +1,1432 @@ +"""Summary index service for generating and managing document segment summaries.""" + +import logging +import time +import uuid +from datetime import UTC, datetime +from typing import Any + +from sqlalchemy.orm import Session + +from core.db.session_factory import session_factory +from core.model_manager import ModelManager +from core.model_runtime.entities.llm_entities import LLMUsage +from core.model_runtime.entities.model_entities import ModelType +from core.rag.datasource.vdb.vector_factory import Vector +from core.rag.index_processor.constant.doc_type import DocType +from core.rag.models.document import Document +from libs import helper +from models.dataset import Dataset, DocumentSegment, DocumentSegmentSummary +from models.dataset import Document as DatasetDocument + +logger = logging.getLogger(__name__) + + +class SummaryIndexService: + """Service for generating and managing summary indexes.""" + + @staticmethod + def generate_summary_for_segment( + segment: DocumentSegment, + dataset: Dataset, + summary_index_setting: dict, + ) -> tuple[str, LLMUsage]: + """ + Generate summary for a single segment. + + Args: + segment: DocumentSegment to generate summary for + dataset: Dataset containing the segment + summary_index_setting: Summary index configuration + + Returns: + Tuple of (summary_content, llm_usage) where llm_usage is LLMUsage object + + Raises: + ValueError: If summary_index_setting is invalid or generation fails + """ + # Reuse the existing generate_summary method from ParagraphIndexProcessor + # Use lazy import to avoid circular import + from core.rag.index_processor.processor.paragraph_index_processor import ParagraphIndexProcessor + + summary_content, usage = ParagraphIndexProcessor.generate_summary( + tenant_id=dataset.tenant_id, + text=segment.content, + summary_index_setting=summary_index_setting, + segment_id=segment.id, + ) + + if not summary_content: + raise ValueError("Generated summary is empty") + + return summary_content, usage + + @staticmethod + def create_summary_record( + segment: DocumentSegment, + dataset: Dataset, + summary_content: str, + status: str = "generating", + ) -> DocumentSegmentSummary: + """ + Create or update a DocumentSegmentSummary record. + If a summary record already exists for this segment, it will be updated instead of creating a new one. + + Args: + segment: DocumentSegment to create summary for + dataset: Dataset containing the segment + summary_content: Generated summary content + status: Summary status (default: "generating") + + Returns: + Created or updated DocumentSegmentSummary instance + """ + with session_factory.create_session() as session: + # Check if summary record already exists + existing_summary = ( + session.query(DocumentSegmentSummary).filter_by(chunk_id=segment.id, dataset_id=dataset.id).first() + ) + + if existing_summary: + # Update existing record + existing_summary.summary_content = summary_content + existing_summary.status = status + existing_summary.error = None # type: ignore[assignment] # Clear any previous errors + # Re-enable if it was disabled + if not existing_summary.enabled: + existing_summary.enabled = True + existing_summary.disabled_at = None + existing_summary.disabled_by = None + session.add(existing_summary) + session.flush() + return existing_summary + else: + # Create new record (enabled by default) + summary_record = DocumentSegmentSummary( + dataset_id=dataset.id, + document_id=segment.document_id, + chunk_id=segment.id, + summary_content=summary_content, + status=status, + enabled=True, # Explicitly set enabled to True + ) + session.add(summary_record) + session.flush() + return summary_record + + @staticmethod + def vectorize_summary( + summary_record: DocumentSegmentSummary, + segment: DocumentSegment, + dataset: Dataset, + session: Session | None = None, + ) -> None: + """ + Vectorize summary and store in vector database. + + Args: + summary_record: DocumentSegmentSummary record + segment: Original DocumentSegment + dataset: Dataset containing the segment + session: Optional SQLAlchemy session. If provided, uses this session instead of creating a new one. + If not provided, creates a new session and commits automatically. + """ + if dataset.indexing_technique != "high_quality": + logger.warning( + "Summary vectorization skipped for dataset %s: indexing_technique is not high_quality", + dataset.id, + ) + return + + # Get summary_record_id for later session queries + summary_record_id = summary_record.id + # Save the original session parameter for use in error handling + original_session = session + logger.debug( + "Starting vectorization for segment %s, summary_record_id=%s, using_provided_session=%s", + segment.id, + summary_record_id, + original_session is not None, + ) + + # Reuse existing index_node_id if available (like segment does), otherwise generate new one + old_summary_node_id = summary_record.summary_index_node_id + if old_summary_node_id: + # Reuse existing index_node_id (like segment behavior) + summary_index_node_id = old_summary_node_id + logger.debug("Reusing existing index_node_id %s for segment %s", summary_index_node_id, segment.id) + else: + # Generate new index node ID only for new summaries + summary_index_node_id = str(uuid.uuid4()) + logger.debug("Generated new index_node_id %s for segment %s", summary_index_node_id, segment.id) + + # Always regenerate hash (in case summary content changed) + summary_content = summary_record.summary_content + if not summary_content or not summary_content.strip(): + raise ValueError(f"Summary content is empty for segment {segment.id}, cannot vectorize") + summary_hash = helper.generate_text_hash(summary_content) + + # Delete old vector only if we're reusing the same index_node_id (to overwrite) + # If index_node_id changed, the old vector should have been deleted elsewhere + if old_summary_node_id and old_summary_node_id == summary_index_node_id: + try: + vector = Vector(dataset) + vector.delete_by_ids([old_summary_node_id]) + except Exception as e: + logger.warning( + "Failed to delete old summary vector for segment %s: %s. Continuing with new vectorization.", + segment.id, + str(e), + ) + + # Calculate embedding tokens for summary (for logging and statistics) + embedding_tokens = 0 + try: + model_manager = ModelManager() + embedding_model = model_manager.get_model_instance( + tenant_id=dataset.tenant_id, + provider=dataset.embedding_model_provider, + model_type=ModelType.TEXT_EMBEDDING, + model=dataset.embedding_model, + ) + if embedding_model: + tokens_list = embedding_model.get_text_embedding_num_tokens([summary_content]) + embedding_tokens = tokens_list[0] if tokens_list else 0 + except Exception as e: + logger.warning("Failed to calculate embedding tokens for summary: %s", str(e)) + + # Create document with summary content and metadata + summary_document = Document( + page_content=summary_content, + metadata={ + "doc_id": summary_index_node_id, + "doc_hash": summary_hash, + "dataset_id": dataset.id, + "document_id": segment.document_id, + "original_chunk_id": segment.id, # Key: link to original chunk + "doc_type": DocType.TEXT, + "is_summary": True, # Identifier for summary documents + }, + ) + + # Vectorize and store with retry mechanism for connection errors + max_retries = 3 + retry_delay = 2.0 + + for attempt in range(max_retries): + try: + logger.debug( + "Attempting to vectorize summary for segment %s (attempt %s/%s)", + segment.id, + attempt + 1, + max_retries, + ) + vector = Vector(dataset) + # Use duplicate_check=False to ensure re-vectorization even if old vector still exists + # The old vector should have been deleted above, but if deletion failed, + # we still want to re-vectorize (upsert will overwrite) + vector.add_texts([summary_document], duplicate_check=False) + logger.debug( + "Successfully added summary vector to database for segment %s (attempt %s/%s)", + segment.id, + attempt + 1, + max_retries, + ) + + # Log embedding token usage + if embedding_tokens > 0: + logger.info( + "Summary embedding for segment %s used %s tokens", + segment.id, + embedding_tokens, + ) + + # Success - update summary record with index node info + # Use provided session if available, otherwise create a new one + use_provided_session = session is not None + if not use_provided_session: + logger.debug("Creating new session for vectorization of segment %s", segment.id) + session_context = session_factory.create_session() + session = session_context.__enter__() + else: + logger.debug("Using provided session for vectorization of segment %s", segment.id) + session_context = None # Don't use context manager for provided session + + # At this point, session is guaranteed to be not None + # Type narrowing: session is definitely not None after the if/else above + if session is None: + raise RuntimeError("Session should not be None at this point") + + try: + # Declare summary_record_in_session variable + summary_record_in_session: DocumentSegmentSummary | None + + # If using provided session, merge the summary_record into it + if use_provided_session: + # Merge the summary_record into the provided session + logger.debug( + "Merging summary_record (id=%s) into provided session for segment %s", + summary_record_id, + segment.id, + ) + summary_record_in_session = session.merge(summary_record) + logger.debug( + "Successfully merged summary_record for segment %s, merged_id=%s", + segment.id, + summary_record_in_session.id, + ) + else: + # Query the summary record in the new session + logger.debug( + "Querying summary_record by id=%s for segment %s in new session", + summary_record_id, + segment.id, + ) + summary_record_in_session = ( + session.query(DocumentSegmentSummary).filter_by(id=summary_record_id).first() + ) + + if not summary_record_in_session: + # Record not found - try to find by chunk_id and dataset_id instead + logger.debug( + "Summary record not found by id=%s, trying chunk_id=%s and dataset_id=%s " + "for segment %s", + summary_record_id, + segment.id, + dataset.id, + segment.id, + ) + summary_record_in_session = ( + session.query(DocumentSegmentSummary) + .filter_by(chunk_id=segment.id, dataset_id=dataset.id) + .first() + ) + + if not summary_record_in_session: + # Still not found - create a new one using the parameter data + logger.warning( + "Summary record not found in database for segment %s (id=%s), creating new one. " + "This may indicate a session isolation issue.", + segment.id, + summary_record_id, + ) + summary_record_in_session = DocumentSegmentSummary( + id=summary_record_id, # Use the same ID if available + dataset_id=dataset.id, + document_id=segment.document_id, + chunk_id=segment.id, + summary_content=summary_content, + summary_index_node_id=summary_index_node_id, + summary_index_node_hash=summary_hash, + tokens=embedding_tokens, + status="completed", + enabled=True, + ) + session.add(summary_record_in_session) + logger.info( + "Created new summary record (id=%s) for segment %s after vectorization", + summary_record_id, + segment.id, + ) + else: + # Found by chunk_id - update it + logger.info( + "Found summary record for segment %s by chunk_id " + "(id mismatch: expected %s, found %s). " + "This may indicate the record was created in a different session.", + segment.id, + summary_record_id, + summary_record_in_session.id, + ) + else: + logger.debug( + "Found summary_record (id=%s) for segment %s in new session", + summary_record_id, + segment.id, + ) + + # At this point, summary_record_in_session is guaranteed to be not None + if summary_record_in_session is None: + raise RuntimeError("summary_record_in_session should not be None at this point") + + # Update all fields including summary_content + # Always use the summary_content from the parameter (which is the latest from outer session) + # rather than relying on what's in the database, in case outer session hasn't committed yet + summary_record_in_session.summary_index_node_id = summary_index_node_id + summary_record_in_session.summary_index_node_hash = summary_hash + summary_record_in_session.tokens = embedding_tokens # Save embedding tokens + summary_record_in_session.status = "completed" + # Ensure summary_content is preserved (use the latest from summary_record parameter) + # This is critical: use the parameter value, not the database value + summary_record_in_session.summary_content = summary_content + # Explicitly update updated_at to ensure it's refreshed even if other fields haven't changed + summary_record_in_session.updated_at = datetime.now(UTC).replace(tzinfo=None) + session.add(summary_record_in_session) + + # Only commit if we created the session ourselves + if not use_provided_session: + logger.debug("Committing session for segment %s (self-created session)", segment.id) + session.commit() + logger.debug("Successfully committed session for segment %s", segment.id) + else: + # When using provided session, flush to ensure changes are written to database + # This prevents refresh() from overwriting our changes + logger.debug( + "Flushing session for segment %s (using provided session, caller will commit)", + segment.id, + ) + session.flush() + logger.debug("Successfully flushed session for segment %s", segment.id) + # If using provided session, let the caller handle commit + + logger.info( + "Successfully vectorized summary for segment %s, index_node_id=%s, index_node_hash=%s, " + "tokens=%s, summary_record_id=%s, use_provided_session=%s", + segment.id, + summary_index_node_id, + summary_hash, + embedding_tokens, + summary_record_in_session.id, + use_provided_session, + ) + # Update the original object for consistency + summary_record.summary_index_node_id = summary_index_node_id + summary_record.summary_index_node_hash = summary_hash + summary_record.tokens = embedding_tokens + summary_record.status = "completed" + summary_record.summary_content = summary_content + if summary_record_in_session.updated_at: + summary_record.updated_at = summary_record_in_session.updated_at + finally: + # Only close session if we created it ourselves + if not use_provided_session and session_context: + session_context.__exit__(None, None, None) + # Success, exit function + return + + except (ConnectionError, Exception) as e: + error_str = str(e).lower() + # Check if it's a connection-related error that might be transient + is_connection_error = any( + keyword in error_str + for keyword in [ + "connection", + "disconnected", + "timeout", + "network", + "could not connect", + "server disconnected", + "weaviate", + ] + ) + + if is_connection_error and attempt < max_retries - 1: + # Retry for connection errors + wait_time = retry_delay * (2**attempt) # Exponential backoff + logger.warning( + "Vectorization attempt %s/%s failed for segment %s (connection error): %s. " + "Retrying in %.1f seconds...", + attempt + 1, + max_retries, + segment.id, + str(e), + wait_time, + ) + time.sleep(wait_time) + continue + else: + # Final attempt failed or non-connection error - log and update status + logger.error( + "Failed to vectorize summary for segment %s after %s attempts: %s. " + "summary_record_id=%s, index_node_id=%s, use_provided_session=%s", + segment.id, + attempt + 1, + str(e), + summary_record_id, + summary_index_node_id, + session is not None, + exc_info=True, + ) + # Update error status in session + # Use the original_session saved at function start (the function parameter) + logger.debug( + "Updating error status for segment %s, summary_record_id=%s, has_original_session=%s", + segment.id, + summary_record_id, + original_session is not None, + ) + # Always create a new session for error handling to avoid issues with closed sessions + # Even if original_session was provided, we create a new one for safety + with session_factory.create_session() as error_session: + # Try to find the record by id first + # Note: Using assignment only (no type annotation) to avoid redeclaration error + summary_record_in_session = ( + error_session.query(DocumentSegmentSummary).filter_by(id=summary_record_id).first() + ) + if not summary_record_in_session: + # Try to find by chunk_id and dataset_id + logger.debug( + "Summary record not found by id=%s, trying chunk_id=%s and dataset_id=%s " + "for segment %s", + summary_record_id, + segment.id, + dataset.id, + segment.id, + ) + summary_record_in_session = ( + error_session.query(DocumentSegmentSummary) + .filter_by(chunk_id=segment.id, dataset_id=dataset.id) + .first() + ) + + if summary_record_in_session: + summary_record_in_session.status = "error" + summary_record_in_session.error = f"Vectorization failed: {str(e)}" + summary_record_in_session.updated_at = datetime.now(UTC).replace(tzinfo=None) + error_session.add(summary_record_in_session) + error_session.commit() + logger.info( + "Updated error status in new session for segment %s, record_id=%s", + segment.id, + summary_record_in_session.id, + ) + # Update the original object for consistency + summary_record.status = "error" + summary_record.error = summary_record_in_session.error + summary_record.updated_at = summary_record_in_session.updated_at + else: + logger.warning( + "Could not update error status: summary record not found for segment %s (id=%s). " + "This may indicate a session isolation issue.", + segment.id, + summary_record_id, + ) + raise + + @staticmethod + def batch_create_summary_records( + segments: list[DocumentSegment], + dataset: Dataset, + status: str = "not_started", + ) -> None: + """ + Batch create summary records for segments with specified status. + If a record already exists, update its status. + + Args: + segments: List of DocumentSegment instances + dataset: Dataset containing the segments + status: Initial status for the records (default: "not_started") + """ + segment_ids = [segment.id for segment in segments] + if not segment_ids: + return + + with session_factory.create_session() as session: + # Query existing summary records + existing_summaries = ( + session.query(DocumentSegmentSummary) + .filter( + DocumentSegmentSummary.chunk_id.in_(segment_ids), + DocumentSegmentSummary.dataset_id == dataset.id, + ) + .all() + ) + existing_summary_map = {summary.chunk_id: summary for summary in existing_summaries} + + # Create or update records + for segment in segments: + existing_summary = existing_summary_map.get(segment.id) + if existing_summary: + # Update existing record + existing_summary.status = status + existing_summary.error = None # type: ignore[assignment] # Clear any previous errors + if not existing_summary.enabled: + existing_summary.enabled = True + existing_summary.disabled_at = None + existing_summary.disabled_by = None + session.add(existing_summary) + else: + # Create new record + summary_record = DocumentSegmentSummary( + dataset_id=dataset.id, + document_id=segment.document_id, + chunk_id=segment.id, + summary_content=None, # Will be filled later + status=status, + enabled=True, + ) + session.add(summary_record) + + @staticmethod + def update_summary_record_error( + segment: DocumentSegment, + dataset: Dataset, + error: str, + ) -> None: + """ + Update summary record with error status. + + Args: + segment: DocumentSegment + dataset: Dataset containing the segment + error: Error message + """ + with session_factory.create_session() as session: + summary_record = ( + session.query(DocumentSegmentSummary).filter_by(chunk_id=segment.id, dataset_id=dataset.id).first() + ) + + if summary_record: + summary_record.status = "error" + summary_record.error = error + session.add(summary_record) + session.commit() + else: + logger.warning("Summary record not found for segment %s when updating error", segment.id) + + @staticmethod + def generate_and_vectorize_summary( + segment: DocumentSegment, + dataset: Dataset, + summary_index_setting: dict, + ) -> DocumentSegmentSummary: + """ + Generate summary for a segment and vectorize it. + Assumes summary record already exists (created by batch_create_summary_records). + + Args: + segment: DocumentSegment to generate summary for + dataset: Dataset containing the segment + summary_index_setting: Summary index configuration + + Returns: + Created DocumentSegmentSummary instance + + Raises: + ValueError: If summary generation fails + """ + with session_factory.create_session() as session: + try: + # Get or refresh summary record in this session + summary_record_in_session = ( + session.query(DocumentSegmentSummary).filter_by(chunk_id=segment.id, dataset_id=dataset.id).first() + ) + + if not summary_record_in_session: + # If not found, create one + logger.warning("Summary record not found for segment %s, creating one", segment.id) + summary_record_in_session = DocumentSegmentSummary( + dataset_id=dataset.id, + document_id=segment.document_id, + chunk_id=segment.id, + summary_content="", + status="generating", + enabled=True, + ) + session.add(summary_record_in_session) + session.flush() + + # Update status to "generating" + summary_record_in_session.status = "generating" + summary_record_in_session.error = None # type: ignore[assignment] + session.add(summary_record_in_session) + # Don't flush here - wait until after vectorization succeeds + + # Generate summary (returns summary_content and llm_usage) + summary_content, llm_usage = SummaryIndexService.generate_summary_for_segment( + segment, dataset, summary_index_setting + ) + + # Update summary content + summary_record_in_session.summary_content = summary_content + session.add(summary_record_in_session) + # Flush to ensure summary_content is saved before vectorize_summary queries it + session.flush() + + # Log LLM usage for summary generation + if llm_usage and llm_usage.total_tokens > 0: + logger.info( + "Summary generation for segment %s used %s tokens (prompt: %s, completion: %s)", + segment.id, + llm_usage.total_tokens, + llm_usage.prompt_tokens, + llm_usage.completion_tokens, + ) + + # Vectorize summary (will delete old vector if exists before creating new one) + # Pass the session-managed record to vectorize_summary + # vectorize_summary will update status to "completed" and tokens in its own session + # vectorize_summary will also ensure summary_content is preserved + try: + # Pass the session to vectorize_summary to avoid session isolation issues + SummaryIndexService.vectorize_summary(summary_record_in_session, segment, dataset, session=session) + # Refresh the object from database to get the updated status and tokens from vectorize_summary + session.refresh(summary_record_in_session) + # Commit the session + # (summary_record_in_session should have status="completed" and tokens from refresh) + session.commit() + logger.info("Successfully generated and vectorized summary for segment %s", segment.id) + return summary_record_in_session + except Exception as vectorize_error: + # If vectorization fails, update status to error in current session + logger.exception("Failed to vectorize summary for segment %s", segment.id) + summary_record_in_session.status = "error" + summary_record_in_session.error = f"Vectorization failed: {str(vectorize_error)}" + session.add(summary_record_in_session) + session.commit() + raise + + except Exception as e: + logger.exception("Failed to generate summary for segment %s", segment.id) + # Update summary record with error status + summary_record_in_session = ( + session.query(DocumentSegmentSummary).filter_by(chunk_id=segment.id, dataset_id=dataset.id).first() + ) + if summary_record_in_session: + summary_record_in_session.status = "error" + summary_record_in_session.error = str(e) + session.add(summary_record_in_session) + session.commit() + raise + + @staticmethod + def generate_summaries_for_document( + dataset: Dataset, + document: DatasetDocument, + summary_index_setting: dict, + segment_ids: list[str] | None = None, + only_parent_chunks: bool = False, + ) -> list[DocumentSegmentSummary]: + """ + Generate summaries for all segments in a document including vectorization. + + Args: + dataset: Dataset containing the document + document: DatasetDocument to generate summaries for + summary_index_setting: Summary index configuration + segment_ids: Optional list of specific segment IDs to process + only_parent_chunks: If True, only process parent chunks (for parent-child mode) + + Returns: + List of created DocumentSegmentSummary instances + """ + # Only generate summary index for high_quality indexing technique + if dataset.indexing_technique != "high_quality": + logger.info( + "Skipping summary generation for dataset %s: indexing_technique is %s, not 'high_quality'", + dataset.id, + dataset.indexing_technique, + ) + return [] + + if not summary_index_setting or not summary_index_setting.get("enable"): + logger.info("Summary index is disabled for dataset %s", dataset.id) + return [] + + # Skip qa_model documents + if document.doc_form == "qa_model": + logger.info("Skipping summary generation for qa_model document %s", document.id) + return [] + + logger.info( + "Starting summary generation for document %s in dataset %s, segment_ids: %s, only_parent_chunks: %s", + document.id, + dataset.id, + len(segment_ids) if segment_ids else "all", + only_parent_chunks, + ) + + with session_factory.create_session() as session: + # Query segments (only enabled segments) + query = session.query(DocumentSegment).filter_by( + dataset_id=dataset.id, + document_id=document.id, + status="completed", + enabled=True, # Only generate summaries for enabled segments + ) + + if segment_ids: + query = query.filter(DocumentSegment.id.in_(segment_ids)) + + segments = query.all() + + if not segments: + logger.info("No segments found for document %s", document.id) + return [] + + # Batch create summary records with "not_started" status before processing + # This ensures all records exist upfront, allowing status tracking + SummaryIndexService.batch_create_summary_records( + segments=segments, + dataset=dataset, + status="not_started", + ) + session.commit() # Commit initial records + + summary_records = [] + + for segment in segments: + # For parent-child mode, only process parent chunks + # In parent-child mode, all DocumentSegments are parent chunks, + # so we process all of them. Child chunks are stored in ChildChunk table + # and are not DocumentSegments, so they won't be in the segments list. + # This check is mainly for clarity and future-proofing. + if only_parent_chunks: + # In parent-child mode, all segments in the query are parent chunks + # Child chunks are not DocumentSegments, so they won't appear here + # We can process all segments + pass + + try: + summary_record = SummaryIndexService.generate_and_vectorize_summary( + segment, dataset, summary_index_setting + ) + summary_records.append(summary_record) + except Exception as e: + logger.exception("Failed to generate summary for segment %s", segment.id) + # Update summary record with error status + SummaryIndexService.update_summary_record_error( + segment=segment, + dataset=dataset, + error=str(e), + ) + # Continue with other segments + continue + + logger.info( + "Completed summary generation for document %s: %s summaries generated and vectorized", + document.id, + len(summary_records), + ) + return summary_records + + @staticmethod + def disable_summaries_for_segments( + dataset: Dataset, + segment_ids: list[str] | None = None, + disabled_by: str | None = None, + ) -> None: + """ + Disable summary records and remove vectors from vector database for segments. + Unlike delete, this preserves the summary records but marks them as disabled. + + Args: + dataset: Dataset containing the segments + segment_ids: List of segment IDs to disable summaries for. If None, disable all. + disabled_by: User ID who disabled the summaries + """ + from libs.datetime_utils import naive_utc_now + + with session_factory.create_session() as session: + query = session.query(DocumentSegmentSummary).filter_by( + dataset_id=dataset.id, + enabled=True, # Only disable enabled summaries + ) + + if segment_ids: + query = query.filter(DocumentSegmentSummary.chunk_id.in_(segment_ids)) + + summaries = query.all() + + if not summaries: + return + + logger.info( + "Disabling %s summary records for dataset %s, segment_ids: %s", + len(summaries), + dataset.id, + len(segment_ids) if segment_ids else "all", + ) + + # Remove from vector database (but keep records) + if dataset.indexing_technique == "high_quality": + summary_node_ids = [s.summary_index_node_id for s in summaries if s.summary_index_node_id] + if summary_node_ids: + try: + vector = Vector(dataset) + vector.delete_by_ids(summary_node_ids) + except Exception as e: + logger.warning("Failed to remove summary vectors: %s", str(e)) + + # Disable summary records (don't delete) + now = naive_utc_now() + for summary in summaries: + summary.enabled = False + summary.disabled_at = now + summary.disabled_by = disabled_by + session.add(summary) + + session.commit() + logger.info("Disabled %s summary records for dataset %s", len(summaries), dataset.id) + + @staticmethod + def enable_summaries_for_segments( + dataset: Dataset, + segment_ids: list[str] | None = None, + ) -> None: + """ + Enable summary records and re-add vectors to vector database for segments. + + Note: This method enables summaries based on chunk status, not summary_index_setting.enable. + The summary_index_setting.enable flag only controls automatic generation, + not whether existing summaries can be used. + Summary.enabled should always be kept in sync with chunk.enabled. + + Args: + dataset: Dataset containing the segments + segment_ids: List of segment IDs to enable summaries for. If None, enable all. + """ + # Only enable summary index for high_quality indexing technique + if dataset.indexing_technique != "high_quality": + return + + with session_factory.create_session() as session: + query = session.query(DocumentSegmentSummary).filter_by( + dataset_id=dataset.id, + enabled=False, # Only enable disabled summaries + ) + + if segment_ids: + query = query.filter(DocumentSegmentSummary.chunk_id.in_(segment_ids)) + + summaries = query.all() + + if not summaries: + return + + logger.info( + "Enabling %s summary records for dataset %s, segment_ids: %s", + len(summaries), + dataset.id, + len(segment_ids) if segment_ids else "all", + ) + + # Re-vectorize and re-add to vector database + enabled_count = 0 + for summary in summaries: + # Get the original segment + segment = ( + session.query(DocumentSegment) + .filter_by( + id=summary.chunk_id, + dataset_id=dataset.id, + ) + .first() + ) + + # Summary.enabled stays in sync with chunk.enabled, + # only enable summary if the associated chunk is enabled. + if not segment or not segment.enabled or segment.status != "completed": + continue + + if not summary.summary_content: + continue + + try: + # Re-vectorize summary (this will update status and tokens in its own session) + # Pass the session to vectorize_summary to avoid session isolation issues + SummaryIndexService.vectorize_summary(summary, segment, dataset, session=session) + + # Refresh the object from database to get the updated status and tokens from vectorize_summary + session.refresh(summary) + + # Enable summary record + summary.enabled = True + summary.disabled_at = None + summary.disabled_by = None + session.add(summary) + enabled_count += 1 + except Exception: + logger.exception("Failed to re-vectorize summary %s", summary.id) + # Keep it disabled if vectorization fails + continue + + session.commit() + logger.info("Enabled %s summary records for dataset %s", enabled_count, dataset.id) + + @staticmethod + def delete_summaries_for_segments( + dataset: Dataset, + segment_ids: list[str] | None = None, + ) -> None: + """ + Delete summary records and vectors for segments (used only for actual deletion scenarios). + For disable/enable operations, use disable_summaries_for_segments/enable_summaries_for_segments. + + Args: + dataset: Dataset containing the segments + segment_ids: List of segment IDs to delete summaries for. If None, delete all. + """ + with session_factory.create_session() as session: + query = session.query(DocumentSegmentSummary).filter_by(dataset_id=dataset.id) + + if segment_ids: + query = query.filter(DocumentSegmentSummary.chunk_id.in_(segment_ids)) + + summaries = query.all() + + if not summaries: + return + + # Delete from vector database + if dataset.indexing_technique == "high_quality": + summary_node_ids = [s.summary_index_node_id for s in summaries if s.summary_index_node_id] + if summary_node_ids: + vector = Vector(dataset) + vector.delete_by_ids(summary_node_ids) + + # Delete summary records + for summary in summaries: + session.delete(summary) + + session.commit() + logger.info("Deleted %s summary records for dataset %s", len(summaries), dataset.id) + + @staticmethod + def update_summary_for_segment( + segment: DocumentSegment, + dataset: Dataset, + summary_content: str, + ) -> DocumentSegmentSummary | None: + """ + Update summary for a segment and re-vectorize it. + + Args: + segment: DocumentSegment to update summary for + dataset: Dataset containing the segment + summary_content: New summary content + + Returns: + Updated DocumentSegmentSummary instance, or None if indexing technique is not high_quality + """ + # Only update summary index for high_quality indexing technique + if dataset.indexing_technique != "high_quality": + return None + + # When user manually provides summary, allow saving even if summary_index_setting doesn't exist + # summary_index_setting is only needed for LLM generation, not for manual summary vectorization + # Vectorization uses dataset.embedding_model, which doesn't require summary_index_setting + + # Skip qa_model documents + if segment.document and segment.document.doc_form == "qa_model": + return None + + with session_factory.create_session() as session: + try: + # Check if summary_content is empty (whitespace-only strings are considered empty) + if not summary_content or not summary_content.strip(): + # If summary is empty, only delete existing summary vector and record + summary_record = ( + session.query(DocumentSegmentSummary) + .filter_by(chunk_id=segment.id, dataset_id=dataset.id) + .first() + ) + + if summary_record: + # Delete old vector if exists + old_summary_node_id = summary_record.summary_index_node_id + if old_summary_node_id: + try: + vector = Vector(dataset) + vector.delete_by_ids([old_summary_node_id]) + except Exception as e: + logger.warning( + "Failed to delete old summary vector for segment %s: %s", + segment.id, + str(e), + ) + + # Delete summary record since summary is empty + session.delete(summary_record) + session.commit() + logger.info("Deleted summary for segment %s (empty content provided)", segment.id) + return None + else: + # No existing summary record, nothing to do + logger.info("No summary record found for segment %s, nothing to delete", segment.id) + return None + + # Find existing summary record + summary_record = ( + session.query(DocumentSegmentSummary).filter_by(chunk_id=segment.id, dataset_id=dataset.id).first() + ) + + if summary_record: + # Update existing summary + old_summary_node_id = summary_record.summary_index_node_id + + # Update summary content + summary_record.summary_content = summary_content + summary_record.status = "generating" + summary_record.error = None # type: ignore[assignment] # Clear any previous errors + session.add(summary_record) + # Flush to ensure summary_content is saved before vectorize_summary queries it + session.flush() + + # Delete old vector if exists (before vectorization) + if old_summary_node_id: + try: + vector = Vector(dataset) + vector.delete_by_ids([old_summary_node_id]) + except Exception as e: + logger.warning( + "Failed to delete old summary vector for segment %s: %s", + segment.id, + str(e), + ) + + # Re-vectorize summary (this will update status to "completed" and tokens in its own session) + # vectorize_summary will also ensure summary_content is preserved + # Note: vectorize_summary may take time due to embedding API calls, but we need to complete it + # to ensure the summary is properly indexed + try: + # Pass the session to vectorize_summary to avoid session isolation issues + SummaryIndexService.vectorize_summary(summary_record, segment, dataset, session=session) + # Refresh the object from database to get the updated status and tokens from vectorize_summary + session.refresh(summary_record) + # Now commit the session (summary_record should have status="completed" and tokens from refresh) + session.commit() + logger.info("Successfully updated and re-vectorized summary for segment %s", segment.id) + return summary_record + except Exception as e: + # If vectorization fails, update status to error in current session + # Don't raise the exception - just log it and return the record with error status + # This allows the segment update to complete even if vectorization fails + summary_record.status = "error" + summary_record.error = f"Vectorization failed: {str(e)}" + session.commit() + logger.exception("Failed to vectorize summary for segment %s", segment.id) + # Return the record with error status instead of raising + # The caller can check the status if needed + return summary_record + else: + # Create new summary record if doesn't exist + summary_record = SummaryIndexService.create_summary_record( + segment, dataset, summary_content, status="generating" + ) + # Re-vectorize summary (this will update status to "completed" and tokens in its own session) + # Note: summary_record was created in a different session, + # so we need to merge it into current session + try: + # Merge the record into current session first (since it was created in a different session) + summary_record = session.merge(summary_record) + # Pass the session to vectorize_summary - it will update the merged record + SummaryIndexService.vectorize_summary(summary_record, segment, dataset, session=session) + # Refresh to get updated status and tokens from database + session.refresh(summary_record) + # Commit the session to persist the changes + session.commit() + logger.info("Successfully created and vectorized summary for segment %s", segment.id) + return summary_record + except Exception as e: + # If vectorization fails, update status to error in current session + # Merge the record into current session first + error_record = session.merge(summary_record) + error_record.status = "error" + error_record.error = f"Vectorization failed: {str(e)}" + session.commit() + logger.exception("Failed to vectorize summary for segment %s", segment.id) + # Return the record with error status instead of raising + return error_record + + except Exception as e: + logger.exception("Failed to update summary for segment %s", segment.id) + # Update summary record with error status if it exists + summary_record = ( + session.query(DocumentSegmentSummary).filter_by(chunk_id=segment.id, dataset_id=dataset.id).first() + ) + if summary_record: + summary_record.status = "error" + summary_record.error = str(e) + session.add(summary_record) + session.commit() + raise + + @staticmethod + def get_segment_summary(segment_id: str, dataset_id: str) -> DocumentSegmentSummary | None: + """ + Get summary for a single segment. + + Args: + segment_id: Segment ID (chunk_id) + dataset_id: Dataset ID + + Returns: + DocumentSegmentSummary instance if found, None otherwise + """ + with session_factory.create_session() as session: + return ( + session.query(DocumentSegmentSummary) + .where( + DocumentSegmentSummary.chunk_id == segment_id, + DocumentSegmentSummary.dataset_id == dataset_id, + DocumentSegmentSummary.enabled == True, # Only return enabled summaries + ) + .first() + ) + + @staticmethod + def get_segments_summaries(segment_ids: list[str], dataset_id: str) -> dict[str, DocumentSegmentSummary]: + """ + Get summaries for multiple segments. + + Args: + segment_ids: List of segment IDs (chunk_ids) + dataset_id: Dataset ID + + Returns: + Dictionary mapping segment_id to DocumentSegmentSummary (only enabled summaries) + """ + if not segment_ids: + return {} + + with session_factory.create_session() as session: + summary_records = ( + session.query(DocumentSegmentSummary) + .where( + DocumentSegmentSummary.chunk_id.in_(segment_ids), + DocumentSegmentSummary.dataset_id == dataset_id, + DocumentSegmentSummary.enabled == True, # Only return enabled summaries + ) + .all() + ) + + return {summary.chunk_id: summary for summary in summary_records} + + @staticmethod + def get_document_summaries( + document_id: str, dataset_id: str, segment_ids: list[str] | None = None + ) -> list[DocumentSegmentSummary]: + """ + Get all summary records for a document. + + Args: + document_id: Document ID + dataset_id: Dataset ID + segment_ids: Optional list of segment IDs to filter by + + Returns: + List of DocumentSegmentSummary instances (only enabled summaries) + """ + with session_factory.create_session() as session: + query = session.query(DocumentSegmentSummary).filter( + DocumentSegmentSummary.document_id == document_id, + DocumentSegmentSummary.dataset_id == dataset_id, + DocumentSegmentSummary.enabled == True, # Only return enabled summaries + ) + + if segment_ids: + query = query.filter(DocumentSegmentSummary.chunk_id.in_(segment_ids)) + + return query.all() + + @staticmethod + def get_document_summary_index_status(document_id: str, dataset_id: str, tenant_id: str) -> str | None: + """ + Get summary_index_status for a single document. + + Args: + document_id: Document ID + dataset_id: Dataset ID + tenant_id: Tenant ID + + Returns: + "SUMMARIZING" if there are pending summaries, None otherwise + """ + # Get all segments for this document (excluding qa_model and re_segment) + with session_factory.create_session() as session: + segments = ( + session.query(DocumentSegment.id) + .where( + DocumentSegment.document_id == document_id, + DocumentSegment.status != "re_segment", + DocumentSegment.tenant_id == tenant_id, + ) + .all() + ) + segment_ids = [seg.id for seg in segments] + + if not segment_ids: + return None + + # Get all summary records for these segments + summaries = SummaryIndexService.get_segments_summaries(segment_ids, dataset_id) + summary_status_map = {chunk_id: summary.status for chunk_id, summary in summaries.items()} + + # Check if there are any "not_started" or "generating" status summaries + has_pending_summaries = any( + summary_status_map.get(segment_id) is not None # Ensure summary exists (enabled=True) + and summary_status_map[segment_id] in ("not_started", "generating") + for segment_id in segment_ids + ) + + return "SUMMARIZING" if has_pending_summaries else None + + @staticmethod + def get_documents_summary_index_status( + document_ids: list[str], dataset_id: str, tenant_id: str + ) -> dict[str, str | None]: + """ + Get summary_index_status for multiple documents. + + Args: + document_ids: List of document IDs + dataset_id: Dataset ID + tenant_id: Tenant ID + + Returns: + Dictionary mapping document_id to summary_index_status ("SUMMARIZING" or None) + """ + if not document_ids: + return {} + + # Get all segments for these documents (excluding qa_model and re_segment) + with session_factory.create_session() as session: + segments = ( + session.query(DocumentSegment.id, DocumentSegment.document_id) + .where( + DocumentSegment.document_id.in_(document_ids), + DocumentSegment.status != "re_segment", + DocumentSegment.tenant_id == tenant_id, + ) + .all() + ) + + # Group segments by document_id + document_segments_map: dict[str, list[str]] = {} + for segment in segments: + doc_id = str(segment.document_id) + if doc_id not in document_segments_map: + document_segments_map[doc_id] = [] + document_segments_map[doc_id].append(segment.id) + + # Get all summary records for these segments + all_segment_ids = [seg.id for seg in segments] + summaries = SummaryIndexService.get_segments_summaries(all_segment_ids, dataset_id) + summary_status_map = {chunk_id: summary.status for chunk_id, summary in summaries.items()} + + # Calculate summary_index_status for each document + result: dict[str, str | None] = {} + for doc_id in document_ids: + segment_ids = document_segments_map.get(doc_id, []) + if not segment_ids: + # No segments, status is None (not started) + result[doc_id] = None + continue + + # Check if there are any "not_started" or "generating" status summaries + # Only check enabled=True summaries (already filtered in query) + # If segment has no summary record (summary_status_map.get returns None), + # it means the summary is disabled (enabled=False) or not created yet, ignore it + has_pending_summaries = any( + summary_status_map.get(segment_id) is not None # Ensure summary exists (enabled=True) + and summary_status_map[segment_id] in ("not_started", "generating") + for segment_id in segment_ids + ) + + if has_pending_summaries: + # Task is still running (not started or generating) + result[doc_id] = "SUMMARIZING" + else: + # All enabled=True summaries are "completed" or "error", task finished + # Or no enabled=True summaries exist (all disabled) + result[doc_id] = None + + return result + + @staticmethod + def get_document_summary_status_detail( + document_id: str, + dataset_id: str, + ) -> dict[str, Any]: + """ + Get detailed summary status for a document. + + Args: + document_id: Document ID + dataset_id: Dataset ID + + Returns: + Dictionary containing: + - total_segments: Total number of segments in the document + - summary_status: Dictionary with status counts + - completed: Number of summaries completed + - generating: Number of summaries being generated + - error: Number of summaries with errors + - not_started: Number of segments without summary records + - summaries: List of summary records with status and content preview + """ + from services.dataset_service import SegmentService + + # Get all segments for this document + segments = SegmentService.get_segments_by_document_and_dataset( + document_id=document_id, + dataset_id=dataset_id, + status="completed", + enabled=True, + ) + + total_segments = len(segments) + + # Get all summary records for these segments + segment_ids = [segment.id for segment in segments] + summaries = [] + if segment_ids: + summaries = SummaryIndexService.get_document_summaries( + document_id=document_id, + dataset_id=dataset_id, + segment_ids=segment_ids, + ) + + # Create a mapping of chunk_id to summary + summary_map = {summary.chunk_id: summary for summary in summaries} + + # Count statuses + status_counts = { + "completed": 0, + "generating": 0, + "error": 0, + "not_started": 0, + } + + summary_list = [] + for segment in segments: + summary = summary_map.get(segment.id) + if summary: + status = summary.status + status_counts[status] = status_counts.get(status, 0) + 1 + summary_list.append( + { + "segment_id": segment.id, + "segment_position": segment.position, + "status": summary.status, + "summary_preview": ( + summary.summary_content[:100] + "..." + if summary.summary_content and len(summary.summary_content) > 100 + else summary.summary_content + ), + "error": summary.error, + "created_at": int(summary.created_at.timestamp()) if summary.created_at else None, + "updated_at": int(summary.updated_at.timestamp()) if summary.updated_at else None, + } + ) + else: + status_counts["not_started"] += 1 + summary_list.append( + { + "segment_id": segment.id, + "segment_position": segment.position, + "status": "not_started", + "summary_preview": None, + "error": None, + "created_at": None, + "updated_at": None, + } + ) + + return { + "total_segments": total_segments, + "summary_status": status_counts, + "summaries": summary_list, + } diff --git a/api/tasks/add_document_to_index_task.py b/api/tasks/add_document_to_index_task.py index 62e6497e9d..2d3d00cd50 100644 --- a/api/tasks/add_document_to_index_task.py +++ b/api/tasks/add_document_to_index_task.py @@ -118,6 +118,19 @@ def add_document_to_index_task(dataset_document_id: str): ) session.commit() + # Enable summary indexes for all segments in this document + from services.summary_index_service import SummaryIndexService + + segment_ids_list = [segment.id for segment in segments] + if segment_ids_list: + try: + SummaryIndexService.enable_summaries_for_segments( + dataset=dataset, + segment_ids=segment_ids_list, + ) + except Exception as e: + logger.warning("Failed to enable summaries for document %s: %s", dataset_document.id, str(e)) + end_at = time.perf_counter() logger.info( click.style(f"Document added to index: {dataset_document.id} latency: {end_at - start_at}", fg="green") diff --git a/api/tasks/batch_clean_document_task.py b/api/tasks/batch_clean_document_task.py index 74b939e84d..d388284980 100644 --- a/api/tasks/batch_clean_document_task.py +++ b/api/tasks/batch_clean_document_task.py @@ -50,7 +50,9 @@ def batch_clean_document_task(document_ids: list[str], dataset_id: str, doc_form if segments: index_node_ids = [segment.index_node_id for segment in segments] index_processor = IndexProcessorFactory(doc_form).init_index_processor() - index_processor.clean(dataset, index_node_ids, with_keywords=True, delete_child_chunks=True) + index_processor.clean( + dataset, index_node_ids, with_keywords=True, delete_child_chunks=True, delete_summaries=True + ) for segment in segments: image_upload_file_ids = get_image_upload_file_ids(segment.content) diff --git a/api/tasks/clean_document_task.py b/api/tasks/clean_document_task.py index 86e7cc7160..91ace6be02 100644 --- a/api/tasks/clean_document_task.py +++ b/api/tasks/clean_document_task.py @@ -51,7 +51,9 @@ def clean_document_task(document_id: str, dataset_id: str, doc_form: str, file_i if segments: index_node_ids = [segment.index_node_id for segment in segments] index_processor = IndexProcessorFactory(doc_form).init_index_processor() - index_processor.clean(dataset, index_node_ids, with_keywords=True, delete_child_chunks=True) + index_processor.clean( + dataset, index_node_ids, with_keywords=True, delete_child_chunks=True, delete_summaries=True + ) for segment in segments: image_upload_file_ids = get_image_upload_file_ids(segment.content) diff --git a/api/tasks/clean_notion_document_task.py b/api/tasks/clean_notion_document_task.py index bcca1bf49f..4214f043e0 100644 --- a/api/tasks/clean_notion_document_task.py +++ b/api/tasks/clean_notion_document_task.py @@ -42,7 +42,9 @@ def clean_notion_document_task(document_ids: list[str], dataset_id: str): ).all() index_node_ids = [segment.index_node_id for segment in segments] - index_processor.clean(dataset, index_node_ids, with_keywords=True, delete_child_chunks=True) + index_processor.clean( + dataset, index_node_ids, with_keywords=True, delete_child_chunks=True, delete_summaries=True + ) segment_ids = [segment.id for segment in segments] segment_delete_stmt = delete(DocumentSegment).where(DocumentSegment.id.in_(segment_ids)) session.execute(segment_delete_stmt) diff --git a/api/tasks/delete_segment_from_index_task.py b/api/tasks/delete_segment_from_index_task.py index bfa709502c..764c635d83 100644 --- a/api/tasks/delete_segment_from_index_task.py +++ b/api/tasks/delete_segment_from_index_task.py @@ -47,6 +47,7 @@ def delete_segment_from_index_task( doc_form = dataset_document.doc_form # Proceed with index cleanup using the index_node_ids directly + # For actual deletion, we should delete summaries (not just disable them) index_processor = IndexProcessorFactory(doc_form).init_index_processor() index_processor.clean( dataset, @@ -54,6 +55,7 @@ def delete_segment_from_index_task( with_keywords=True, delete_child_chunks=True, precomputed_child_node_ids=child_node_ids, + delete_summaries=True, # Actually delete summaries when segment is deleted ) if dataset.is_multimodal: # delete segment attachment binding diff --git a/api/tasks/disable_segment_from_index_task.py b/api/tasks/disable_segment_from_index_task.py index 0ce6429a94..bc45171623 100644 --- a/api/tasks/disable_segment_from_index_task.py +++ b/api/tasks/disable_segment_from_index_task.py @@ -60,6 +60,18 @@ def disable_segment_from_index_task(segment_id: str): index_processor = IndexProcessorFactory(index_type).init_index_processor() index_processor.clean(dataset, [segment.index_node_id]) + # Disable summary index for this segment + from services.summary_index_service import SummaryIndexService + + try: + SummaryIndexService.disable_summaries_for_segments( + dataset=dataset, + segment_ids=[segment.id], + disabled_by=segment.disabled_by, + ) + except Exception as e: + logger.warning("Failed to disable summary for segment %s: %s", segment.id, str(e)) + end_at = time.perf_counter() logger.info( click.style( diff --git a/api/tasks/disable_segments_from_index_task.py b/api/tasks/disable_segments_from_index_task.py index 03635902d1..3cc267e821 100644 --- a/api/tasks/disable_segments_from_index_task.py +++ b/api/tasks/disable_segments_from_index_task.py @@ -68,6 +68,21 @@ def disable_segments_from_index_task(segment_ids: list, dataset_id: str, documen index_node_ids.extend(attachment_ids) index_processor.clean(dataset, index_node_ids, with_keywords=True, delete_child_chunks=False) + # Disable summary indexes for these segments + from services.summary_index_service import SummaryIndexService + + segment_ids_list = [segment.id for segment in segments] + try: + # Get disabled_by from first segment (they should all have the same disabled_by) + disabled_by = segments[0].disabled_by if segments else None + SummaryIndexService.disable_summaries_for_segments( + dataset=dataset, + segment_ids=segment_ids_list, + disabled_by=disabled_by, + ) + except Exception as e: + logger.warning("Failed to disable summaries for segments: %s", str(e)) + end_at = time.perf_counter() logger.info(click.style(f"Segments removed from index latency: {end_at - start_at}", fg="green")) except Exception: diff --git a/api/tasks/document_indexing_task.py b/api/tasks/document_indexing_task.py index 3bdff60196..34496e9c6f 100644 --- a/api/tasks/document_indexing_task.py +++ b/api/tasks/document_indexing_task.py @@ -14,6 +14,7 @@ from enums.cloud_plan import CloudPlan from libs.datetime_utils import naive_utc_now from models.dataset import Dataset, Document from services.feature_service import FeatureService +from tasks.generate_summary_index_task import generate_summary_index_task logger = logging.getLogger(__name__) @@ -99,6 +100,78 @@ def _document_indexing(dataset_id: str, document_ids: Sequence[str]): indexing_runner.run(documents) end_at = time.perf_counter() logger.info(click.style(f"Processed dataset: {dataset_id} latency: {end_at - start_at}", fg="green")) + + # Trigger summary index generation for completed documents if enabled + # Only generate for high_quality indexing technique and when summary_index_setting is enabled + # Re-query dataset to get latest summary_index_setting (in case it was updated) + dataset = session.query(Dataset).where(Dataset.id == dataset_id).first() + if not dataset: + logger.warning("Dataset %s not found after indexing", dataset_id) + return + + if dataset.indexing_technique == "high_quality": + summary_index_setting = dataset.summary_index_setting + if summary_index_setting and summary_index_setting.get("enable"): + # expire all session to get latest document's indexing status + session.expire_all() + # Check each document's indexing status and trigger summary generation if completed + for document_id in document_ids: + # Re-query document to get latest status (IndexingRunner may have updated it) + document = ( + session.query(Document) + .where(Document.id == document_id, Document.dataset_id == dataset_id) + .first() + ) + if document: + logger.info( + "Checking document %s for summary generation: status=%s, doc_form=%s, need_summary=%s", + document_id, + document.indexing_status, + document.doc_form, + document.need_summary, + ) + if ( + document.indexing_status == "completed" + and document.doc_form != "qa_model" + and document.need_summary is True + ): + try: + generate_summary_index_task.delay(dataset.id, document_id, None) + logger.info( + "Queued summary index generation task for document %s in dataset %s " + "after indexing completed", + document_id, + dataset.id, + ) + except Exception: + logger.exception( + "Failed to queue summary index generation task for document %s", + document_id, + ) + # Don't fail the entire indexing process if summary task queuing fails + else: + logger.info( + "Skipping summary generation for document %s: " + "status=%s, doc_form=%s, need_summary=%s", + document_id, + document.indexing_status, + document.doc_form, + document.need_summary, + ) + else: + logger.warning("Document %s not found after indexing", document_id) + else: + logger.info( + "Summary index generation skipped for dataset %s: summary_index_setting.enable=%s", + dataset.id, + summary_index_setting.get("enable") if summary_index_setting else None, + ) + else: + logger.info( + "Summary index generation skipped for dataset %s: indexing_technique=%s (not 'high_quality')", + dataset.id, + dataset.indexing_technique, + ) except DocumentIsPausedError as ex: logger.info(click.style(str(ex), fg="yellow")) except Exception: diff --git a/api/tasks/enable_segment_to_index_task.py b/api/tasks/enable_segment_to_index_task.py index 1f9f21aa7e..41ebb0b076 100644 --- a/api/tasks/enable_segment_to_index_task.py +++ b/api/tasks/enable_segment_to_index_task.py @@ -106,6 +106,17 @@ def enable_segment_to_index_task(segment_id: str): # save vector index index_processor.load(dataset, [document], multimodal_documents=multimodel_documents) + # Enable summary index for this segment + from services.summary_index_service import SummaryIndexService + + try: + SummaryIndexService.enable_summaries_for_segments( + dataset=dataset, + segment_ids=[segment.id], + ) + except Exception as e: + logger.warning("Failed to enable summary for segment %s: %s", segment.id, str(e)) + end_at = time.perf_counter() logger.info(click.style(f"Segment enabled to index: {segment.id} latency: {end_at - start_at}", fg="green")) except Exception as e: diff --git a/api/tasks/enable_segments_to_index_task.py b/api/tasks/enable_segments_to_index_task.py index 48d3c8e178..d90eb4c39f 100644 --- a/api/tasks/enable_segments_to_index_task.py +++ b/api/tasks/enable_segments_to_index_task.py @@ -106,6 +106,18 @@ def enable_segments_to_index_task(segment_ids: list, dataset_id: str, document_i # save vector index index_processor.load(dataset, documents, multimodal_documents=multimodal_documents) + # Enable summary indexes for these segments + from services.summary_index_service import SummaryIndexService + + segment_ids_list = [segment.id for segment in segments] + try: + SummaryIndexService.enable_summaries_for_segments( + dataset=dataset, + segment_ids=segment_ids_list, + ) + except Exception as e: + logger.warning("Failed to enable summaries for segments: %s", str(e)) + end_at = time.perf_counter() logger.info(click.style(f"Segments enabled to index latency: {end_at - start_at}", fg="green")) except Exception as e: diff --git a/api/tasks/generate_summary_index_task.py b/api/tasks/generate_summary_index_task.py new file mode 100644 index 0000000000..e4273e16b5 --- /dev/null +++ b/api/tasks/generate_summary_index_task.py @@ -0,0 +1,119 @@ +"""Async task for generating summary indexes.""" + +import logging +import time + +import click +from celery import shared_task + +from core.db.session_factory import session_factory +from models.dataset import Dataset, DocumentSegment +from models.dataset import Document as DatasetDocument +from services.summary_index_service import SummaryIndexService + +logger = logging.getLogger(__name__) + + +@shared_task(queue="dataset") +def generate_summary_index_task(dataset_id: str, document_id: str, segment_ids: list[str] | None = None): + """ + Async generate summary index for document segments. + + Args: + dataset_id: Dataset ID + document_id: Document ID + segment_ids: Optional list of specific segment IDs to process. If None, process all segments. + + Usage: + generate_summary_index_task.delay(dataset_id, document_id) + generate_summary_index_task.delay(dataset_id, document_id, segment_ids) + """ + logger.info( + click.style( + f"Start generating summary index for document {document_id} in dataset {dataset_id}", + fg="green", + ) + ) + start_at = time.perf_counter() + + try: + with session_factory.create_session() as session: + dataset = session.query(Dataset).where(Dataset.id == dataset_id).first() + if not dataset: + logger.error(click.style(f"Dataset not found: {dataset_id}", fg="red")) + return + + document = session.query(DatasetDocument).where(DatasetDocument.id == document_id).first() + if not document: + logger.error(click.style(f"Document not found: {document_id}", fg="red")) + return + + # Check if document needs summary + if not document.need_summary: + logger.info( + click.style( + f"Skipping summary generation for document {document_id}: need_summary is False", + fg="cyan", + ) + ) + return + + # Only generate summary index for high_quality indexing technique + if dataset.indexing_technique != "high_quality": + logger.info( + click.style( + f"Skipping summary generation for dataset {dataset_id}: " + f"indexing_technique is {dataset.indexing_technique}, not 'high_quality'", + fg="cyan", + ) + ) + return + + # Check if summary index is enabled + summary_index_setting = dataset.summary_index_setting + if not summary_index_setting or not summary_index_setting.get("enable"): + logger.info( + click.style( + f"Summary index is disabled for dataset {dataset_id}", + fg="cyan", + ) + ) + return + + # Determine if only parent chunks should be processed + only_parent_chunks = dataset.chunk_structure == "parent_child_index" + + # Generate summaries + summary_records = SummaryIndexService.generate_summaries_for_document( + dataset=dataset, + document=document, + summary_index_setting=summary_index_setting, + segment_ids=segment_ids, + only_parent_chunks=only_parent_chunks, + ) + + end_at = time.perf_counter() + logger.info( + click.style( + f"Summary index generation completed for document {document_id}: " + f"{len(summary_records)} summaries generated, latency: {end_at - start_at}", + fg="green", + ) + ) + + except Exception as e: + logger.exception("Failed to generate summary index for document %s", document_id) + # Update document segments with error status if needed + if segment_ids: + error_message = f"Summary generation failed: {str(e)}" + with session_factory.create_session() as session: + session.query(DocumentSegment).filter( + DocumentSegment.id.in_(segment_ids), + DocumentSegment.dataset_id == dataset_id, + ).update( + { + DocumentSegment.error: error_message, + }, + synchronize_session=False, + ) + session.commit() diff --git a/api/tasks/regenerate_summary_index_task.py b/api/tasks/regenerate_summary_index_task.py new file mode 100644 index 0000000000..cf8988d13e --- /dev/null +++ b/api/tasks/regenerate_summary_index_task.py @@ -0,0 +1,315 @@ +"""Task for regenerating summary indexes when dataset settings change.""" + +import logging +import time +from collections import defaultdict + +import click +from celery import shared_task +from sqlalchemy import or_, select + +from core.db.session_factory import session_factory +from models.dataset import Dataset, DocumentSegment, DocumentSegmentSummary +from models.dataset import Document as DatasetDocument +from services.summary_index_service import SummaryIndexService + +logger = logging.getLogger(__name__) + + +@shared_task(queue="dataset") +def regenerate_summary_index_task( + dataset_id: str, + regenerate_reason: str = "summary_model_changed", + regenerate_vectors_only: bool = False, +): + """ + Regenerate summary indexes for all documents in a dataset. + + This task is triggered when: + 1. summary_index_setting model changes (regenerate_reason="summary_model_changed") + - Regenerates summary content and vectors for all existing summaries + 2. embedding_model changes (regenerate_reason="embedding_model_changed") + - Only regenerates vectors for existing summaries (keeps summary content) + + Args: + dataset_id: Dataset ID + regenerate_reason: Reason for regeneration ("summary_model_changed" or "embedding_model_changed") + regenerate_vectors_only: If True, only regenerate vectors without regenerating summary content + """ + logger.info( + click.style( + f"Start regenerate summary index for dataset {dataset_id}, reason: {regenerate_reason}", + fg="green", + ) + ) + start_at = time.perf_counter() + + try: + with session_factory.create_session() as session: + dataset = session.query(Dataset).filter_by(id=dataset_id).first() + if not dataset: + logger.error(click.style(f"Dataset not found: {dataset_id}", fg="red")) + return + + # Only regenerate summary index for high_quality indexing technique + if dataset.indexing_technique != "high_quality": + logger.info( + click.style( + f"Skipping summary regeneration for dataset {dataset_id}: " + f"indexing_technique is {dataset.indexing_technique}, not 'high_quality'", + fg="cyan", + ) + ) + return + + # Check if summary index is enabled (only for summary_model change) + # For embedding_model change, we still re-vectorize existing summaries even if setting is disabled + summary_index_setting = dataset.summary_index_setting + if not regenerate_vectors_only: + # For summary_model change, require summary_index_setting to be enabled + if not summary_index_setting or not summary_index_setting.get("enable"): + logger.info( + click.style( + f"Summary index is disabled for dataset {dataset_id}", + fg="cyan", + ) + ) + return + + total_segments_processed = 0 + total_segments_failed = 0 + + if regenerate_vectors_only: + # For embedding_model change: directly query all segments with existing summaries + # Don't require document indexing_status == "completed" + # Include summaries with status "completed" or "error" (if they have content) + segments_with_summaries = ( + session.query(DocumentSegment, DocumentSegmentSummary) + .join( + DocumentSegmentSummary, + DocumentSegment.id == DocumentSegmentSummary.chunk_id, + ) + .join( + DatasetDocument, + DocumentSegment.document_id == DatasetDocument.id, + ) + .where( + DocumentSegment.dataset_id == dataset_id, + DocumentSegment.status == "completed", # Segment must be completed + DocumentSegment.enabled == True, + DocumentSegmentSummary.dataset_id == dataset_id, + DocumentSegmentSummary.summary_content.isnot(None), # Must have summary content + # Include completed summaries or error summaries (with content) + or_( + DocumentSegmentSummary.status == "completed", + DocumentSegmentSummary.status == "error", + ), + DatasetDocument.enabled == True, # Document must be enabled + DatasetDocument.archived == False, # Document must not be archived + DatasetDocument.doc_form != "qa_model", # Skip qa_model documents + ) + .order_by(DocumentSegment.document_id.asc(), DocumentSegment.position.asc()) + .all() + ) + + if not segments_with_summaries: + logger.info( + click.style( + f"No segments with summaries found for re-vectorization in dataset {dataset_id}", + fg="cyan", + ) + ) + return + + logger.info( + "Found %s segments with summaries for re-vectorization in dataset %s", + len(segments_with_summaries), + dataset_id, + ) + + # Group by document for logging + segments_by_document = defaultdict(list) + for segment, summary_record in segments_with_summaries: + segments_by_document[segment.document_id].append((segment, summary_record)) + + logger.info( + "Segments grouped into %s documents for re-vectorization", + len(segments_by_document), + ) + + for document_id, segment_summary_pairs in segments_by_document.items(): + logger.info( + "Re-vectorizing summaries for %s segments in document %s", + len(segment_summary_pairs), + document_id, + ) + + for segment, summary_record in segment_summary_pairs: + try: + # Delete old vector + if summary_record.summary_index_node_id: + try: + from core.rag.datasource.vdb.vector_factory import Vector + + vector = Vector(dataset) + vector.delete_by_ids([summary_record.summary_index_node_id]) + except Exception as e: + logger.warning( + "Failed to delete old summary vector for segment %s: %s", + segment.id, + str(e), + ) + + # Re-vectorize with new embedding model + SummaryIndexService.vectorize_summary(summary_record, segment, dataset) + session.commit() + total_segments_processed += 1 + + except Exception as e: + logger.error( + "Failed to re-vectorize summary for segment %s: %s", + segment.id, + str(e), + exc_info=True, + ) + total_segments_failed += 1 + # Update summary record with error status + summary_record.status = "error" + summary_record.error = f"Re-vectorization failed: {str(e)}" + session.add(summary_record) + session.commit() + continue + + else: + # For summary_model change: require document indexing_status == "completed" + # Get all documents with completed indexing status + dataset_documents = session.scalars( + select(DatasetDocument).where( + DatasetDocument.dataset_id == dataset_id, + DatasetDocument.indexing_status == "completed", + DatasetDocument.enabled == True, + DatasetDocument.archived == False, + ) + ).all() + + if not dataset_documents: + logger.info( + click.style( + f"No documents found for summary regeneration in dataset {dataset_id}", + fg="cyan", + ) + ) + return + + logger.info( + "Found %s documents for summary regeneration in dataset %s", + len(dataset_documents), + dataset_id, + ) + + for dataset_document in dataset_documents: + # Skip qa_model documents + if dataset_document.doc_form == "qa_model": + continue + + try: + # Get all segments with existing summaries + segments = ( + session.query(DocumentSegment) + .join( + DocumentSegmentSummary, + DocumentSegment.id == DocumentSegmentSummary.chunk_id, + ) + .where( + DocumentSegment.document_id == dataset_document.id, + DocumentSegment.dataset_id == dataset_id, + DocumentSegment.status == "completed", + DocumentSegment.enabled == True, + DocumentSegmentSummary.dataset_id == dataset_id, + ) + .order_by(DocumentSegment.position.asc()) + .all() + ) + + if not segments: + continue + + logger.info( + "Regenerating summaries for %s segments in document %s", + len(segments), + dataset_document.id, + ) + + for segment in segments: + summary_record = None + try: + # Get existing summary record + summary_record = ( + session.query(DocumentSegmentSummary) + .filter_by( + chunk_id=segment.id, + dataset_id=dataset_id, + ) + .first() + ) + + if not summary_record: + logger.warning("Summary record not found for segment %s, skipping", segment.id) + continue + + # Regenerate both summary content and vectors (for summary_model change) + SummaryIndexService.generate_and_vectorize_summary( + segment, dataset, summary_index_setting + ) + session.commit() + total_segments_processed += 1 + + except Exception as e: + logger.error( + "Failed to regenerate summary for segment %s: %s", + segment.id, + str(e), + exc_info=True, + ) + total_segments_failed += 1 + # Update summary record with error status + if summary_record: + summary_record.status = "error" + summary_record.error = f"Regeneration failed: {str(e)}" + session.add(summary_record) + session.commit() + continue + + except Exception as e: + logger.error( + "Failed to process document %s for summary regeneration: %s", + dataset_document.id, + str(e), + exc_info=True, + ) + continue + + end_at = time.perf_counter() + if regenerate_vectors_only: + logger.info( + click.style( + f"Summary re-vectorization completed for dataset {dataset_id}: " + f"{total_segments_processed} segments processed successfully, " + f"{total_segments_failed} segments failed, " + f"latency: {end_at - start_at:.2f}s", + fg="green", + ) + ) + else: + logger.info( + click.style( + f"Summary index regeneration completed for dataset {dataset_id}: " + f"{total_segments_processed} segments processed successfully, " + f"{total_segments_failed} segments failed, " + f"latency: {end_at - start_at:.2f}s", + fg="green", + ) + ) + + except Exception: + logger.exception("Regenerate summary index failed for dataset %s", dataset_id) diff --git a/api/tasks/remove_document_from_index_task.py b/api/tasks/remove_document_from_index_task.py index c3c255fb17..55259ab527 100644 --- a/api/tasks/remove_document_from_index_task.py +++ b/api/tasks/remove_document_from_index_task.py @@ -46,6 +46,21 @@ def remove_document_from_index_task(document_id: str): index_processor = IndexProcessorFactory(document.doc_form).init_index_processor() segments = session.scalars(select(DocumentSegment).where(DocumentSegment.document_id == document.id)).all() + + # Disable summary indexes for all segments in this document + from services.summary_index_service import SummaryIndexService + + segment_ids_list = [segment.id for segment in segments] + if segment_ids_list: + try: + SummaryIndexService.disable_summaries_for_segments( + dataset=dataset, + segment_ids=segment_ids_list, + disabled_by=document.disabled_by, + ) + except Exception as e: + logger.warning("Failed to disable summaries for document %s: %s", document.id, str(e)) + index_node_ids = [segment.index_node_id for segment in segments] if index_node_ids: try: diff --git a/api/tests/unit_tests/conftest.py b/api/tests/unit_tests/conftest.py index c5e1576186..e3c1a617f7 100644 --- a/api/tests/unit_tests/conftest.py +++ b/api/tests/unit_tests/conftest.py @@ -3,6 +3,7 @@ from unittest.mock import MagicMock, patch import pytest from flask import Flask +from sqlalchemy import create_engine # Getting the absolute path of the current file's directory ABS_PATH = os.path.dirname(os.path.abspath(__file__)) @@ -36,6 +37,7 @@ import sys sys.path.insert(0, PROJECT_DIR) +from core.db.session_factory import configure_session_factory, session_factory from extensions import ext_redis @@ -102,3 +104,18 @@ def reset_secret_key(): yield finally: dify_config.SECRET_KEY = original + + +@pytest.fixture(scope="session") +def _unit_test_engine(): + engine = create_engine("sqlite:///:memory:") + yield engine + engine.dispose() + + +@pytest.fixture(autouse=True) +def _configure_session_factory(_unit_test_engine): + try: + session_factory.get_session_maker() + except RuntimeError: + configure_session_factory(_unit_test_engine, expire_on_commit=False) diff --git a/api/tests/unit_tests/controllers/console/app/test_app_response_models.py b/api/tests/unit_tests/controllers/console/app/test_app_response_models.py index 40eb59a8f4..c557605916 100644 --- a/api/tests/unit_tests/controllers/console/app/test_app_response_models.py +++ b/api/tests/unit_tests/controllers/console/app/test_app_response_models.py @@ -31,6 +31,13 @@ def _load_app_module(): def schema_model(self, name, schema): self.models[name] = schema + return schema + + def model(self, name, model_dict=None, **kwargs): + """Register a model with the namespace (flask-restx compatibility).""" + if model_dict is not None: + self.models[name] = model_dict + return model_dict def _decorator(self, obj): return obj diff --git a/api/tests/unit_tests/core/app_assets/test_storage.py b/api/tests/unit_tests/core/app_assets/test_storage.py index 05a26c7415..3181e6c1be 100644 --- a/api/tests/unit_tests/core/app_assets/test_storage.py +++ b/api/tests/unit_tests/core/app_assets/test_storage.py @@ -1,12 +1,12 @@ -import time +from unittest.mock import MagicMock, patch from uuid import uuid4 import pytest from configs import dify_config -from core.app_assets.storage import AppAssetSigner, AppAssetStorage, AssetPath +from core.app_assets.storage import AppAssetStorage, AssetPath from extensions.storage.base_storage import BaseStorage -from libs import rsa +from services.storage_ticket_service import StorageTicket, StorageTicketService class DummyStorage(BaseStorage): @@ -70,83 +70,133 @@ def test_asset_path_validation(): AssetPath.draft(tenant_id=tenant_id, app_id=app_id, node_id="not-a-uuid") -def test_storage_key_mapping(): - tenant_id = str(uuid4()) - app_id = str(uuid4()) - node_id = str(uuid4()) - - storage = AppAssetStorage(DummyStorage(), redis_client=DummyRedis()) - ref = AssetPath.draft(tenant_id, app_id, node_id) - assert storage.get_storage_key(ref) == ref.get_storage_key() - - -def test_signature_verification(monkeypatch: pytest.MonkeyPatch): - tenant_id = str(uuid4()) - app_id = str(uuid4()) - resource_id = str(uuid4()) - asset_path = AssetPath.draft(tenant_id, app_id, resource_id) - - class _FakeKey: - def export_key(self) -> bytes: - return b"tenant-private-key" - - def _fake_get_decrypt_decoding(_tenant_id: str) -> tuple[_FakeKey, None]: - return _FakeKey(), None - - monkeypatch.setattr(dify_config, "FILES_ACCESS_TIMEOUT", 300, raising=False) - monkeypatch.setattr(rsa, "get_decrypt_decoding", _fake_get_decrypt_decoding) - - expires_at = int(time.time()) + 120 - nonce = "nonce" - sign = AppAssetSigner.create_download_signature(asset_path=asset_path, expires_at=expires_at, nonce=nonce) - - assert AppAssetSigner.verify_download_signature( - asset_path=asset_path, - expires_at=expires_at, - nonce=nonce, - sign=sign, - ) - - expired_at = int(time.time()) - 1 - expired_sign = AppAssetSigner.create_download_signature(asset_path=asset_path, expires_at=expired_at, nonce=nonce) - assert not AppAssetSigner.verify_download_signature( - asset_path=asset_path, - expires_at=expired_at, - nonce=nonce, - sign=expired_sign, - ) - - too_far = int(time.time()) + 3600 - far_sign = AppAssetSigner.create_download_signature(asset_path=asset_path, expires_at=too_far, nonce=nonce) - assert not AppAssetSigner.verify_download_signature( - asset_path=asset_path, - expires_at=too_far, - nonce=nonce, - sign=far_sign, - ) - - -def test_signed_proxy_url_generation(monkeypatch: pytest.MonkeyPatch): - tenant_id = str(uuid4()) - app_id = str(uuid4()) - resource_id = str(uuid4()) - asset_path = AssetPath.draft(tenant_id, app_id, resource_id) - - class _FakeKey: - def export_key(self) -> bytes: - return b"tenant-private-key" - - def _fake_get_decrypt_decoding(_tenant_id: str) -> tuple[_FakeKey, None]: - return _FakeKey(), None - - monkeypatch.setattr(dify_config, "FILES_ACCESS_TIMEOUT", 300, raising=False) - monkeypatch.setattr(rsa, "get_decrypt_decoding", _fake_get_decrypt_decoding) +def test_storage_ticket_service(monkeypatch: pytest.MonkeyPatch): + """Test StorageTicketService creates and retrieves tickets.""" monkeypatch.setattr(dify_config, "FILES_URL", "http://files.local", raising=False) - storage = AppAssetStorage(DummyStorage(), redis_client=DummyRedis()) - url = storage.get_download_url(asset_path, expires_in=120) + mock_redis = MagicMock() + stored_data = {} - assert url.startswith(f"http://files.local/files/app-assets/draft/{tenant_id}/{app_id}/{resource_id}/download?") - assert "expires_at=" in url - assert "nonce=" in url - assert "sign=" in url + def mock_setex(key, ttl, value): + stored_data[key] = value + + def mock_get(key): + return stored_data.get(key) + + mock_redis.setex = mock_setex + mock_redis.get = mock_get + + with patch("services.storage_ticket_service.redis_client", mock_redis): + # Test download URL creation + url = StorageTicketService.create_download_url("test/path/file.txt", expires_in=300, filename="file.txt") + + assert url.startswith("http://files.local/files/storage-files/") + token = url.split("/")[-1] + + # Verify ticket was stored + ticket = StorageTicketService.get_ticket(token) + assert ticket is not None + assert ticket.op == "download" + assert ticket.storage_key == "test/path/file.txt" + assert ticket.filename == "file.txt" + + # Test upload URL creation + upload_url = StorageTicketService.create_upload_url("test/upload.txt", expires_in=300, max_bytes=1024) + + upload_token = upload_url.split("/")[-1] + upload_ticket = StorageTicketService.get_ticket(upload_token) + assert upload_ticket is not None + assert upload_ticket.op == "upload" + assert upload_ticket.storage_key == "test/upload.txt" + assert upload_ticket.max_bytes == 1024 + + +def test_storage_ticket_not_found(monkeypatch: pytest.MonkeyPatch): + """Test StorageTicketService returns None for invalid token.""" + mock_redis = MagicMock() + mock_redis.get.return_value = None + + with patch("services.storage_ticket_service.redis_client", mock_redis): + ticket = StorageTicketService.get_ticket("invalid-token") + assert ticket is None + + +def test_ticket_url_generation(monkeypatch: pytest.MonkeyPatch): + """Test that AppAssetStorage generates correct ticket URLs when presign is not supported.""" + tenant_id = str(uuid4()) + app_id = str(uuid4()) + resource_id = str(uuid4()) + asset_path = AssetPath.draft(tenant_id, app_id, resource_id) + + monkeypatch.setattr(dify_config, "FILES_URL", "http://files.local", raising=False) + + mock_redis = MagicMock() + mock_redis.setex = MagicMock() + + with patch("services.storage_ticket_service.redis_client", mock_redis): + storage = AppAssetStorage(DummyStorage(), redis_client=DummyRedis()) + url = storage.get_download_url(asset_path, expires_in=120) + + # URL should be a ticket URL since DummyStorage doesn't support presign + assert url.startswith("http://files.local/files/storage-files/") + # Token should be a UUID + token = url.split("/")[-1] + assert len(token) == 36 # UUID format + + +def test_upload_ticket_url_generation(monkeypatch: pytest.MonkeyPatch): + """Test that AppAssetStorage generates correct upload ticket URLs.""" + tenant_id = str(uuid4()) + app_id = str(uuid4()) + resource_id = str(uuid4()) + asset_path = AssetPath.draft(tenant_id, app_id, resource_id) + + monkeypatch.setattr(dify_config, "FILES_URL", "http://files.local", raising=False) + + mock_redis = MagicMock() + mock_redis.setex = MagicMock() + + with patch("services.storage_ticket_service.redis_client", mock_redis): + storage = AppAssetStorage(DummyStorage(), redis_client=DummyRedis()) + url = storage.get_upload_url(asset_path, expires_in=120) + + # URL should be a ticket URL since DummyStorage doesn't support presign + assert url.startswith("http://files.local/files/storage-files/") + # Token should be a UUID + token = url.split("/")[-1] + assert len(token) == 36 # UUID format + + +def test_storage_ticket_dataclass(): + """Test StorageTicket serialization and deserialization.""" + ticket = StorageTicket( + op="download", + storage_key="path/to/file.txt", + filename="file.txt", + ) + + data = ticket.to_dict() + assert data == { + "op": "download", + "storage_key": "path/to/file.txt", + "filename": "file.txt", + } + + restored = StorageTicket.from_dict(data) + assert restored.op == ticket.op + assert restored.storage_key == ticket.storage_key + assert restored.filename == ticket.filename + assert restored.max_bytes is None + + # Test upload ticket with max_bytes + upload_ticket = StorageTicket( + op="upload", + storage_key="path/to/upload.txt", + max_bytes=1024, + ) + + upload_data = upload_ticket.to_dict() + assert upload_data["max_bytes"] == 1024 + + restored_upload = StorageTicket.from_dict(upload_data) + assert restored_upload.max_bytes == 1024 diff --git a/api/tests/unit_tests/core/rag/extractor/test_word_extractor.py b/api/tests/unit_tests/core/rag/extractor/test_word_extractor.py index f9e59a5f05..0792ada194 100644 --- a/api/tests/unit_tests/core/rag/extractor/test_word_extractor.py +++ b/api/tests/unit_tests/core/rag/extractor/test_word_extractor.py @@ -1,7 +1,9 @@ """Primarily used for testing merged cell scenarios""" +import io import os import tempfile +from pathlib import Path from types import SimpleNamespace from docx import Document @@ -56,6 +58,42 @@ def test_parse_row(): assert extractor._parse_row(row, {}, 3) == gt[idx] +def test_init_downloads_via_ssrf_proxy(monkeypatch): + doc = Document() + doc.add_paragraph("hello") + buf = io.BytesIO() + doc.save(buf) + docx_bytes = buf.getvalue() + + calls: list[tuple[str, object]] = [] + + class FakeResponse: + status_code = 200 + content = docx_bytes + + def close(self) -> None: + calls.append(("close", None)) + + def fake_get(url: str, **kwargs): + calls.append(("get", (url, kwargs))) + return FakeResponse() + + monkeypatch.setattr(we, "ssrf_proxy", SimpleNamespace(get=fake_get)) + + extractor = WordExtractor("https://example.com/test.docx", "tenant_id", "user_id") + try: + assert calls + assert calls[0][0] == "get" + url, kwargs = calls[0][1] + assert url == "https://example.com/test.docx" + assert kwargs.get("timeout") is None + assert extractor.web_path == "https://example.com/test.docx" + assert extractor.file_path != extractor.web_path + assert Path(extractor.file_path).read_bytes() == docx_bytes + finally: + extractor.temp_file.close() + + def test_extract_images_from_docx(monkeypatch): external_bytes = b"ext-bytes" internal_bytes = b"int-bytes" diff --git a/api/tests/unit_tests/services/test_dataset_service_update_dataset.py b/api/tests/unit_tests/services/test_dataset_service_update_dataset.py index 0aabe2fc30..08818945e3 100644 --- a/api/tests/unit_tests/services/test_dataset_service_update_dataset.py +++ b/api/tests/unit_tests/services/test_dataset_service_update_dataset.py @@ -138,6 +138,7 @@ class TestDatasetServiceUpdateDataset: "services.dataset_service.DatasetCollectionBindingService.get_dataset_collection_binding" ) as mock_get_binding, patch("services.dataset_service.deal_dataset_vector_index_task") as mock_task, + patch("services.dataset_service.regenerate_summary_index_task") as mock_regenerate_task, patch( "services.dataset_service.current_user", create_autospec(Account, instance=True) ) as mock_current_user, @@ -147,6 +148,7 @@ class TestDatasetServiceUpdateDataset: "model_manager": mock_model_manager, "get_binding": mock_get_binding, "task": mock_task, + "regenerate_task": mock_regenerate_task, "current_user": mock_current_user, } @@ -549,6 +551,13 @@ class TestDatasetServiceUpdateDataset: # Verify vector index task was triggered mock_internal_provider_dependencies["task"].delay.assert_called_once_with("dataset-123", "update") + # Verify regenerate summary index task was triggered (when embedding_model changes) + mock_internal_provider_dependencies["regenerate_task"].delay.assert_called_once_with( + "dataset-123", + regenerate_reason="embedding_model_changed", + regenerate_vectors_only=True, + ) + # Verify return value assert result == dataset diff --git a/api/tests/unit_tests/utils/structured_output_parser/test_structured_output_parser.py b/api/tests/unit_tests/utils/structured_output_parser/test_structured_output_parser.py index 52af2e8d7a..df73c29004 100644 --- a/api/tests/unit_tests/utils/structured_output_parser/test_structured_output_parser.py +++ b/api/tests/unit_tests/utils/structured_output_parser/test_structured_output_parser.py @@ -312,7 +312,6 @@ def test_structured_output_parser(): model_instance=model_instance, prompt_messages=prompt_messages, json_schema=case["json_schema"], - stream=case["stream"], ) # Consume the generator to trigger the error list(result_generator) @@ -323,7 +322,6 @@ def test_structured_output_parser(): model_instance=model_instance, prompt_messages=prompt_messages, json_schema=case["json_schema"], - stream=case["stream"], ) else: # Test successful cases @@ -338,7 +336,6 @@ def test_structured_output_parser(): model_instance=model_instance, prompt_messages=prompt_messages, json_schema=case["json_schema"], - stream=case["stream"], model_parameters={"temperature": 0.7, "max_tokens": 100}, user="test_user", ) @@ -418,7 +415,6 @@ def test_parse_structured_output_edge_cases(): model_instance=model_instance, prompt_messages=prompt_messages, json_schema=testcase_list_with_dict["json_schema"], - stream=testcase_list_with_dict["stream"], ) assert isinstance(result, LLMResultWithStructuredOutput) @@ -456,7 +452,6 @@ def test_model_specific_schema_preparation(): model_instance=model_instance, prompt_messages=prompt_messages, json_schema=gemini_case["json_schema"], - stream=gemini_case["stream"], ) assert isinstance(result, LLMResultWithStructuredOutput) @@ -492,7 +487,6 @@ def test_structured_output_with_pydantic_model_non_streaming(): model_instance=model_instance, prompt_messages=prompt_messages, output_model=ExampleOutput, - stream=False, ) assert isinstance(result, ExampleOutput) @@ -532,8 +526,7 @@ def test_structured_output_with_pydantic_model_streaming(): model_schema=model_schema, model_instance=model_instance, prompt_messages=[UserPromptMessage(content="Return a JSON object with name.")], - output_model=ExampleOutput, - stream=True, + output_model=ExampleOutput ) assert isinstance(result, ExampleOutput) @@ -555,8 +548,7 @@ def test_structured_output_with_pydantic_model_validation_error(): model_schema=model_schema, model_instance=model_instance, prompt_messages=[UserPromptMessage(content="test")], - output_model=ExampleOutput, - stream=False, + output_model=ExampleOutput ) diff --git a/api/ty.toml b/api/ty.toml index bb4ff5bbcf..640ed6cdee 100644 --- a/api/ty.toml +++ b/api/ty.toml @@ -1,11 +1,33 @@ [src] exclude = [ - # TODO: enable when violations fixed + # deps groups (A1/A2/B/C/D/E) + # A1: foundational runtime typing / provider plumbing + "core/mcp/session", + "core/model_runtime/model_providers", + "core/workflow/nodes/protocols.py", + "libs/gmpy2_pkcs10aep_cipher.py", + # A2: workflow engine/nodes + "core/workflow", + "core/app/workflow", + "core/helper/code_executor", + # B: app runner + prompt + "core/prompt", + "core/app/apps/base_app_runner.py", "core/app/apps/workflow_app_runner.py", + # C: services/controllers/fields/libs + "services", "controllers/console/app", "controllers/console/explore", "controllers/console/datasets", "controllers/console/workspace", + "controllers/service_api/wraps.py", + "fields/conversation_fields.py", + "libs/external_api.py", + # D: observability + integrations + "core/ops", + "extensions", + # E: vector DB integrations + "core/rag/datasource/vdb", # non-producition or generated code "migrations", "tests", diff --git a/api/uv.lock b/api/uv.lock index 3d18286fb7..1d959782da 100644 --- a/api/uv.lock +++ b/api/uv.lock @@ -136,21 +136,21 @@ wheels = [ [[package]] name = "alembic" -version = "1.18.1" +version = "1.18.2" source = { registry = "https://pypi.org/simple" } dependencies = [ { name = "mako" }, { name = "sqlalchemy" }, { name = "typing-extensions" }, ] -sdist = { url = "https://files.pythonhosted.org/packages/49/cc/aca263693b2ece99fa99a09b6d092acb89973eb2bb575faef1777e04f8b4/alembic-1.18.1.tar.gz", hash = "sha256:83ac6b81359596816fb3b893099841a0862f2117b2963258e965d70dc62fb866", size = 2044319, upload-time = "2026-01-14T18:53:14.907Z" } +sdist = { url = "https://files.pythonhosted.org/packages/a7/93/07f5ba5d8e4f4049e864faa9d822bbbbfb6f3223a4ffb1376768ab9ee4b8/alembic-1.18.2.tar.gz", hash = "sha256:1c3ddb635f26efbc80b1b90c5652548202022d4e760f6a78d6d85959280e3684", size = 2048272, upload-time = "2026-01-28T21:23:30.914Z" } wheels = [ - { url = "https://files.pythonhosted.org/packages/83/36/cd9cb6101e81e39076b2fbe303bfa3c85ca34e55142b0324fcbf22c5c6e2/alembic-1.18.1-py3-none-any.whl", hash = "sha256:f1c3b0920b87134e851c25f1f7f236d8a332c34b75416802d06971df5d1b7810", size = 260973, upload-time = "2026-01-14T18:53:17.533Z" }, + { url = "https://files.pythonhosted.org/packages/1a/60/ced4277ccf61f91eb03c4ac9f63b9567eb814f9ab1cd7835f00fbd5d0c14/alembic-1.18.2-py3-none-any.whl", hash = "sha256:18a5f6448af4864cc308aadf33eb37c0116da9a60fd9bb3f31ccb1b522b4a9b9", size = 261953, upload-time = "2026-01-28T21:23:32.508Z" }, ] [[package]] name = "alibabacloud-credentials" -version = "1.0.5" +version = "1.0.7" source = { registry = "https://pypi.org/simple" } dependencies = [ { name = "aiofiles" }, @@ -158,9 +158,9 @@ dependencies = [ { name = "alibabacloud-tea" }, { name = "apscheduler" }, ] -sdist = { url = "https://files.pythonhosted.org/packages/57/0e/5633e96dc9f42eac7e387b617a032c9136e129b8b5f75a9685a36bf17fcb/alibabacloud_credentials-1.0.5.tar.gz", hash = "sha256:2b79a674e51609826fc5c78595782c7997d0887fa29df840895b926df1e98624", size = 40461, upload-time = "2026-01-23T10:29:10.804Z" } +sdist = { url = "https://files.pythonhosted.org/packages/3c/2b/596a8b2cb6d08a75a6c85a98996d2a6f3a43a40aea5f892728bfce025b54/alibabacloud_credentials-1.0.7.tar.gz", hash = "sha256:80428280b4bcf95461d41d1490a22360b8b67d1829bf1eb38f74fabcc693f1b3", size = 40606, upload-time = "2026-01-27T05:56:44.444Z" } wheels = [ - { url = "https://files.pythonhosted.org/packages/da/3d/cafabe877fe48243f037d01d76427d64cf66216101866831ecc7888294f2/alibabacloud_credentials-1.0.5-py3-none-any.whl", hash = "sha256:d13dd3a6088e45f5f43e911b4277821ceb0218ecede285ba58834016393036b7", size = 48826, upload-time = "2026-01-23T10:29:09.491Z" }, + { url = "https://files.pythonhosted.org/packages/25/86/f8dbcc689d6f4ba0e1e709a9b401b633052138daf20f7ce661c073a45823/alibabacloud_credentials-1.0.7-py3-none-any.whl", hash = "sha256:465c779cfa284e8900c08880d764197289b1edd4c72c0087c3effe6bb2b4dea3", size = 48963, upload-time = "2026-01-27T05:56:43.466Z" }, ] [[package]] @@ -531,16 +531,16 @@ wheels = [ [[package]] name = "bce-python-sdk" -version = "0.9.59" +version = "0.9.60" source = { registry = "https://pypi.org/simple" } dependencies = [ { name = "future" }, { name = "pycryptodome" }, { name = "six" }, ] -sdist = { url = "https://files.pythonhosted.org/packages/c1/8e/ddfacf065fd0a514bda38b489988ea21636ac3be09c79239f24cdc36d71b/bce_python_sdk-0.9.59.tar.gz", hash = "sha256:54ad09394b0a5baf8c8ef87ac919f9d111c1b0536086286b80ada71651d8e4c8", size = 278672, upload-time = "2026-01-05T11:46:14.19Z" } +sdist = { url = "https://files.pythonhosted.org/packages/dc/00/7b84673285ede23fd3ca8d33a90a6963cd7f16755f4e8228025710acb078/bce_python_sdk-0.9.60.tar.gz", hash = "sha256:e0d04b8377cdfa264b1c217db3208dcb8ba58d02c9bad052dc3cbecf61c9eb0d", size = 279370, upload-time = "2026-01-27T03:05:29.502Z" } wheels = [ - { url = "https://files.pythonhosted.org/packages/2f/b0/38ea413e3a4aa44c199ff74001b3b2510b6b0f237c7840237976094ab574/bce_python_sdk-0.9.59-py3-none-any.whl", hash = "sha256:9a63ffc36ac5cb984b79ce6909288f00862010eda576f7575c7f0fb7cdef419c", size = 394807, upload-time = "2026-01-05T11:45:59.752Z" }, + { url = "https://files.pythonhosted.org/packages/39/45/1ef7b8db8716bf072e13e3857c2aa5f62e36b904cf88ceb796adbe7957e7/bce_python_sdk-0.9.60-py3-none-any.whl", hash = "sha256:50f13df97e79ff8e8b5ab22fbf38a78ff711e878b5976b8950e1b318d3d6df61", size = 395377, upload-time = "2026-01-27T03:05:26.404Z" }, ] [[package]] @@ -640,16 +640,16 @@ wheels = [ [[package]] name = "boto3-stubs" -version = "1.42.34" +version = "1.42.37" source = { registry = "https://pypi.org/simple" } dependencies = [ { name = "botocore-stubs" }, { name = "types-s3transfer" }, { name = "typing-extensions", marker = "python_full_version < '3.12'" }, ] -sdist = { url = "https://files.pythonhosted.org/packages/4d/e4/959e63b009194cae2fad6ddff8ef1c0e7e2f9113bca4c7ec20fa579e4d7a/boto3_stubs-1.42.34.tar.gz", hash = "sha256:fafcc3713c331bac11bf55fe913e5a3a01820f0cde640cfc4694df5a94aa9557", size = 100898, upload-time = "2026-01-23T20:42:10.353Z" } +sdist = { url = "https://files.pythonhosted.org/packages/7f/b7/15995a1261cb3dccaee7e53e1e27d14fb542c56b95883598b53190e7d979/boto3_stubs-1.42.37.tar.gz", hash = "sha256:1620519a55bbb26cebed95b6d8f26ba96b8ea91dadd05eafc3b8f17a587e2108", size = 100870, upload-time = "2026-01-28T20:56:37.32Z" } wheels = [ - { url = "https://files.pythonhosted.org/packages/a3/c4/1aba1653afc3cf5ef985235cea05d3e9e6736033f10ebbf102a23fc0152d/boto3_stubs-1.42.34-py3-none-any.whl", hash = "sha256:eb98cf3cc0a74ed75ea4945152cf10da57c8c9628104a13db16cde10176219ab", size = 69782, upload-time = "2026-01-23T20:42:05.699Z" }, + { url = "https://files.pythonhosted.org/packages/12/10/ad4f3ffcdc46df83df4ba06d7692ea0869700537163cd867dd66f966835b/boto3_stubs-1.42.37-py3-none-any.whl", hash = "sha256:07b9ac27196b233b802f8fadff2fa9c01d656927943c618dc862ff00fd592b24", size = 69785, upload-time = "2026-01-28T20:56:29.211Z" }, ] [package.optional-dependencies] @@ -673,14 +673,14 @@ wheels = [ [[package]] name = "botocore-stubs" -version = "1.42.34" +version = "1.42.37" source = { registry = "https://pypi.org/simple" } dependencies = [ { name = "types-awscrt" }, ] -sdist = { url = "https://files.pythonhosted.org/packages/8f/1e/024e45fb46a21d085b541ce0ad8f1bef97ce17c5e72d1dc0e4d09d29e399/botocore_stubs-1.42.34.tar.gz", hash = "sha256:f3d1c5b45c2cbe16f63719abe639b23a1eeb3fec9c3ea0a72688585b462e8ce3", size = 42408, upload-time = "2026-01-23T20:33:38.691Z" } +sdist = { url = "https://files.pythonhosted.org/packages/e2/d4/0348543289c57b766622958ebf0b9cc2d9ebd36e803f25e0e55455bbb165/botocore_stubs-1.42.37.tar.gz", hash = "sha256:7357d1876ae198757dbe0a73f887449ffdda18eb075d7d3cc2e22d3580dcb17c", size = 42399, upload-time = "2026-01-28T21:35:52.863Z" } wheels = [ - { url = "https://files.pythonhosted.org/packages/4c/c8/3845c17b89ff19e2c2474801a6737d1766ee8e80cf38d7d97e1fedc28537/botocore_stubs-1.42.34-py3-none-any.whl", hash = "sha256:afc08661122eff6939d88cd250084ac148e392f8a1a389d51a31a4b9dab59358", size = 66760, upload-time = "2026-01-23T20:33:37.146Z" }, + { url = "https://files.pythonhosted.org/packages/1a/c8/41d4e54f92865aac2afcae22c6e892659f5172b264d7dec28cf1bb36de7a/botocore_stubs-1.42.37-py3-none-any.whl", hash = "sha256:5a9b2a4062f7cc19e0648508f67d3f1a1fd8d3e0d6f5a0d3244cc9656e54cc67", size = 66761, upload-time = "2026-01-28T21:35:51.749Z" }, ] [[package]] @@ -1269,63 +1269,61 @@ wheels = [ [[package]] name = "cryptography" -version = "46.0.3" +version = "46.0.4" source = { registry = "https://pypi.org/simple" } dependencies = [ { name = "cffi", marker = "platform_python_implementation != 'PyPy'" }, ] -sdist = { url = "https://files.pythonhosted.org/packages/9f/33/c00162f49c0e2fe8064a62cb92b93e50c74a72bc370ab92f86112b33ff62/cryptography-46.0.3.tar.gz", hash = "sha256:a8b17438104fed022ce745b362294d9ce35b4c2e45c1d958ad4a4b019285f4a1", size = 749258, upload-time = "2025-10-15T23:18:31.74Z" } +sdist = { url = "https://files.pythonhosted.org/packages/78/19/f748958276519adf6a0c1e79e7b8860b4830dda55ccdf29f2719b5fc499c/cryptography-46.0.4.tar.gz", hash = "sha256:bfd019f60f8abc2ed1b9be4ddc21cfef059c841d86d710bb69909a688cbb8f59", size = 749301, upload-time = "2026-01-28T00:24:37.379Z" } wheels = [ - { url = "https://files.pythonhosted.org/packages/1d/42/9c391dd801d6cf0d561b5890549d4b27bafcc53b39c31a817e69d87c625b/cryptography-46.0.3-cp311-abi3-macosx_10_9_universal2.whl", hash = "sha256:109d4ddfadf17e8e7779c39f9b18111a09efb969a301a31e987416a0191ed93a", size = 7225004, upload-time = "2025-10-15T23:16:52.239Z" }, - { url = "https://files.pythonhosted.org/packages/1c/67/38769ca6b65f07461eb200e85fc1639b438bdc667be02cf7f2cd6a64601c/cryptography-46.0.3-cp311-abi3-manylinux2014_aarch64.manylinux_2_17_aarch64.whl", hash = "sha256:09859af8466b69bc3c27bdf4f5d84a665e0f7ab5088412e9e2ec49758eca5cbc", size = 4296667, upload-time = "2025-10-15T23:16:54.369Z" }, - { url = "https://files.pythonhosted.org/packages/5c/49/498c86566a1d80e978b42f0d702795f69887005548c041636df6ae1ca64c/cryptography-46.0.3-cp311-abi3-manylinux2014_x86_64.manylinux_2_17_x86_64.whl", hash = "sha256:01ca9ff2885f3acc98c29f1860552e37f6d7c7d013d7334ff2a9de43a449315d", size = 4450807, upload-time = "2025-10-15T23:16:56.414Z" }, - { url = "https://files.pythonhosted.org/packages/4b/0a/863a3604112174c8624a2ac3c038662d9e59970c7f926acdcfaed8d61142/cryptography-46.0.3-cp311-abi3-manylinux_2_28_aarch64.whl", hash = "sha256:6eae65d4c3d33da080cff9c4ab1f711b15c1d9760809dad6ea763f3812d254cb", size = 4299615, upload-time = "2025-10-15T23:16:58.442Z" }, - { url = "https://files.pythonhosted.org/packages/64/02/b73a533f6b64a69f3cd3872acb6ebc12aef924d8d103133bb3ea750dc703/cryptography-46.0.3-cp311-abi3-manylinux_2_28_armv7l.manylinux_2_31_armv7l.whl", hash = "sha256:e5bf0ed4490068a2e72ac03d786693adeb909981cc596425d09032d372bcc849", size = 4016800, upload-time = "2025-10-15T23:17:00.378Z" }, - { url = "https://files.pythonhosted.org/packages/25/d5/16e41afbfa450cde85a3b7ec599bebefaef16b5c6ba4ec49a3532336ed72/cryptography-46.0.3-cp311-abi3-manylinux_2_28_ppc64le.whl", hash = "sha256:5ecfccd2329e37e9b7112a888e76d9feca2347f12f37918facbb893d7bb88ee8", size = 4984707, upload-time = "2025-10-15T23:17:01.98Z" }, - { url = "https://files.pythonhosted.org/packages/c9/56/e7e69b427c3878352c2fb9b450bd0e19ed552753491d39d7d0a2f5226d41/cryptography-46.0.3-cp311-abi3-manylinux_2_28_x86_64.whl", hash = "sha256:a2c0cd47381a3229c403062f764160d57d4d175e022c1df84e168c6251a22eec", size = 4482541, upload-time = "2025-10-15T23:17:04.078Z" }, - { url = "https://files.pythonhosted.org/packages/78/f6/50736d40d97e8483172f1bb6e698895b92a223dba513b0ca6f06b2365339/cryptography-46.0.3-cp311-abi3-manylinux_2_34_aarch64.whl", hash = "sha256:549e234ff32571b1f4076ac269fcce7a808d3bf98b76c8dd560e42dbc66d7d91", size = 4299464, upload-time = "2025-10-15T23:17:05.483Z" }, - { url = "https://files.pythonhosted.org/packages/00/de/d8e26b1a855f19d9994a19c702fa2e93b0456beccbcfe437eda00e0701f2/cryptography-46.0.3-cp311-abi3-manylinux_2_34_ppc64le.whl", hash = "sha256:c0a7bb1a68a5d3471880e264621346c48665b3bf1c3759d682fc0864c540bd9e", size = 4950838, upload-time = "2025-10-15T23:17:07.425Z" }, - { url = "https://files.pythonhosted.org/packages/8f/29/798fc4ec461a1c9e9f735f2fc58741b0daae30688f41b2497dcbc9ed1355/cryptography-46.0.3-cp311-abi3-manylinux_2_34_x86_64.whl", hash = "sha256:10b01676fc208c3e6feeb25a8b83d81767e8059e1fe86e1dc62d10a3018fa926", size = 4481596, upload-time = "2025-10-15T23:17:09.343Z" }, - { url = "https://files.pythonhosted.org/packages/15/8d/03cd48b20a573adfff7652b76271078e3045b9f49387920e7f1f631d125e/cryptography-46.0.3-cp311-abi3-musllinux_1_2_aarch64.whl", hash = "sha256:0abf1ffd6e57c67e92af68330d05760b7b7efb243aab8377e583284dbab72c71", size = 4426782, upload-time = "2025-10-15T23:17:11.22Z" }, - { url = "https://files.pythonhosted.org/packages/fa/b1/ebacbfe53317d55cf33165bda24c86523497a6881f339f9aae5c2e13e57b/cryptography-46.0.3-cp311-abi3-musllinux_1_2_x86_64.whl", hash = "sha256:a04bee9ab6a4da801eb9b51f1b708a1b5b5c9eb48c03f74198464c66f0d344ac", size = 4698381, upload-time = "2025-10-15T23:17:12.829Z" }, - { url = "https://files.pythonhosted.org/packages/96/92/8a6a9525893325fc057a01f654d7efc2c64b9de90413adcf605a85744ff4/cryptography-46.0.3-cp311-abi3-win32.whl", hash = "sha256:f260d0d41e9b4da1ed1e0f1ce571f97fe370b152ab18778e9e8f67d6af432018", size = 3055988, upload-time = "2025-10-15T23:17:14.65Z" }, - { url = "https://files.pythonhosted.org/packages/7e/bf/80fbf45253ea585a1e492a6a17efcb93467701fa79e71550a430c5e60df0/cryptography-46.0.3-cp311-abi3-win_amd64.whl", hash = "sha256:a9a3008438615669153eb86b26b61e09993921ebdd75385ddd748702c5adfddb", size = 3514451, upload-time = "2025-10-15T23:17:16.142Z" }, - { url = "https://files.pythonhosted.org/packages/2e/af/9b302da4c87b0beb9db4e756386a7c6c5b8003cd0e742277888d352ae91d/cryptography-46.0.3-cp311-abi3-win_arm64.whl", hash = "sha256:5d7f93296ee28f68447397bf5198428c9aeeab45705a55d53a6343455dcb2c3c", size = 2928007, upload-time = "2025-10-15T23:17:18.04Z" }, - { url = "https://files.pythonhosted.org/packages/fd/23/45fe7f376a7df8daf6da3556603b36f53475a99ce4faacb6ba2cf3d82021/cryptography-46.0.3-cp38-abi3-macosx_10_9_universal2.whl", hash = "sha256:cb3d760a6117f621261d662bccc8ef5bc32ca673e037c83fbe565324f5c46936", size = 7218248, upload-time = "2025-10-15T23:17:46.294Z" }, - { url = "https://files.pythonhosted.org/packages/27/32/b68d27471372737054cbd34c84981f9edbc24fe67ca225d389799614e27f/cryptography-46.0.3-cp38-abi3-manylinux2014_aarch64.manylinux_2_17_aarch64.whl", hash = "sha256:4b7387121ac7d15e550f5cb4a43aef2559ed759c35df7336c402bb8275ac9683", size = 4294089, upload-time = "2025-10-15T23:17:48.269Z" }, - { url = "https://files.pythonhosted.org/packages/26/42/fa8389d4478368743e24e61eea78846a0006caffaf72ea24a15159215a14/cryptography-46.0.3-cp38-abi3-manylinux2014_x86_64.manylinux_2_17_x86_64.whl", hash = "sha256:15ab9b093e8f09daab0f2159bb7e47532596075139dd74365da52ecc9cb46c5d", size = 4440029, upload-time = "2025-10-15T23:17:49.837Z" }, - { url = "https://files.pythonhosted.org/packages/5f/eb/f483db0ec5ac040824f269e93dd2bd8a21ecd1027e77ad7bdf6914f2fd80/cryptography-46.0.3-cp38-abi3-manylinux_2_28_aarch64.whl", hash = "sha256:46acf53b40ea38f9c6c229599a4a13f0d46a6c3fa9ef19fc1a124d62e338dfa0", size = 4297222, upload-time = "2025-10-15T23:17:51.357Z" }, - { url = "https://files.pythonhosted.org/packages/fd/cf/da9502c4e1912cb1da3807ea3618a6829bee8207456fbbeebc361ec38ba3/cryptography-46.0.3-cp38-abi3-manylinux_2_28_armv7l.manylinux_2_31_armv7l.whl", hash = "sha256:10ca84c4668d066a9878890047f03546f3ae0a6b8b39b697457b7757aaf18dbc", size = 4012280, upload-time = "2025-10-15T23:17:52.964Z" }, - { url = "https://files.pythonhosted.org/packages/6b/8f/9adb86b93330e0df8b3dcf03eae67c33ba89958fc2e03862ef1ac2b42465/cryptography-46.0.3-cp38-abi3-manylinux_2_28_ppc64le.whl", hash = "sha256:36e627112085bb3b81b19fed209c05ce2a52ee8b15d161b7c643a7d5a88491f3", size = 4978958, upload-time = "2025-10-15T23:17:54.965Z" }, - { url = "https://files.pythonhosted.org/packages/d1/a0/5fa77988289c34bdb9f913f5606ecc9ada1adb5ae870bd0d1054a7021cc4/cryptography-46.0.3-cp38-abi3-manylinux_2_28_x86_64.whl", hash = "sha256:1000713389b75c449a6e979ffc7dcc8ac90b437048766cef052d4d30b8220971", size = 4473714, upload-time = "2025-10-15T23:17:56.754Z" }, - { url = "https://files.pythonhosted.org/packages/14/e5/fc82d72a58d41c393697aa18c9abe5ae1214ff6f2a5c18ac470f92777895/cryptography-46.0.3-cp38-abi3-manylinux_2_34_aarch64.whl", hash = "sha256:b02cf04496f6576afffef5ddd04a0cb7d49cf6be16a9059d793a30b035f6b6ac", size = 4296970, upload-time = "2025-10-15T23:17:58.588Z" }, - { url = "https://files.pythonhosted.org/packages/78/06/5663ed35438d0b09056973994f1aec467492b33bd31da36e468b01ec1097/cryptography-46.0.3-cp38-abi3-manylinux_2_34_ppc64le.whl", hash = "sha256:71e842ec9bc7abf543b47cf86b9a743baa95f4677d22baa4c7d5c69e49e9bc04", size = 4940236, upload-time = "2025-10-15T23:18:00.897Z" }, - { url = "https://files.pythonhosted.org/packages/fc/59/873633f3f2dcd8a053b8dd1d38f783043b5fce589c0f6988bf55ef57e43e/cryptography-46.0.3-cp38-abi3-manylinux_2_34_x86_64.whl", hash = "sha256:402b58fc32614f00980b66d6e56a5b4118e6cb362ae8f3fda141ba4689bd4506", size = 4472642, upload-time = "2025-10-15T23:18:02.749Z" }, - { url = "https://files.pythonhosted.org/packages/3d/39/8e71f3930e40f6877737d6f69248cf74d4e34b886a3967d32f919cc50d3b/cryptography-46.0.3-cp38-abi3-musllinux_1_2_aarch64.whl", hash = "sha256:ef639cb3372f69ec44915fafcd6698b6cc78fbe0c2ea41be867f6ed612811963", size = 4423126, upload-time = "2025-10-15T23:18:04.85Z" }, - { url = "https://files.pythonhosted.org/packages/cd/c7/f65027c2810e14c3e7268353b1681932b87e5a48e65505d8cc17c99e36ae/cryptography-46.0.3-cp38-abi3-musllinux_1_2_x86_64.whl", hash = "sha256:3b51b8ca4f1c6453d8829e1eb7299499ca7f313900dd4d89a24b8b87c0a780d4", size = 4686573, upload-time = "2025-10-15T23:18:06.908Z" }, - { url = "https://files.pythonhosted.org/packages/0a/6e/1c8331ddf91ca4730ab3086a0f1be19c65510a33b5a441cb334e7a2d2560/cryptography-46.0.3-cp38-abi3-win32.whl", hash = "sha256:6276eb85ef938dc035d59b87c8a7dc559a232f954962520137529d77b18ff1df", size = 3036695, upload-time = "2025-10-15T23:18:08.672Z" }, - { url = "https://files.pythonhosted.org/packages/90/45/b0d691df20633eff80955a0fc7695ff9051ffce8b69741444bd9ed7bd0db/cryptography-46.0.3-cp38-abi3-win_amd64.whl", hash = "sha256:416260257577718c05135c55958b674000baef9a1c7d9e8f306ec60d71db850f", size = 3501720, upload-time = "2025-10-15T23:18:10.632Z" }, - { url = "https://files.pythonhosted.org/packages/e8/cb/2da4cc83f5edb9c3257d09e1e7ab7b23f049c7962cae8d842bbef0a9cec9/cryptography-46.0.3-cp38-abi3-win_arm64.whl", hash = "sha256:d89c3468de4cdc4f08a57e214384d0471911a3830fcdaf7a8cc587e42a866372", size = 2918740, upload-time = "2025-10-15T23:18:12.277Z" }, - { url = "https://files.pythonhosted.org/packages/06/8a/e60e46adab4362a682cf142c7dcb5bf79b782ab2199b0dcb81f55970807f/cryptography-46.0.3-pp311-pypy311_pp73-macosx_10_9_x86_64.whl", hash = "sha256:7ce938a99998ed3c8aa7e7272dca1a610401ede816d36d0693907d863b10d9ea", size = 3698132, upload-time = "2025-10-15T23:18:17.056Z" }, - { url = "https://files.pythonhosted.org/packages/da/38/f59940ec4ee91e93d3311f7532671a5cef5570eb04a144bf203b58552d11/cryptography-46.0.3-pp311-pypy311_pp73-manylinux_2_28_aarch64.whl", hash = "sha256:191bb60a7be5e6f54e30ba16fdfae78ad3a342a0599eb4193ba88e3f3d6e185b", size = 4243992, upload-time = "2025-10-15T23:18:18.695Z" }, - { url = "https://files.pythonhosted.org/packages/b0/0c/35b3d92ddebfdfda76bb485738306545817253d0a3ded0bfe80ef8e67aa5/cryptography-46.0.3-pp311-pypy311_pp73-manylinux_2_28_x86_64.whl", hash = "sha256:c70cc23f12726be8f8bc72e41d5065d77e4515efae3690326764ea1b07845cfb", size = 4409944, upload-time = "2025-10-15T23:18:20.597Z" }, - { url = "https://files.pythonhosted.org/packages/99/55/181022996c4063fc0e7666a47049a1ca705abb9c8a13830f074edb347495/cryptography-46.0.3-pp311-pypy311_pp73-manylinux_2_34_aarch64.whl", hash = "sha256:9394673a9f4de09e28b5356e7fff97d778f8abad85c9d5ac4a4b7e25a0de7717", size = 4242957, upload-time = "2025-10-15T23:18:22.18Z" }, - { url = "https://files.pythonhosted.org/packages/ba/af/72cd6ef29f9c5f731251acadaeb821559fe25f10852f44a63374c9ca08c1/cryptography-46.0.3-pp311-pypy311_pp73-manylinux_2_34_x86_64.whl", hash = "sha256:94cd0549accc38d1494e1f8de71eca837d0509d0d44bf11d158524b0e12cebf9", size = 4409447, upload-time = "2025-10-15T23:18:24.209Z" }, - { url = "https://files.pythonhosted.org/packages/0d/c3/e90f4a4feae6410f914f8ebac129b9ae7a8c92eb60a638012dde42030a9d/cryptography-46.0.3-pp311-pypy311_pp73-win_amd64.whl", hash = "sha256:6b5063083824e5509fdba180721d55909ffacccc8adbec85268b48439423d78c", size = 3438528, upload-time = "2025-10-15T23:18:26.227Z" }, + { url = "https://files.pythonhosted.org/packages/8d/99/157aae7949a5f30d51fcb1a9851e8ebd5c74bf99b5285d8bb4b8b9ee641e/cryptography-46.0.4-cp311-abi3-macosx_10_9_universal2.whl", hash = "sha256:281526e865ed4166009e235afadf3a4c4cba6056f99336a99efba65336fd5485", size = 7173686, upload-time = "2026-01-28T00:23:07.515Z" }, + { url = "https://files.pythonhosted.org/packages/87/91/874b8910903159043b5c6a123b7e79c4559ddd1896e38967567942635778/cryptography-46.0.4-cp311-abi3-manylinux2014_aarch64.manylinux_2_17_aarch64.whl", hash = "sha256:5f14fba5bf6f4390d7ff8f086c566454bff0411f6d8aa7af79c88b6f9267aecc", size = 4275871, upload-time = "2026-01-28T00:23:09.439Z" }, + { url = "https://files.pythonhosted.org/packages/c0/35/690e809be77896111f5b195ede56e4b4ed0435b428c2f2b6d35046fbb5e8/cryptography-46.0.4-cp311-abi3-manylinux2014_x86_64.manylinux_2_17_x86_64.whl", hash = "sha256:47bcd19517e6389132f76e2d5303ded6cf3f78903da2158a671be8de024f4cd0", size = 4423124, upload-time = "2026-01-28T00:23:11.529Z" }, + { url = "https://files.pythonhosted.org/packages/1a/5b/a26407d4f79d61ca4bebaa9213feafdd8806dc69d3d290ce24996d3cfe43/cryptography-46.0.4-cp311-abi3-manylinux_2_28_aarch64.whl", hash = "sha256:01df4f50f314fbe7009f54046e908d1754f19d0c6d3070df1e6268c5a4af09fa", size = 4277090, upload-time = "2026-01-28T00:23:13.123Z" }, + { url = "https://files.pythonhosted.org/packages/0c/d8/4bb7aec442a9049827aa34cee1aa83803e528fa55da9a9d45d01d1bb933e/cryptography-46.0.4-cp311-abi3-manylinux_2_28_ppc64le.whl", hash = "sha256:5aa3e463596b0087b3da0dbe2b2487e9fc261d25da85754e30e3b40637d61f81", size = 4947652, upload-time = "2026-01-28T00:23:14.554Z" }, + { url = "https://files.pythonhosted.org/packages/2b/08/f83e2e0814248b844265802d081f2fac2f1cbe6cd258e72ba14ff006823a/cryptography-46.0.4-cp311-abi3-manylinux_2_28_x86_64.whl", hash = "sha256:0a9ad24359fee86f131836a9ac3bffc9329e956624a2d379b613f8f8abaf5255", size = 4455157, upload-time = "2026-01-28T00:23:16.443Z" }, + { url = "https://files.pythonhosted.org/packages/0a/05/19d849cf4096448779d2dcc9bb27d097457dac36f7273ffa875a93b5884c/cryptography-46.0.4-cp311-abi3-manylinux_2_31_armv7l.whl", hash = "sha256:dc1272e25ef673efe72f2096e92ae39dea1a1a450dd44918b15351f72c5a168e", size = 3981078, upload-time = "2026-01-28T00:23:17.838Z" }, + { url = "https://files.pythonhosted.org/packages/e6/89/f7bac81d66ba7cde867a743ea5b37537b32b5c633c473002b26a226f703f/cryptography-46.0.4-cp311-abi3-manylinux_2_34_aarch64.whl", hash = "sha256:de0f5f4ec8711ebc555f54735d4c673fc34b65c44283895f1a08c2b49d2fd99c", size = 4276213, upload-time = "2026-01-28T00:23:19.257Z" }, + { url = "https://files.pythonhosted.org/packages/da/9f/7133e41f24edd827020ad21b068736e792bc68eecf66d93c924ad4719fb3/cryptography-46.0.4-cp311-abi3-manylinux_2_34_ppc64le.whl", hash = "sha256:eeeb2e33d8dbcccc34d64651f00a98cb41b2dc69cef866771a5717e6734dfa32", size = 4912190, upload-time = "2026-01-28T00:23:21.244Z" }, + { url = "https://files.pythonhosted.org/packages/a6/f7/6d43cbaddf6f65b24816e4af187d211f0bc536a29961f69faedc48501d8e/cryptography-46.0.4-cp311-abi3-manylinux_2_34_x86_64.whl", hash = "sha256:3d425eacbc9aceafd2cb429e42f4e5d5633c6f873f5e567077043ef1b9bbf616", size = 4454641, upload-time = "2026-01-28T00:23:22.866Z" }, + { url = "https://files.pythonhosted.org/packages/9e/4f/ebd0473ad656a0ac912a16bd07db0f5d85184924e14fc88feecae2492834/cryptography-46.0.4-cp311-abi3-musllinux_1_2_aarch64.whl", hash = "sha256:91627ebf691d1ea3976a031b61fb7bac1ccd745afa03602275dda443e11c8de0", size = 4405159, upload-time = "2026-01-28T00:23:25.278Z" }, + { url = "https://files.pythonhosted.org/packages/d1/f7/7923886f32dc47e27adeff8246e976d77258fd2aa3efdd1754e4e323bf49/cryptography-46.0.4-cp311-abi3-musllinux_1_2_x86_64.whl", hash = "sha256:2d08bc22efd73e8854b0b7caff402d735b354862f1145d7be3b9c0f740fef6a0", size = 4666059, upload-time = "2026-01-28T00:23:26.766Z" }, + { url = "https://files.pythonhosted.org/packages/eb/a7/0fca0fd3591dffc297278a61813d7f661a14243dd60f499a7a5b48acb52a/cryptography-46.0.4-cp311-abi3-win32.whl", hash = "sha256:82a62483daf20b8134f6e92898da70d04d0ef9a75829d732ea1018678185f4f5", size = 3026378, upload-time = "2026-01-28T00:23:28.317Z" }, + { url = "https://files.pythonhosted.org/packages/2d/12/652c84b6f9873f0909374864a57b003686c642ea48c84d6c7e2c515e6da5/cryptography-46.0.4-cp311-abi3-win_amd64.whl", hash = "sha256:6225d3ebe26a55dbc8ead5ad1265c0403552a63336499564675b29eb3184c09b", size = 3478614, upload-time = "2026-01-28T00:23:30.275Z" }, + { url = "https://files.pythonhosted.org/packages/56/f7/f648fdbb61d0d45902d3f374217451385edc7e7768d1b03ff1d0e5ffc17b/cryptography-46.0.4-cp38-abi3-macosx_10_9_universal2.whl", hash = "sha256:a9556ba711f7c23f77b151d5798f3ac44a13455cc68db7697a1096e6d0563cab", size = 7169583, upload-time = "2026-01-28T00:23:56.558Z" }, + { url = "https://files.pythonhosted.org/packages/d8/cc/8f3224cbb2a928de7298d6ed4790f5ebc48114e02bdc9559196bfb12435d/cryptography-46.0.4-cp38-abi3-manylinux2014_aarch64.manylinux_2_17_aarch64.whl", hash = "sha256:8bf75b0259e87fa70bddc0b8b4078b76e7fd512fd9afae6c1193bcf440a4dbef", size = 4275419, upload-time = "2026-01-28T00:23:58.364Z" }, + { url = "https://files.pythonhosted.org/packages/17/43/4a18faa7a872d00e4264855134ba82d23546c850a70ff209e04ee200e76f/cryptography-46.0.4-cp38-abi3-manylinux2014_x86_64.manylinux_2_17_x86_64.whl", hash = "sha256:3c268a3490df22270955966ba236d6bc4a8f9b6e4ffddb78aac535f1a5ea471d", size = 4419058, upload-time = "2026-01-28T00:23:59.867Z" }, + { url = "https://files.pythonhosted.org/packages/ee/64/6651969409821d791ba12346a124f55e1b76f66a819254ae840a965d4b9c/cryptography-46.0.4-cp38-abi3-manylinux_2_28_aarch64.whl", hash = "sha256:812815182f6a0c1d49a37893a303b44eaac827d7f0d582cecfc81b6427f22973", size = 4278151, upload-time = "2026-01-28T00:24:01.731Z" }, + { url = "https://files.pythonhosted.org/packages/20/0b/a7fce65ee08c3c02f7a8310cc090a732344066b990ac63a9dfd0a655d321/cryptography-46.0.4-cp38-abi3-manylinux_2_28_ppc64le.whl", hash = "sha256:a90e43e3ef65e6dcf969dfe3bb40cbf5aef0d523dff95bfa24256be172a845f4", size = 4939441, upload-time = "2026-01-28T00:24:03.175Z" }, + { url = "https://files.pythonhosted.org/packages/db/a7/20c5701e2cd3e1dfd7a19d2290c522a5f435dd30957d431dcb531d0f1413/cryptography-46.0.4-cp38-abi3-manylinux_2_28_x86_64.whl", hash = "sha256:a05177ff6296644ef2876fce50518dffb5bcdf903c85250974fc8bc85d54c0af", size = 4451617, upload-time = "2026-01-28T00:24:05.403Z" }, + { url = "https://files.pythonhosted.org/packages/00/dc/3e16030ea9aa47b63af6524c354933b4fb0e352257c792c4deeb0edae367/cryptography-46.0.4-cp38-abi3-manylinux_2_31_armv7l.whl", hash = "sha256:daa392191f626d50f1b136c9b4cf08af69ca8279d110ea24f5c2700054d2e263", size = 3977774, upload-time = "2026-01-28T00:24:06.851Z" }, + { url = "https://files.pythonhosted.org/packages/42/c8/ad93f14118252717b465880368721c963975ac4b941b7ef88f3c56bf2897/cryptography-46.0.4-cp38-abi3-manylinux_2_34_aarch64.whl", hash = "sha256:e07ea39c5b048e085f15923511d8121e4a9dc45cee4e3b970ca4f0d338f23095", size = 4277008, upload-time = "2026-01-28T00:24:08.926Z" }, + { url = "https://files.pythonhosted.org/packages/00/cf/89c99698151c00a4631fbfcfcf459d308213ac29e321b0ff44ceeeac82f1/cryptography-46.0.4-cp38-abi3-manylinux_2_34_ppc64le.whl", hash = "sha256:d5a45ddc256f492ce42a4e35879c5e5528c09cd9ad12420828c972951d8e016b", size = 4903339, upload-time = "2026-01-28T00:24:12.009Z" }, + { url = "https://files.pythonhosted.org/packages/03/c3/c90a2cb358de4ac9309b26acf49b2a100957e1ff5cc1e98e6c4996576710/cryptography-46.0.4-cp38-abi3-manylinux_2_34_x86_64.whl", hash = "sha256:6bb5157bf6a350e5b28aee23beb2d84ae6f5be390b2f8ee7ea179cda077e1019", size = 4451216, upload-time = "2026-01-28T00:24:13.975Z" }, + { url = "https://files.pythonhosted.org/packages/96/2c/8d7f4171388a10208671e181ca43cdc0e596d8259ebacbbcfbd16de593da/cryptography-46.0.4-cp38-abi3-musllinux_1_2_aarch64.whl", hash = "sha256:dd5aba870a2c40f87a3af043e0dee7d9eb02d4aff88a797b48f2b43eff8c3ab4", size = 4404299, upload-time = "2026-01-28T00:24:16.169Z" }, + { url = "https://files.pythonhosted.org/packages/e9/23/cbb2036e450980f65c6e0a173b73a56ff3bccd8998965dea5cc9ddd424a5/cryptography-46.0.4-cp38-abi3-musllinux_1_2_x86_64.whl", hash = "sha256:93d8291da8d71024379ab2cb0b5c57915300155ad42e07f76bea6ad838d7e59b", size = 4664837, upload-time = "2026-01-28T00:24:17.629Z" }, + { url = "https://files.pythonhosted.org/packages/0a/21/f7433d18fe6d5845329cbdc597e30caf983229c7a245bcf54afecc555938/cryptography-46.0.4-cp38-abi3-win32.whl", hash = "sha256:0563655cb3c6d05fb2afe693340bc050c30f9f34e15763361cf08e94749401fc", size = 3009779, upload-time = "2026-01-28T00:24:20.198Z" }, + { url = "https://files.pythonhosted.org/packages/3a/6a/bd2e7caa2facffedf172a45c1a02e551e6d7d4828658c9a245516a598d94/cryptography-46.0.4-cp38-abi3-win_amd64.whl", hash = "sha256:fa0900b9ef9c49728887d1576fd8d9e7e3ea872fa9b25ef9b64888adc434e976", size = 3466633, upload-time = "2026-01-28T00:24:21.851Z" }, + { url = "https://files.pythonhosted.org/packages/59/e0/f9c6c53e1f2a1c2507f00f2faba00f01d2f334b35b0fbfe5286715da2184/cryptography-46.0.4-pp311-pypy311_pp73-macosx_11_0_arm64.whl", hash = "sha256:766330cce7416c92b5e90c3bb71b1b79521760cdcfc3a6a1a182d4c9fab23d2b", size = 3476316, upload-time = "2026-01-28T00:24:24.144Z" }, + { url = "https://files.pythonhosted.org/packages/27/7a/f8d2d13227a9a1a9fe9c7442b057efecffa41f1e3c51d8622f26b9edbe8f/cryptography-46.0.4-pp311-pypy311_pp73-manylinux_2_28_aarch64.whl", hash = "sha256:c236a44acfb610e70f6b3e1c3ca20ff24459659231ef2f8c48e879e2d32b73da", size = 4216693, upload-time = "2026-01-28T00:24:25.758Z" }, + { url = "https://files.pythonhosted.org/packages/c5/de/3787054e8f7972658370198753835d9d680f6cd4a39df9f877b57f0dd69c/cryptography-46.0.4-pp311-pypy311_pp73-manylinux_2_28_x86_64.whl", hash = "sha256:8a15fb869670efa8f83cbffbc8753c1abf236883225aed74cd179b720ac9ec80", size = 4382765, upload-time = "2026-01-28T00:24:27.577Z" }, + { url = "https://files.pythonhosted.org/packages/8a/5f/60e0afb019973ba6a0b322e86b3d61edf487a4f5597618a430a2a15f2d22/cryptography-46.0.4-pp311-pypy311_pp73-manylinux_2_34_aarch64.whl", hash = "sha256:fdc3daab53b212472f1524d070735b2f0c214239df131903bae1d598016fa822", size = 4216066, upload-time = "2026-01-28T00:24:29.056Z" }, + { url = "https://files.pythonhosted.org/packages/81/8e/bf4a0de294f147fee66f879d9bae6f8e8d61515558e3d12785dd90eca0be/cryptography-46.0.4-pp311-pypy311_pp73-manylinux_2_34_x86_64.whl", hash = "sha256:44cc0675b27cadb71bdbb96099cca1fa051cd11d2ade09e5cd3a2edb929ed947", size = 4382025, upload-time = "2026-01-28T00:24:30.681Z" }, + { url = "https://files.pythonhosted.org/packages/79/f4/9ceb90cfd6a3847069b0b0b353fd3075dc69b49defc70182d8af0c4ca390/cryptography-46.0.4-pp311-pypy311_pp73-win_amd64.whl", hash = "sha256:be8c01a7d5a55f9a47d1888162b76c8f49d62b234d88f0ff91a9fbebe32ffbc3", size = 3406043, upload-time = "2026-01-28T00:24:32.236Z" }, ] [[package]] name = "databricks-sdk" -version = "0.80.0" +version = "0.82.0" source = { registry = "https://pypi.org/simple" } dependencies = [ { name = "google-auth" }, { name = "protobuf" }, { name = "requests" }, ] -sdist = { url = "https://files.pythonhosted.org/packages/e7/b0/adeac3cdbb8fd286565b93af7779c8c21966f437bfd1dec0bde3e243fbd6/databricks_sdk-0.80.0.tar.gz", hash = "sha256:53e5228edd12caf619f4fd3c3d62fddd3ff4d5b30e1680e6b6aec68ac40e770b", size = 837547, upload-time = "2026-01-22T20:30:50.858Z" } +sdist = { url = "https://files.pythonhosted.org/packages/91/ae/dbc1a08b969e48a63e1df2be535caecb16c9eaefd03277065ee1aa2aaf3c/databricks_sdk-0.82.0.tar.gz", hash = "sha256:148399cb0d15d63000e2db2a2a354b3640494cb0ed78e939d3e99a676c3f7ec0", size = 838560, upload-time = "2026-01-29T12:48:30.479Z" } wheels = [ - { url = "https://files.pythonhosted.org/packages/08/71/ac3a16e620e4de7cea10695d7e926a6b00d7790208a932d81dd0b3136772/databricks_sdk-0.80.0-py3-none-any.whl", hash = "sha256:e654b08b945b4fc9651cfe65389035382c0885d74435123a0e4860d007fc963b", size = 788323, upload-time = "2026-01-22T20:30:49.372Z" }, + { url = "https://files.pythonhosted.org/packages/81/0f/f1e2c17c4e5d37eb0135bbc816c9e77d39d34365d8e3a5bf699a9efc39ea/databricks_sdk-0.82.0-py3-none-any.whl", hash = "sha256:927fc575d3019be910839bceba332e7252a0d4e588df64c47e44dd416bd561c8", size = 789223, upload-time = "2026-01-29T12:48:28.369Z" }, ] [[package]] @@ -1605,6 +1603,7 @@ dev = [ { name = "pytest-env" }, { name = "pytest-mock" }, { name = "pytest-timeout" }, + { name = "pytest-xdist" }, { name = "ruff" }, { name = "scipy-stubs" }, { name = "sseclient-py" }, @@ -1810,6 +1809,7 @@ dev = [ { name = "pytest-env", specifier = "~=1.1.3" }, { name = "pytest-mock", specifier = "~=3.14.0" }, { name = "pytest-timeout", specifier = ">=2.4.0" }, + { name = "pytest-xdist", specifier = ">=3.8.0" }, { name = "ruff", specifier = "~=0.14.0" }, { name = "scipy-stubs", specifier = ">=1.15.3.0" }, { name = "sseclient-py", specifier = ">=1.8.0" }, @@ -1976,7 +1976,7 @@ wheels = [ [[package]] name = "e2b" -version = "2.10.2" +version = "2.12.1" source = { registry = "https://pypi.org/simple" } dependencies = [ { name = "attrs" }, @@ -1990,9 +1990,9 @@ dependencies = [ { name = "typing-extensions" }, { name = "wcmatch" }, ] -sdist = { url = "https://files.pythonhosted.org/packages/10/16/afd0b78b12bc50570ec3a3cd6d668e3c112aa250e02a7cc10fd7fc717142/e2b-2.10.2.tar.gz", hash = "sha256:b77ecd620fd057b81a9610da18141811c003cc6f446c39c7ec7b9e9dc147d864", size = 114601, upload-time = "2026-01-15T16:44:44.88Z" } +sdist = { url = "https://files.pythonhosted.org/packages/13/f4/b5b7c8d6d4008e7da36107d84b9faa0ae5ca6faf3dce5f20990c2e7334e3/e2b-2.12.1.tar.gz", hash = "sha256:663c938327c4974344038b9d2927c99b28ab70a88c796bc0cb9a0cbb8b791517", size = 117187, upload-time = "2026-01-27T22:38:18.388Z" } wheels = [ - { url = "https://files.pythonhosted.org/packages/10/ab/54d17995ef09436120464fc997b5399c0920c95bc007efc315ba5518349d/e2b-2.10.2-py3-none-any.whl", hash = "sha256:c719291fc9b3006b286809f6e820b803a1aab9a6f5ae4fe0140ead17efbce821", size = 213497, upload-time = "2026-01-15T16:44:43.067Z" }, + { url = "https://files.pythonhosted.org/packages/73/f1/787e8f4cc35e96040c4a232d0a4cd84eb1328cf4ab6cd7d47348adc0dc19/e2b-2.12.1-py3-none-any.whl", hash = "sha256:4b021a226afe8f42411a1cd6c22b8c2ff92f37394d7b2461a410c78b1a4504d7", size = 220080, upload-time = "2026-01-27T22:38:17.027Z" }, ] [[package]] @@ -2076,6 +2076,15 @@ wheels = [ { url = "https://files.pythonhosted.org/packages/c1/8b/5fe2cc11fee489817272089c4203e679c63b570a5aaeb18d852ae3cbba6a/et_xmlfile-2.0.0-py3-none-any.whl", hash = "sha256:7a91720bc756843502c3b7504c77b8fe44217c85c537d85037f0f536151b2caa", size = 18059, upload-time = "2024-10-25T17:25:39.051Z" }, ] +[[package]] +name = "execnet" +version = "2.1.2" +source = { registry = "https://pypi.org/simple" } +sdist = { url = "https://files.pythonhosted.org/packages/bf/89/780e11f9588d9e7128a3f87788354c7946a9cbb1401ad38a48c4db9a4f07/execnet-2.1.2.tar.gz", hash = "sha256:63d83bfdd9a23e35b9c6a3261412324f964c2ec8dcd8d3c6916ee9373e0befcd", size = 166622, upload-time = "2025-11-12T09:56:37.75Z" } +wheels = [ + { url = "https://files.pythonhosted.org/packages/ab/84/02fc1827e8cdded4aa65baef11296a9bbe595c474f0d6d758af082d849fd/execnet-2.1.2-py3-none-any.whl", hash = "sha256:67fba928dd5a544b783f6056f449e5e3931a5c378b128bc18501f7ea79e296ec", size = 40708, upload-time = "2025-11-12T09:56:36.333Z" }, +] + [[package]] name = "faker" version = "38.2.0" @@ -3093,14 +3102,14 @@ wheels = [ [[package]] name = "hypothesis" -version = "6.150.3" +version = "6.151.4" source = { registry = "https://pypi.org/simple" } dependencies = [ { name = "sortedcontainers" }, ] -sdist = { url = "https://files.pythonhosted.org/packages/86/eb/1b0359b52d2136a7f4b8112e60148ef8b5ba480d450f409ed63e18d4c8d2/hypothesis-6.150.3.tar.gz", hash = "sha256:32c88d4b7df3a8483e69877561b520320bf7779b0709c11869e392025e9279d4", size = 475542, upload-time = "2026-01-23T07:53:09.716Z" } +sdist = { url = "https://files.pythonhosted.org/packages/47/03/9fd03d5db09029250e69745c1600edab16fe90947636f77a12ba92d79939/hypothesis-6.151.4.tar.gz", hash = "sha256:658a62da1c3ccb36746ac2f7dc4bb1a6e76bd314e0dc54c4e1aaba2503d5545c", size = 475706, upload-time = "2026-01-29T01:30:14.985Z" } wheels = [ - { url = "https://files.pythonhosted.org/packages/69/6b/c94ca780814aa4e73b52a8d5e9092f0b83f4732a9ecb3d1f8333c93ac131/hypothesis-6.150.3-py3-none-any.whl", hash = "sha256:5577c0f8eff5ac54a8aff1ce32e30c5454167c29360fdabf1bfea0539b1689f9", size = 542960, upload-time = "2026-01-23T07:53:07.309Z" }, + { url = "https://files.pythonhosted.org/packages/9b/6d/01ad1b6c3b8cb2bb47eeaa9765dabc27cbe68e3b59f6cff83d5668f57780/hypothesis-6.151.4-py3-none-any.whl", hash = "sha256:a1cf7e0fdaa296d697a68ff3c0b3912c0050f07aa37e7d2ff33a966749d1d9b4", size = 543146, upload-time = "2026-01-29T01:30:12.805Z" }, ] [[package]] @@ -3620,7 +3629,7 @@ wheels = [ [[package]] name = "mlflow-skinny" -version = "3.8.1" +version = "3.9.0" source = { registry = "https://pypi.org/simple" } dependencies = [ { name = "cachetools" }, @@ -3643,9 +3652,9 @@ dependencies = [ { name = "typing-extensions" }, { name = "uvicorn" }, ] -sdist = { url = "https://files.pythonhosted.org/packages/7e/00/18486d9072739e63471c1e441e78cdb6a10c641312d98f6699715406451e/mlflow_skinny-3.8.1.tar.gz", hash = "sha256:0c0aade08187030a4653e267bcd63de2f12cbfebf4c6737832cba45d6fb3594d", size = 2082226, upload-time = "2025-12-26T16:30:11.171Z" } +sdist = { url = "https://files.pythonhosted.org/packages/fc/18/34a8c085eece1abb7edaed3b9a383670b97a4a234fec62d1823e8c64d11b/mlflow_skinny-3.9.0.tar.gz", hash = "sha256:0598e0635dd1af9d195fb429210819aa4b56e9d6014f87134241f2325d57a290", size = 2329309, upload-time = "2026-01-29T07:42:36.8Z" } wheels = [ - { url = "https://files.pythonhosted.org/packages/8d/24/42e52320636fcbabeaf50704f9269a328acc995e1b8a44df6fea33130a0a/mlflow_skinny-3.8.1-py3-none-any.whl", hash = "sha256:3a6ee27f5ac1e67c1d565fa0e12c070b27129b03e669dcaf88ff841176429142", size = 2506002, upload-time = "2025-12-26T16:30:09.357Z" }, + { url = "https://files.pythonhosted.org/packages/c0/7c/a82fd9d6ecefba347e3a65168df63fd79784fa8c22b8734fb4cb71f2d469/mlflow_skinny-3.9.0-py3-none-any.whl", hash = "sha256:9b98706cdf9e07a61da7fbcd717c8d35ac89c76e084d25aafdbc150028e832d5", size = 2807062, upload-time = "2026-01-29T07:42:35.132Z" }, ] [[package]] @@ -3739,47 +3748,47 @@ wheels = [ [[package]] name = "multidict" -version = "6.7.0" +version = "6.7.1" source = { registry = "https://pypi.org/simple" } -sdist = { url = "https://files.pythonhosted.org/packages/80/1e/5492c365f222f907de1039b91f922b93fa4f764c713ee858d235495d8f50/multidict-6.7.0.tar.gz", hash = "sha256:c6e99d9a65ca282e578dfea819cfa9c0a62b2499d8677392e09feaf305e9e6f5", size = 101834, upload-time = "2025-10-06T14:52:30.657Z" } +sdist = { url = "https://files.pythonhosted.org/packages/1a/c2/c2d94cbe6ac1753f3fc980da97b3d930efe1da3af3c9f5125354436c073d/multidict-6.7.1.tar.gz", hash = "sha256:ec6652a1bee61c53a3e5776b6049172c53b6aaba34f18c9ad04f82712bac623d", size = 102010, upload-time = "2026-01-26T02:46:45.979Z" } wheels = [ - { url = "https://files.pythonhosted.org/packages/34/9e/5c727587644d67b2ed479041e4b1c58e30afc011e3d45d25bbe35781217c/multidict-6.7.0-cp311-cp311-macosx_10_9_universal2.whl", hash = "sha256:4d409aa42a94c0b3fa617708ef5276dfe81012ba6753a0370fcc9d0195d0a1fc", size = 76604, upload-time = "2025-10-06T14:48:54.277Z" }, - { url = "https://files.pythonhosted.org/packages/17/e4/67b5c27bd17c085a5ea8f1ec05b8a3e5cba0ca734bfcad5560fb129e70ca/multidict-6.7.0-cp311-cp311-macosx_10_9_x86_64.whl", hash = "sha256:14c9e076eede3b54c636f8ce1c9c252b5f057c62131211f0ceeec273810c9721", size = 44715, upload-time = "2025-10-06T14:48:55.445Z" }, - { url = "https://files.pythonhosted.org/packages/4d/e1/866a5d77be6ea435711bef2a4291eed11032679b6b28b56b4776ab06ba3e/multidict-6.7.0-cp311-cp311-macosx_11_0_arm64.whl", hash = "sha256:4c09703000a9d0fa3c3404b27041e574cc7f4df4c6563873246d0e11812a94b6", size = 44332, upload-time = "2025-10-06T14:48:56.706Z" }, - { url = "https://files.pythonhosted.org/packages/31/61/0c2d50241ada71ff61a79518db85ada85fdabfcf395d5968dae1cbda04e5/multidict-6.7.0-cp311-cp311-manylinux1_i686.manylinux_2_28_i686.manylinux_2_5_i686.whl", hash = "sha256:a265acbb7bb33a3a2d626afbe756371dce0279e7b17f4f4eda406459c2b5ff1c", size = 245212, upload-time = "2025-10-06T14:48:58.042Z" }, - { url = "https://files.pythonhosted.org/packages/ac/e0/919666a4e4b57fff1b57f279be1c9316e6cdc5de8a8b525d76f6598fefc7/multidict-6.7.0-cp311-cp311-manylinux2014_aarch64.manylinux_2_17_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:51cb455de290ae462593e5b1cb1118c5c22ea7f0d3620d9940bf695cea5a4bd7", size = 246671, upload-time = "2025-10-06T14:49:00.004Z" }, - { url = "https://files.pythonhosted.org/packages/a1/cc/d027d9c5a520f3321b65adea289b965e7bcbd2c34402663f482648c716ce/multidict-6.7.0-cp311-cp311-manylinux2014_armv7l.manylinux_2_17_armv7l.manylinux_2_31_armv7l.whl", hash = "sha256:db99677b4457c7a5c5a949353e125ba72d62b35f74e26da141530fbb012218a7", size = 225491, upload-time = "2025-10-06T14:49:01.393Z" }, - { url = "https://files.pythonhosted.org/packages/75/c4/bbd633980ce6155a28ff04e6a6492dd3335858394d7bb752d8b108708558/multidict-6.7.0-cp311-cp311-manylinux2014_ppc64le.manylinux_2_17_ppc64le.manylinux_2_28_ppc64le.whl", hash = "sha256:f470f68adc395e0183b92a2f4689264d1ea4b40504a24d9882c27375e6662bb9", size = 257322, upload-time = "2025-10-06T14:49:02.745Z" }, - { url = "https://files.pythonhosted.org/packages/4c/6d/d622322d344f1f053eae47e033b0b3f965af01212de21b10bcf91be991fb/multidict-6.7.0-cp311-cp311-manylinux2014_s390x.manylinux_2_17_s390x.manylinux_2_28_s390x.whl", hash = "sha256:0db4956f82723cc1c270de9c6e799b4c341d327762ec78ef82bb962f79cc07d8", size = 254694, upload-time = "2025-10-06T14:49:04.15Z" }, - { url = "https://files.pythonhosted.org/packages/a8/9f/78f8761c2705d4c6d7516faed63c0ebdac569f6db1bef95e0d5218fdc146/multidict-6.7.0-cp311-cp311-manylinux2014_x86_64.manylinux_2_17_x86_64.manylinux_2_28_x86_64.whl", hash = "sha256:3e56d780c238f9e1ae66a22d2adf8d16f485381878250db8d496623cd38b22bd", size = 246715, upload-time = "2025-10-06T14:49:05.967Z" }, - { url = "https://files.pythonhosted.org/packages/78/59/950818e04f91b9c2b95aab3d923d9eabd01689d0dcd889563988e9ea0fd8/multidict-6.7.0-cp311-cp311-musllinux_1_2_aarch64.whl", hash = "sha256:9d14baca2ee12c1a64740d4531356ba50b82543017f3ad6de0deb943c5979abb", size = 243189, upload-time = "2025-10-06T14:49:07.37Z" }, - { url = "https://files.pythonhosted.org/packages/7a/3d/77c79e1934cad2ee74991840f8a0110966d9599b3af95964c0cd79bb905b/multidict-6.7.0-cp311-cp311-musllinux_1_2_armv7l.whl", hash = "sha256:295a92a76188917c7f99cda95858c822f9e4aae5824246bba9b6b44004ddd0a6", size = 237845, upload-time = "2025-10-06T14:49:08.759Z" }, - { url = "https://files.pythonhosted.org/packages/63/1b/834ce32a0a97a3b70f86437f685f880136677ac00d8bce0027e9fd9c2db7/multidict-6.7.0-cp311-cp311-musllinux_1_2_i686.whl", hash = "sha256:39f1719f57adbb767ef592a50ae5ebb794220d1188f9ca93de471336401c34d2", size = 246374, upload-time = "2025-10-06T14:49:10.574Z" }, - { url = "https://files.pythonhosted.org/packages/23/ef/43d1c3ba205b5dec93dc97f3fba179dfa47910fc73aaaea4f7ceb41cec2a/multidict-6.7.0-cp311-cp311-musllinux_1_2_ppc64le.whl", hash = "sha256:0a13fb8e748dfc94749f622de065dd5c1def7e0d2216dba72b1d8069a389c6ff", size = 253345, upload-time = "2025-10-06T14:49:12.331Z" }, - { url = "https://files.pythonhosted.org/packages/6b/03/eaf95bcc2d19ead522001f6a650ef32811aa9e3624ff0ad37c445c7a588c/multidict-6.7.0-cp311-cp311-musllinux_1_2_s390x.whl", hash = "sha256:e3aa16de190d29a0ea1b48253c57d99a68492c8dd8948638073ab9e74dc9410b", size = 246940, upload-time = "2025-10-06T14:49:13.821Z" }, - { url = "https://files.pythonhosted.org/packages/e8/df/ec8a5fd66ea6cd6f525b1fcbb23511b033c3e9bc42b81384834ffa484a62/multidict-6.7.0-cp311-cp311-musllinux_1_2_x86_64.whl", hash = "sha256:a048ce45dcdaaf1defb76b2e684f997fb5abf74437b6cb7b22ddad934a964e34", size = 242229, upload-time = "2025-10-06T14:49:15.603Z" }, - { url = "https://files.pythonhosted.org/packages/8a/a2/59b405d59fd39ec86d1142630e9049243015a5f5291ba49cadf3c090c541/multidict-6.7.0-cp311-cp311-win32.whl", hash = "sha256:a90af66facec4cebe4181b9e62a68be65e45ac9b52b67de9eec118701856e7ff", size = 41308, upload-time = "2025-10-06T14:49:16.871Z" }, - { url = "https://files.pythonhosted.org/packages/32/0f/13228f26f8b882c34da36efa776c3b7348455ec383bab4a66390e42963ae/multidict-6.7.0-cp311-cp311-win_amd64.whl", hash = "sha256:95b5ffa4349df2887518bb839409bcf22caa72d82beec453216802f475b23c81", size = 46037, upload-time = "2025-10-06T14:49:18.457Z" }, - { url = "https://files.pythonhosted.org/packages/84/1f/68588e31b000535a3207fd3c909ebeec4fb36b52c442107499c18a896a2a/multidict-6.7.0-cp311-cp311-win_arm64.whl", hash = "sha256:329aa225b085b6f004a4955271a7ba9f1087e39dcb7e65f6284a988264a63912", size = 43023, upload-time = "2025-10-06T14:49:19.648Z" }, - { url = "https://files.pythonhosted.org/packages/c2/9e/9f61ac18d9c8b475889f32ccfa91c9f59363480613fc807b6e3023d6f60b/multidict-6.7.0-cp312-cp312-macosx_10_13_universal2.whl", hash = "sha256:8a3862568a36d26e650a19bb5cbbba14b71789032aebc0423f8cc5f150730184", size = 76877, upload-time = "2025-10-06T14:49:20.884Z" }, - { url = "https://files.pythonhosted.org/packages/38/6f/614f09a04e6184f8824268fce4bc925e9849edfa654ddd59f0b64508c595/multidict-6.7.0-cp312-cp312-macosx_10_13_x86_64.whl", hash = "sha256:960c60b5849b9b4f9dcc9bea6e3626143c252c74113df2c1540aebce70209b45", size = 45467, upload-time = "2025-10-06T14:49:22.054Z" }, - { url = "https://files.pythonhosted.org/packages/b3/93/c4f67a436dd026f2e780c433277fff72be79152894d9fc36f44569cab1a6/multidict-6.7.0-cp312-cp312-macosx_11_0_arm64.whl", hash = "sha256:2049be98fb57a31b4ccf870bf377af2504d4ae35646a19037ec271e4c07998aa", size = 43834, upload-time = "2025-10-06T14:49:23.566Z" }, - { url = "https://files.pythonhosted.org/packages/7f/f5/013798161ca665e4a422afbc5e2d9e4070142a9ff8905e482139cd09e4d0/multidict-6.7.0-cp312-cp312-manylinux1_i686.manylinux_2_28_i686.manylinux_2_5_i686.whl", hash = "sha256:0934f3843a1860dd465d38895c17fce1f1cb37295149ab05cd1b9a03afacb2a7", size = 250545, upload-time = "2025-10-06T14:49:24.882Z" }, - { url = "https://files.pythonhosted.org/packages/71/2f/91dbac13e0ba94669ea5119ba267c9a832f0cb65419aca75549fcf09a3dc/multidict-6.7.0-cp312-cp312-manylinux2014_aarch64.manylinux_2_17_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:b3e34f3a1b8131ba06f1a73adab24f30934d148afcd5f5de9a73565a4404384e", size = 258305, upload-time = "2025-10-06T14:49:26.778Z" }, - { url = "https://files.pythonhosted.org/packages/ef/b0/754038b26f6e04488b48ac621f779c341338d78503fb45403755af2df477/multidict-6.7.0-cp312-cp312-manylinux2014_armv7l.manylinux_2_17_armv7l.manylinux_2_31_armv7l.whl", hash = "sha256:efbb54e98446892590dc2458c19c10344ee9a883a79b5cec4bc34d6656e8d546", size = 242363, upload-time = "2025-10-06T14:49:28.562Z" }, - { url = "https://files.pythonhosted.org/packages/87/15/9da40b9336a7c9fa606c4cf2ed80a649dffeb42b905d4f63a1d7eb17d746/multidict-6.7.0-cp312-cp312-manylinux2014_ppc64le.manylinux_2_17_ppc64le.manylinux_2_28_ppc64le.whl", hash = "sha256:a35c5fc61d4f51eb045061e7967cfe3123d622cd500e8868e7c0c592a09fedc4", size = 268375, upload-time = "2025-10-06T14:49:29.96Z" }, - { url = "https://files.pythonhosted.org/packages/82/72/c53fcade0cc94dfaad583105fd92b3a783af2091eddcb41a6d5a52474000/multidict-6.7.0-cp312-cp312-manylinux2014_s390x.manylinux_2_17_s390x.manylinux_2_28_s390x.whl", hash = "sha256:29fe6740ebccba4175af1b9b87bf553e9c15cd5868ee967e010efcf94e4fd0f1", size = 269346, upload-time = "2025-10-06T14:49:31.404Z" }, - { url = "https://files.pythonhosted.org/packages/0d/e2/9baffdae21a76f77ef8447f1a05a96ec4bc0a24dae08767abc0a2fe680b8/multidict-6.7.0-cp312-cp312-manylinux2014_x86_64.manylinux_2_17_x86_64.manylinux_2_28_x86_64.whl", hash = "sha256:123e2a72e20537add2f33a79e605f6191fba2afda4cbb876e35c1a7074298a7d", size = 256107, upload-time = "2025-10-06T14:49:32.974Z" }, - { url = "https://files.pythonhosted.org/packages/3c/06/3f06f611087dc60d65ef775f1fb5aca7c6d61c6db4990e7cda0cef9b1651/multidict-6.7.0-cp312-cp312-musllinux_1_2_aarch64.whl", hash = "sha256:b284e319754366c1aee2267a2036248b24eeb17ecd5dc16022095e747f2f4304", size = 253592, upload-time = "2025-10-06T14:49:34.52Z" }, - { url = "https://files.pythonhosted.org/packages/20/24/54e804ec7945b6023b340c412ce9c3f81e91b3bf5fa5ce65558740141bee/multidict-6.7.0-cp312-cp312-musllinux_1_2_armv7l.whl", hash = "sha256:803d685de7be4303b5a657b76e2f6d1240e7e0a8aa2968ad5811fa2285553a12", size = 251024, upload-time = "2025-10-06T14:49:35.956Z" }, - { url = "https://files.pythonhosted.org/packages/14/48/011cba467ea0b17ceb938315d219391d3e421dfd35928e5dbdc3f4ae76ef/multidict-6.7.0-cp312-cp312-musllinux_1_2_i686.whl", hash = "sha256:c04a328260dfd5db8c39538f999f02779012268f54614902d0afc775d44e0a62", size = 251484, upload-time = "2025-10-06T14:49:37.631Z" }, - { url = "https://files.pythonhosted.org/packages/0d/2f/919258b43bb35b99fa127435cfb2d91798eb3a943396631ef43e3720dcf4/multidict-6.7.0-cp312-cp312-musllinux_1_2_ppc64le.whl", hash = "sha256:8a19cdb57cd3df4cd865849d93ee14920fb97224300c88501f16ecfa2604b4e0", size = 263579, upload-time = "2025-10-06T14:49:39.502Z" }, - { url = "https://files.pythonhosted.org/packages/31/22/a0e884d86b5242b5a74cf08e876bdf299e413016b66e55511f7a804a366e/multidict-6.7.0-cp312-cp312-musllinux_1_2_s390x.whl", hash = "sha256:9b2fd74c52accced7e75de26023b7dccee62511a600e62311b918ec5c168fc2a", size = 259654, upload-time = "2025-10-06T14:49:41.32Z" }, - { url = "https://files.pythonhosted.org/packages/b2/e5/17e10e1b5c5f5a40f2fcbb45953c9b215f8a4098003915e46a93f5fcaa8f/multidict-6.7.0-cp312-cp312-musllinux_1_2_x86_64.whl", hash = "sha256:3e8bfdd0e487acf992407a140d2589fe598238eaeffa3da8448d63a63cd363f8", size = 251511, upload-time = "2025-10-06T14:49:46.021Z" }, - { url = "https://files.pythonhosted.org/packages/e3/9a/201bb1e17e7af53139597069c375e7b0dcbd47594604f65c2d5359508566/multidict-6.7.0-cp312-cp312-win32.whl", hash = "sha256:dd32a49400a2c3d52088e120ee00c1e3576cbff7e10b98467962c74fdb762ed4", size = 41895, upload-time = "2025-10-06T14:49:48.718Z" }, - { url = "https://files.pythonhosted.org/packages/46/e2/348cd32faad84eaf1d20cce80e2bb0ef8d312c55bca1f7fa9865e7770aaf/multidict-6.7.0-cp312-cp312-win_amd64.whl", hash = "sha256:92abb658ef2d7ef22ac9f8bb88e8b6c3e571671534e029359b6d9e845923eb1b", size = 46073, upload-time = "2025-10-06T14:49:50.28Z" }, - { url = "https://files.pythonhosted.org/packages/25/ec/aad2613c1910dce907480e0c3aa306905830f25df2e54ccc9dea450cb5aa/multidict-6.7.0-cp312-cp312-win_arm64.whl", hash = "sha256:490dab541a6a642ce1a9d61a4781656b346a55c13038f0b1244653828e3a83ec", size = 43226, upload-time = "2025-10-06T14:49:52.304Z" }, - { url = "https://files.pythonhosted.org/packages/b7/da/7d22601b625e241d4f23ef1ebff8acfc60da633c9e7e7922e24d10f592b3/multidict-6.7.0-py3-none-any.whl", hash = "sha256:394fc5c42a333c9ffc3e421a4c85e08580d990e08b99f6bf35b4132114c5dcb3", size = 12317, upload-time = "2025-10-06T14:52:29.272Z" }, + { url = "https://files.pythonhosted.org/packages/ce/f1/a90635c4f88fb913fbf4ce660b83b7445b7a02615bda034b2f8eb38fd597/multidict-6.7.1-cp311-cp311-macosx_10_9_universal2.whl", hash = "sha256:7ff981b266af91d7b4b3793ca3382e53229088d193a85dfad6f5f4c27fc73e5d", size = 76626, upload-time = "2026-01-26T02:43:26.485Z" }, + { url = "https://files.pythonhosted.org/packages/a6/9b/267e64eaf6fc637a15b35f5de31a566634a2740f97d8d094a69d34f524a4/multidict-6.7.1-cp311-cp311-macosx_10_9_x86_64.whl", hash = "sha256:844c5bca0b5444adb44a623fb0a1310c2f4cd41f402126bb269cd44c9b3f3e1e", size = 44706, upload-time = "2026-01-26T02:43:27.607Z" }, + { url = "https://files.pythonhosted.org/packages/dd/a4/d45caf2b97b035c57267791ecfaafbd59c68212004b3842830954bb4b02e/multidict-6.7.1-cp311-cp311-macosx_11_0_arm64.whl", hash = "sha256:f2a0a924d4c2e9afcd7ec64f9de35fcd96915149b2216e1cb2c10a56df483855", size = 44356, upload-time = "2026-01-26T02:43:28.661Z" }, + { url = "https://files.pythonhosted.org/packages/fd/d2/0a36c8473f0cbaeadd5db6c8b72d15bbceeec275807772bfcd059bef487d/multidict-6.7.1-cp311-cp311-manylinux1_i686.manylinux_2_28_i686.manylinux_2_5_i686.whl", hash = "sha256:8be1802715a8e892c784c0197c2ace276ea52702a0ede98b6310c8f255a5afb3", size = 244355, upload-time = "2026-01-26T02:43:31.165Z" }, + { url = "https://files.pythonhosted.org/packages/5d/16/8c65be997fd7dd311b7d39c7b6e71a0cb449bad093761481eccbbe4b42a2/multidict-6.7.1-cp311-cp311-manylinux2014_aarch64.manylinux_2_17_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:2e2d2ed645ea29f31c4c7ea1552fcfd7cb7ba656e1eafd4134a6620c9f5fdd9e", size = 246433, upload-time = "2026-01-26T02:43:32.581Z" }, + { url = "https://files.pythonhosted.org/packages/01/fb/4dbd7e848d2799c6a026ec88ad39cf2b8416aa167fcc903baa55ecaa045c/multidict-6.7.1-cp311-cp311-manylinux2014_armv7l.manylinux_2_17_armv7l.manylinux_2_31_armv7l.whl", hash = "sha256:95922cee9a778659e91db6497596435777bd25ed116701a4c034f8e46544955a", size = 225376, upload-time = "2026-01-26T02:43:34.417Z" }, + { url = "https://files.pythonhosted.org/packages/b6/8a/4a3a6341eac3830f6053062f8fbc9a9e54407c80755b3f05bc427295c2d0/multidict-6.7.1-cp311-cp311-manylinux2014_ppc64le.manylinux_2_17_ppc64le.manylinux_2_28_ppc64le.whl", hash = "sha256:6b83cabdc375ffaaa15edd97eb7c0c672ad788e2687004990074d7d6c9b140c8", size = 257365, upload-time = "2026-01-26T02:43:35.741Z" }, + { url = "https://files.pythonhosted.org/packages/f7/a2/dd575a69c1aa206e12d27d0770cdf9b92434b48a9ef0cd0d1afdecaa93c4/multidict-6.7.1-cp311-cp311-manylinux2014_s390x.manylinux_2_17_s390x.manylinux_2_28_s390x.whl", hash = "sha256:38fb49540705369bab8484db0689d86c0a33a0a9f2c1b197f506b71b4b6c19b0", size = 254747, upload-time = "2026-01-26T02:43:36.976Z" }, + { url = "https://files.pythonhosted.org/packages/5a/56/21b27c560c13822ed93133f08aa6372c53a8e067f11fbed37b4adcdac922/multidict-6.7.1-cp311-cp311-manylinux2014_x86_64.manylinux_2_17_x86_64.manylinux_2_28_x86_64.whl", hash = "sha256:439cbebd499f92e9aa6793016a8acaa161dfa749ae86d20960189f5398a19144", size = 246293, upload-time = "2026-01-26T02:43:38.258Z" }, + { url = "https://files.pythonhosted.org/packages/5a/a4/23466059dc3854763423d0ad6c0f3683a379d97673b1b89ec33826e46728/multidict-6.7.1-cp311-cp311-musllinux_1_2_aarch64.whl", hash = "sha256:6d3bc717b6fe763b8be3f2bee2701d3c8eb1b2a8ae9f60910f1b2860c82b6c49", size = 242962, upload-time = "2026-01-26T02:43:40.034Z" }, + { url = "https://files.pythonhosted.org/packages/1f/67/51dd754a3524d685958001e8fa20a0f5f90a6a856e0a9dcabff69be3dbb7/multidict-6.7.1-cp311-cp311-musllinux_1_2_armv7l.whl", hash = "sha256:619e5a1ac57986dbfec9f0b301d865dddf763696435e2962f6d9cf2fdff2bb71", size = 237360, upload-time = "2026-01-26T02:43:41.752Z" }, + { url = "https://files.pythonhosted.org/packages/64/3f/036dfc8c174934d4b55d86ff4f978e558b0e585cef70cfc1ad01adc6bf18/multidict-6.7.1-cp311-cp311-musllinux_1_2_i686.whl", hash = "sha256:0b38ebffd9be37c1170d33bc0f36f4f262e0a09bc1aac1c34c7aa51a7293f0b3", size = 245940, upload-time = "2026-01-26T02:43:43.042Z" }, + { url = "https://files.pythonhosted.org/packages/3d/20/6214d3c105928ebc353a1c644a6ef1408bc5794fcb4f170bb524a3c16311/multidict-6.7.1-cp311-cp311-musllinux_1_2_ppc64le.whl", hash = "sha256:10ae39c9cfe6adedcdb764f5e8411d4a92b055e35573a2eaa88d3323289ef93c", size = 253502, upload-time = "2026-01-26T02:43:44.371Z" }, + { url = "https://files.pythonhosted.org/packages/b1/e2/c653bc4ae1be70a0f836b82172d643fcf1dade042ba2676ab08ec08bff0f/multidict-6.7.1-cp311-cp311-musllinux_1_2_s390x.whl", hash = "sha256:25167cc263257660290fba06b9318d2026e3c910be240a146e1f66dd114af2b0", size = 247065, upload-time = "2026-01-26T02:43:45.745Z" }, + { url = "https://files.pythonhosted.org/packages/c8/11/a854b4154cd3bd8b1fd375e8a8ca9d73be37610c361543d56f764109509b/multidict-6.7.1-cp311-cp311-musllinux_1_2_x86_64.whl", hash = "sha256:128441d052254f42989ef98b7b6a6ecb1e6f708aa962c7984235316db59f50fa", size = 241870, upload-time = "2026-01-26T02:43:47.054Z" }, + { url = "https://files.pythonhosted.org/packages/13/bf/9676c0392309b5fdae322333d22a829715b570edb9baa8016a517b55b558/multidict-6.7.1-cp311-cp311-win32.whl", hash = "sha256:d62b7f64ffde3b99d06b707a280db04fb3855b55f5a06df387236051d0668f4a", size = 41302, upload-time = "2026-01-26T02:43:48.753Z" }, + { url = "https://files.pythonhosted.org/packages/c9/68/f16a3a8ba6f7b6dc92a1f19669c0810bd2c43fc5a02da13b1cbf8e253845/multidict-6.7.1-cp311-cp311-win_amd64.whl", hash = "sha256:bdbf9f3b332abd0cdb306e7c2113818ab1e922dc84b8f8fd06ec89ed2a19ab8b", size = 45981, upload-time = "2026-01-26T02:43:49.921Z" }, + { url = "https://files.pythonhosted.org/packages/ac/ad/9dd5305253fa00cd3c7555dbef69d5bf4133debc53b87ab8d6a44d411665/multidict-6.7.1-cp311-cp311-win_arm64.whl", hash = "sha256:b8c990b037d2fff2f4e33d3f21b9b531c5745b33a49a7d6dbe7a177266af44f6", size = 43159, upload-time = "2026-01-26T02:43:51.635Z" }, + { url = "https://files.pythonhosted.org/packages/8d/9c/f20e0e2cf80e4b2e4b1c365bf5fe104ee633c751a724246262db8f1a0b13/multidict-6.7.1-cp312-cp312-macosx_10_13_universal2.whl", hash = "sha256:a90f75c956e32891a4eda3639ce6dd86e87105271f43d43442a3aedf3cddf172", size = 76893, upload-time = "2026-01-26T02:43:52.754Z" }, + { url = "https://files.pythonhosted.org/packages/fe/cf/18ef143a81610136d3da8193da9d80bfe1cb548a1e2d1c775f26b23d024a/multidict-6.7.1-cp312-cp312-macosx_10_13_x86_64.whl", hash = "sha256:3fccb473e87eaa1382689053e4a4618e7ba7b9b9b8d6adf2027ee474597128cd", size = 45456, upload-time = "2026-01-26T02:43:53.893Z" }, + { url = "https://files.pythonhosted.org/packages/a9/65/1caac9d4cd32e8433908683446eebc953e82d22b03d10d41a5f0fefe991b/multidict-6.7.1-cp312-cp312-macosx_11_0_arm64.whl", hash = "sha256:b0fa96985700739c4c7853a43c0b3e169360d6855780021bfc6d0f1ce7c123e7", size = 43872, upload-time = "2026-01-26T02:43:55.041Z" }, + { url = "https://files.pythonhosted.org/packages/cf/3b/d6bd75dc4f3ff7c73766e04e705b00ed6dbbaccf670d9e05a12b006f5a21/multidict-6.7.1-cp312-cp312-manylinux1_i686.manylinux_2_28_i686.manylinux_2_5_i686.whl", hash = "sha256:cb2a55f408c3043e42b40cc8eecd575afa27b7e0b956dfb190de0f8499a57a53", size = 251018, upload-time = "2026-01-26T02:43:56.198Z" }, + { url = "https://files.pythonhosted.org/packages/fd/80/c959c5933adedb9ac15152e4067c702a808ea183a8b64cf8f31af8ad3155/multidict-6.7.1-cp312-cp312-manylinux2014_aarch64.manylinux_2_17_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:eb0ce7b2a32d09892b3dd6cc44877a0d02a33241fafca5f25c8b6b62374f8b75", size = 258883, upload-time = "2026-01-26T02:43:57.499Z" }, + { url = "https://files.pythonhosted.org/packages/86/85/7ed40adafea3d4f1c8b916e3b5cc3a8e07dfcdcb9cd72800f4ed3ca1b387/multidict-6.7.1-cp312-cp312-manylinux2014_armv7l.manylinux_2_17_armv7l.manylinux_2_31_armv7l.whl", hash = "sha256:c3a32d23520ee37bf327d1e1a656fec76a2edd5c038bf43eddfa0572ec49c60b", size = 242413, upload-time = "2026-01-26T02:43:58.755Z" }, + { url = "https://files.pythonhosted.org/packages/d2/57/b8565ff533e48595503c785f8361ff9a4fde4d67de25c207cd0ba3befd03/multidict-6.7.1-cp312-cp312-manylinux2014_ppc64le.manylinux_2_17_ppc64le.manylinux_2_28_ppc64le.whl", hash = "sha256:9c90fed18bffc0189ba814749fdcc102b536e83a9f738a9003e569acd540a733", size = 268404, upload-time = "2026-01-26T02:44:00.216Z" }, + { url = "https://files.pythonhosted.org/packages/e0/50/9810c5c29350f7258180dfdcb2e52783a0632862eb334c4896ac717cebcb/multidict-6.7.1-cp312-cp312-manylinux2014_s390x.manylinux_2_17_s390x.manylinux_2_28_s390x.whl", hash = "sha256:da62917e6076f512daccfbbde27f46fed1c98fee202f0559adec8ee0de67f71a", size = 269456, upload-time = "2026-01-26T02:44:02.202Z" }, + { url = "https://files.pythonhosted.org/packages/f3/8d/5e5be3ced1d12966fefb5c4ea3b2a5b480afcea36406559442c6e31d4a48/multidict-6.7.1-cp312-cp312-manylinux2014_x86_64.manylinux_2_17_x86_64.manylinux_2_28_x86_64.whl", hash = "sha256:bfde23ef6ed9db7eaee6c37dcec08524cb43903c60b285b172b6c094711b3961", size = 256322, upload-time = "2026-01-26T02:44:03.56Z" }, + { url = "https://files.pythonhosted.org/packages/31/6e/d8a26d81ac166a5592782d208dd90dfdc0a7a218adaa52b45a672b46c122/multidict-6.7.1-cp312-cp312-musllinux_1_2_aarch64.whl", hash = "sha256:3758692429e4e32f1ba0df23219cd0b4fc0a52f476726fff9337d1a57676a582", size = 253955, upload-time = "2026-01-26T02:44:04.845Z" }, + { url = "https://files.pythonhosted.org/packages/59/4c/7c672c8aad41534ba619bcd4ade7a0dc87ed6b8b5c06149b85d3dd03f0cd/multidict-6.7.1-cp312-cp312-musllinux_1_2_armv7l.whl", hash = "sha256:398c1478926eca669f2fd6a5856b6de9c0acf23a2cb59a14c0ba5844fa38077e", size = 251254, upload-time = "2026-01-26T02:44:06.133Z" }, + { url = "https://files.pythonhosted.org/packages/7b/bd/84c24de512cbafbdbc39439f74e967f19570ce7924e3007174a29c348916/multidict-6.7.1-cp312-cp312-musllinux_1_2_i686.whl", hash = "sha256:c102791b1c4f3ab36ce4101154549105a53dc828f016356b3e3bcae2e3a039d3", size = 252059, upload-time = "2026-01-26T02:44:07.518Z" }, + { url = "https://files.pythonhosted.org/packages/fa/ba/f5449385510825b73d01c2d4087bf6d2fccc20a2d42ac34df93191d3dd03/multidict-6.7.1-cp312-cp312-musllinux_1_2_ppc64le.whl", hash = "sha256:a088b62bd733e2ad12c50dad01b7d0166c30287c166e137433d3b410add807a6", size = 263588, upload-time = "2026-01-26T02:44:09.382Z" }, + { url = "https://files.pythonhosted.org/packages/d7/11/afc7c677f68f75c84a69fe37184f0f82fce13ce4b92f49f3db280b7e92b3/multidict-6.7.1-cp312-cp312-musllinux_1_2_s390x.whl", hash = "sha256:3d51ff4785d58d3f6c91bdbffcb5e1f7ddfda557727043aa20d20ec4f65e324a", size = 259642, upload-time = "2026-01-26T02:44:10.73Z" }, + { url = "https://files.pythonhosted.org/packages/2b/17/ebb9644da78c4ab36403739e0e6e0e30ebb135b9caf3440825001a0bddcb/multidict-6.7.1-cp312-cp312-musllinux_1_2_x86_64.whl", hash = "sha256:fc5907494fccf3e7d3f94f95c91d6336b092b5fc83811720fae5e2765890dfba", size = 251377, upload-time = "2026-01-26T02:44:12.042Z" }, + { url = "https://files.pythonhosted.org/packages/ca/a4/840f5b97339e27846c46307f2530a2805d9d537d8b8bd416af031cad7fa0/multidict-6.7.1-cp312-cp312-win32.whl", hash = "sha256:28ca5ce2fd9716631133d0e9a9b9a745ad7f60bac2bccafb56aa380fc0b6c511", size = 41887, upload-time = "2026-01-26T02:44:14.245Z" }, + { url = "https://files.pythonhosted.org/packages/80/31/0b2517913687895f5904325c2069d6a3b78f66cc641a86a2baf75a05dcbb/multidict-6.7.1-cp312-cp312-win_amd64.whl", hash = "sha256:fcee94dfbd638784645b066074b338bc9cc155d4b4bffa4adce1615c5a426c19", size = 46053, upload-time = "2026-01-26T02:44:15.371Z" }, + { url = "https://files.pythonhosted.org/packages/0c/5b/aba28e4ee4006ae4c7df8d327d31025d760ffa992ea23812a601d226e682/multidict-6.7.1-cp312-cp312-win_arm64.whl", hash = "sha256:ba0a9fb644d0c1a2194cf7ffb043bd852cea63a57f66fbd33959f7dae18517bf", size = 43307, upload-time = "2026-01-26T02:44:16.852Z" }, + { url = "https://files.pythonhosted.org/packages/81/08/7036c080d7117f28a4af526d794aab6a84463126db031b007717c1a6676e/multidict-6.7.1-py3-none-any.whl", hash = "sha256:55d97cc6dae627efa6a6e548885712d4864b81110ac76fa4e534c03819fa4a56", size = 12319, upload-time = "2026-01-26T02:46:44.004Z" }, ] [[package]] @@ -4071,7 +4080,7 @@ wheels = [ [[package]] name = "openai" -version = "2.15.0" +version = "2.16.0" source = { registry = "https://pypi.org/simple" } dependencies = [ { name = "anyio" }, @@ -4083,9 +4092,9 @@ dependencies = [ { name = "tqdm" }, { name = "typing-extensions" }, ] -sdist = { url = "https://files.pythonhosted.org/packages/94/f4/4690ecb5d70023ce6bfcfeabfe717020f654bde59a775058ec6ac4692463/openai-2.15.0.tar.gz", hash = "sha256:42eb8cbb407d84770633f31bf727d4ffb4138711c670565a41663d9439174fba", size = 627383, upload-time = "2026-01-09T22:10:08.603Z" } +sdist = { url = "https://files.pythonhosted.org/packages/b1/6c/e4c964fcf1d527fdf4739e7cc940c60075a4114d50d03871d5d5b1e13a88/openai-2.16.0.tar.gz", hash = "sha256:42eaa22ca0d8ded4367a77374104d7a2feafee5bd60a107c3c11b5243a11cd12", size = 629649, upload-time = "2026-01-27T23:28:02.579Z" } wheels = [ - { url = "https://files.pythonhosted.org/packages/b5/df/c306f7375d42bafb379934c2df4c2fa3964656c8c782bac75ee10c102818/openai-2.15.0-py3-none-any.whl", hash = "sha256:6ae23b932cd7230f7244e52954daa6602716d6b9bf235401a107af731baea6c3", size = 1067879, upload-time = "2026-01-09T22:10:06.446Z" }, + { url = "https://files.pythonhosted.org/packages/16/83/0315bf2cfd75a2ce8a7e54188e9456c60cec6c0cf66728ed07bd9859ff26/openai-2.16.0-py3-none-any.whl", hash = "sha256:5f46643a8f42899a84e80c38838135d7038e7718333ce61396994f887b09a59b", size = 1068612, upload-time = "2026-01-27T23:28:00.356Z" }, ] [[package]] @@ -4106,7 +4115,7 @@ wheels = [ [[package]] name = "openinference-instrumentation" -version = "0.1.42" +version = "0.1.43" source = { registry = "https://pypi.org/simple" } dependencies = [ { name = "openinference-semantic-conventions" }, @@ -4114,9 +4123,9 @@ dependencies = [ { name = "opentelemetry-sdk" }, { name = "wrapt" }, ] -sdist = { url = "https://files.pythonhosted.org/packages/00/d0/b19061a21fd6127d2857c77744a36073bba9c1502d1d5e8517b708eb8b7c/openinference_instrumentation-0.1.42.tar.gz", hash = "sha256:2275babc34022e151b5492cfba41d3b12e28377f8e08cb45e5d64fe2d9d7fe37", size = 23954, upload-time = "2025-11-05T01:37:46.869Z" } +sdist = { url = "https://files.pythonhosted.org/packages/09/65/f979c42c35406eed5568530bb779a5c34540a42af563bd9049392ecf050e/openinference_instrumentation-0.1.43.tar.gz", hash = "sha256:fa9e8c84f63bb579b48b3e4cea21c10fa5a78961a6db349057ebcd7a33b541dd", size = 23956, upload-time = "2026-01-26T09:10:28.32Z" } wheels = [ - { url = "https://files.pythonhosted.org/packages/c3/71/43ee4616fc95dbd2f560550f199c6652a5eb93f84e8aa0039bc95c19cfe0/openinference_instrumentation-0.1.42-py3-none-any.whl", hash = "sha256:e7521ff90833ef7cc65db526a2f59b76a496180abeaaee30ec6abbbc0b43f8ec", size = 30086, upload-time = "2025-11-05T01:37:43.866Z" }, + { url = "https://files.pythonhosted.org/packages/2a/a3/61cb41c04ce05fa86654edac1e6e2c037d1caa828a4bc5bc3cd7a656fb62/openinference_instrumentation-0.1.43-py3-none-any.whl", hash = "sha256:f8b13f39da15202a50823733b245bb296147bb417eb873000c891164c9e68935", size = 30089, upload-time = "2026-01-26T09:10:27.231Z" }, ] [[package]] @@ -4641,11 +4650,11 @@ wheels = [ [[package]] name = "pathspec" -version = "1.0.3" +version = "1.0.4" source = { registry = "https://pypi.org/simple" } -sdist = { url = "https://files.pythonhosted.org/packages/4c/b2/bb8e495d5262bfec41ab5cb18f522f1012933347fb5d9e62452d446baca2/pathspec-1.0.3.tar.gz", hash = "sha256:bac5cf97ae2c2876e2d25ebb15078eb04d76e4b98921ee31c6f85ade8b59444d", size = 130841, upload-time = "2026-01-09T15:46:46.009Z" } +sdist = { url = "https://files.pythonhosted.org/packages/fa/36/e27608899f9b8d4dff0617b2d9ab17ca5608956ca44461ac14ac48b44015/pathspec-1.0.4.tar.gz", hash = "sha256:0210e2ae8a21a9137c0d470578cb0e595af87edaa6ebf12ff176f14a02e0e645", size = 131200, upload-time = "2026-01-27T03:59:46.938Z" } wheels = [ - { url = "https://files.pythonhosted.org/packages/32/2b/121e912bd60eebd623f873fd090de0e84f322972ab25a7f9044c056804ed/pathspec-1.0.3-py3-none-any.whl", hash = "sha256:e80767021c1cc524aa3fb14bedda9c34406591343cc42797b386ce7b9354fb6c", size = 55021, upload-time = "2026-01-09T15:46:44.652Z" }, + { url = "https://files.pythonhosted.org/packages/ef/3c/2c197d226f9ea224a9ab8d197933f9da0ae0aac5b6e0f884e2b8d9c8e9f7/pathspec-1.0.4-py3-none-any.whl", hash = "sha256:fb6ae2fd4e7c921a165808a552060e722767cfa526f99ca5156ed2ce45a5c723", size = 55206, upload-time = "2026-01-27T03:59:45.137Z" }, ] [[package]] @@ -4807,7 +4816,7 @@ wheels = [ [[package]] name = "posthog" -version = "7.6.0" +version = "7.7.0" source = { registry = "https://pypi.org/simple" } dependencies = [ { name = "backoff" }, @@ -4817,9 +4826,9 @@ dependencies = [ { name = "six" }, { name = "typing-extensions" }, ] -sdist = { url = "https://files.pythonhosted.org/packages/9d/10/dcbe5d12ba5e62b2a9c9004a80117765468198c44ffef16d2b54f938bddf/posthog-7.6.0.tar.gz", hash = "sha256:941dfd278ee427c9b14640f09b35b5bb52a71bdf028d7dbb7307e1838fd3002e", size = 146194, upload-time = "2026-01-19T16:23:04.571Z" } +sdist = { url = "https://files.pythonhosted.org/packages/23/dd/ca6d5a79614af27ededc0dca85e77f42f7704e29f8314819d7ce92b9a7f3/posthog-7.7.0.tar.gz", hash = "sha256:b4f2b1a616e099961f6ab61a5a2f88de62082c26801699e556927d21c00737ef", size = 160766, upload-time = "2026-01-27T21:15:41.63Z" } wheels = [ - { url = "https://files.pythonhosted.org/packages/ae/f6/8d4a2d1b67368fec425f32911e2f3638d5ac9e8abfebc698ac426fcf65db/posthog-7.6.0-py3-none-any.whl", hash = "sha256:c4dd78cf77c4fecceb965f86066e5ac37886ef867d68ffe75a1db5d681d7d9ad", size = 168426, upload-time = "2026-01-19T16:23:02.71Z" }, + { url = "https://files.pythonhosted.org/packages/41/3f/41b426ed9ab161d630edec84bacb6664ae62b6e63af1165919c7e11c17d1/posthog-7.7.0-py3-none-any.whl", hash = "sha256:955f42097bf147459653b9102e5f7f9a22e4b6fc9f15003447bd1137fafbc505", size = 185353, upload-time = "2026-01-27T21:15:40.051Z" }, ] [[package]] @@ -4901,18 +4910,18 @@ wheels = [ [[package]] name = "psutil" -version = "7.2.1" +version = "7.2.2" source = { registry = "https://pypi.org/simple" } -sdist = { url = "https://files.pythonhosted.org/packages/73/cb/09e5184fb5fc0358d110fc3ca7f6b1d033800734d34cac10f4136cfac10e/psutil-7.2.1.tar.gz", hash = "sha256:f7583aec590485b43ca601dd9cea0dcd65bd7bb21d30ef4ddbf4ea6b5ed1bdd3", size = 490253, upload-time = "2025-12-29T08:26:00.169Z" } +sdist = { url = "https://files.pythonhosted.org/packages/aa/c6/d1ddf4abb55e93cebc4f2ed8b5d6dbad109ecb8d63748dd2b20ab5e57ebe/psutil-7.2.2.tar.gz", hash = "sha256:0746f5f8d406af344fd547f1c8daa5f5c33dbc293bb8d6a16d80b4bb88f59372", size = 493740, upload-time = "2026-01-28T18:14:54.428Z" } wheels = [ - { url = "https://files.pythonhosted.org/packages/c5/cf/5180eb8c8bdf6a503c6919f1da28328bd1e6b3b1b5b9d5b01ae64f019616/psutil-7.2.1-cp36-abi3-macosx_10_9_x86_64.whl", hash = "sha256:b2e953fcfaedcfbc952b44744f22d16575d3aa78eb4f51ae74165b4e96e55f42", size = 128137, upload-time = "2025-12-29T08:26:27.759Z" }, - { url = "https://files.pythonhosted.org/packages/c5/2c/78e4a789306a92ade5000da4f5de3255202c534acdadc3aac7b5458fadef/psutil-7.2.1-cp36-abi3-macosx_11_0_arm64.whl", hash = "sha256:05cc68dbb8c174828624062e73078e7e35406f4ca2d0866c272c2410d8ef06d1", size = 128947, upload-time = "2025-12-29T08:26:29.548Z" }, - { url = "https://files.pythonhosted.org/packages/29/f8/40e01c350ad9a2b3cb4e6adbcc8a83b17ee50dd5792102b6142385937db5/psutil-7.2.1-cp36-abi3-manylinux2010_x86_64.manylinux_2_12_x86_64.manylinux_2_28_x86_64.whl", hash = "sha256:5e38404ca2bb30ed7267a46c02f06ff842e92da3bb8c5bfdadbd35a5722314d8", size = 154694, upload-time = "2025-12-29T08:26:32.147Z" }, - { url = "https://files.pythonhosted.org/packages/06/e4/b751cdf839c011a9714a783f120e6a86b7494eb70044d7d81a25a5cd295f/psutil-7.2.1-cp36-abi3-manylinux2014_aarch64.manylinux_2_17_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:ab2b98c9fc19f13f59628d94df5cc4cc4844bc572467d113a8b517d634e362c6", size = 156136, upload-time = "2025-12-29T08:26:34.079Z" }, - { url = "https://files.pythonhosted.org/packages/44/ad/bbf6595a8134ee1e94a4487af3f132cef7fce43aef4a93b49912a48c3af7/psutil-7.2.1-cp36-abi3-musllinux_1_2_aarch64.whl", hash = "sha256:f78baafb38436d5a128f837fab2d92c276dfb48af01a240b861ae02b2413ada8", size = 148108, upload-time = "2025-12-29T08:26:36.225Z" }, - { url = "https://files.pythonhosted.org/packages/1c/15/dd6fd869753ce82ff64dcbc18356093471a5a5adf4f77ed1f805d473d859/psutil-7.2.1-cp36-abi3-musllinux_1_2_x86_64.whl", hash = "sha256:99a4cd17a5fdd1f3d014396502daa70b5ec21bf4ffe38393e152f8e449757d67", size = 147402, upload-time = "2025-12-29T08:26:39.21Z" }, - { url = "https://files.pythonhosted.org/packages/34/68/d9317542e3f2b180c4306e3f45d3c922d7e86d8ce39f941bb9e2e9d8599e/psutil-7.2.1-cp37-abi3-win_amd64.whl", hash = "sha256:b1b0671619343aa71c20ff9767eced0483e4fc9e1f489d50923738caf6a03c17", size = 136938, upload-time = "2025-12-29T08:26:41.036Z" }, - { url = "https://files.pythonhosted.org/packages/3e/73/2ce007f4198c80fcf2cb24c169884f833fe93fbc03d55d302627b094ee91/psutil-7.2.1-cp37-abi3-win_arm64.whl", hash = "sha256:0d67c1822c355aa6f7314d92018fb4268a76668a536f133599b91edd48759442", size = 133836, upload-time = "2025-12-29T08:26:43.086Z" }, + { url = "https://files.pythonhosted.org/packages/e7/36/5ee6e05c9bd427237b11b3937ad82bb8ad2752d72c6969314590dd0c2f6e/psutil-7.2.2-cp36-abi3-macosx_10_9_x86_64.whl", hash = "sha256:ed0cace939114f62738d808fdcecd4c869222507e266e574799e9c0faa17d486", size = 129090, upload-time = "2026-01-28T18:15:22.168Z" }, + { url = "https://files.pythonhosted.org/packages/80/c4/f5af4c1ca8c1eeb2e92ccca14ce8effdeec651d5ab6053c589b074eda6e1/psutil-7.2.2-cp36-abi3-macosx_11_0_arm64.whl", hash = "sha256:1a7b04c10f32cc88ab39cbf606e117fd74721c831c98a27dc04578deb0c16979", size = 129859, upload-time = "2026-01-28T18:15:23.795Z" }, + { url = "https://files.pythonhosted.org/packages/b5/70/5d8df3b09e25bce090399cf48e452d25c935ab72dad19406c77f4e828045/psutil-7.2.2-cp36-abi3-manylinux2010_x86_64.manylinux_2_12_x86_64.manylinux_2_28_x86_64.whl", hash = "sha256:076a2d2f923fd4821644f5ba89f059523da90dc9014e85f8e45a5774ca5bc6f9", size = 155560, upload-time = "2026-01-28T18:15:25.976Z" }, + { url = "https://files.pythonhosted.org/packages/63/65/37648c0c158dc222aba51c089eb3bdfa238e621674dc42d48706e639204f/psutil-7.2.2-cp36-abi3-manylinux2014_aarch64.manylinux_2_17_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:b0726cecd84f9474419d67252add4ac0cd9811b04d61123054b9fb6f57df6e9e", size = 156997, upload-time = "2026-01-28T18:15:27.794Z" }, + { url = "https://files.pythonhosted.org/packages/8e/13/125093eadae863ce03c6ffdbae9929430d116a246ef69866dad94da3bfbc/psutil-7.2.2-cp36-abi3-musllinux_1_2_aarch64.whl", hash = "sha256:fd04ef36b4a6d599bbdb225dd1d3f51e00105f6d48a28f006da7f9822f2606d8", size = 148972, upload-time = "2026-01-28T18:15:29.342Z" }, + { url = "https://files.pythonhosted.org/packages/04/78/0acd37ca84ce3ddffaa92ef0f571e073faa6d8ff1f0559ab1272188ea2be/psutil-7.2.2-cp36-abi3-musllinux_1_2_x86_64.whl", hash = "sha256:b58fabe35e80b264a4e3bb23e6b96f9e45a3df7fb7eed419ac0e5947c61e47cc", size = 148266, upload-time = "2026-01-28T18:15:31.597Z" }, + { url = "https://files.pythonhosted.org/packages/b4/90/e2159492b5426be0c1fef7acba807a03511f97c5f86b3caeda6ad92351a7/psutil-7.2.2-cp37-abi3-win_amd64.whl", hash = "sha256:eb7e81434c8d223ec4a219b5fc1c47d0417b12be7ea866e24fb5ad6e84b3d988", size = 137737, upload-time = "2026-01-28T18:15:33.849Z" }, + { url = "https://files.pythonhosted.org/packages/8c/c7/7bb2e321574b10df20cbde462a94e2b71d05f9bbda251ef27d104668306a/psutil-7.2.2-cp37-abi3-win_arm64.whl", hash = "sha256:8c233660f575a5a89e6d4cb65d9f938126312bca76d8fe087b947b3a1aaac9ee", size = 134617, upload-time = "2026-01-28T18:15:36.514Z" }, ] [[package]] @@ -5196,7 +5205,7 @@ wheels = [ [[package]] name = "pyobvector" -version = "0.2.22" +version = "0.2.23" source = { registry = "https://pypi.org/simple" } dependencies = [ { name = "aiomysql" }, @@ -5206,9 +5215,9 @@ dependencies = [ { name = "sqlalchemy" }, { name = "sqlglot" }, ] -sdist = { url = "https://files.pythonhosted.org/packages/30/b9/443d65757cdfb47d31ef4b9ed0609628ae468e52e57033051e1fad256c59/pyobvector-0.2.22.tar.gz", hash = "sha256:0bd4af46cfdfbc67e691d5b49f3b0662f702a7a42a7f7a240f1021af378e793c", size = 72706, upload-time = "2026-01-15T03:19:57.4Z" } +sdist = { url = "https://files.pythonhosted.org/packages/7f/14/ea82e5f70c335d2a253ae0a5f182f99abc0319511d565ec887c1d576cfb4/pyobvector-0.2.23.tar.gz", hash = "sha256:c575c84d7aef078d19f7ceeccb7240ea7371940e4e240214ed013b757fbe2b97", size = 73663, upload-time = "2026-01-29T09:29:37.197Z" } wheels = [ - { url = "https://files.pythonhosted.org/packages/e0/88/1583888a4ce85202d93fa03f2817681637465668e8b260ef1b9d5a39c3ca/pyobvector-0.2.22-py3-none-any.whl", hash = "sha256:4a0f5c094af7ca8242fdf9e5111e75544de0a9615491e9ec2f9d218dc909b509", size = 60627, upload-time = "2026-01-15T03:19:55.918Z" }, + { url = "https://files.pythonhosted.org/packages/4f/45/29100150b64ec6c2361f11da969bf0a25f33408bae1eba0054abe315922d/pyobvector-0.2.23-py3-none-any.whl", hash = "sha256:04973247f843cbfef548b9d07989190ffc64a56d49c88bf60b3220f0841b33d3", size = 60900, upload-time = "2026-01-29T09:29:35.727Z" }, ] [[package]] @@ -5371,6 +5380,19 @@ wheels = [ { url = "https://files.pythonhosted.org/packages/fa/b6/3127540ecdf1464a00e5a01ee60a1b09175f6913f0644ac748494d9c4b21/pytest_timeout-2.4.0-py3-none-any.whl", hash = "sha256:c42667e5cdadb151aeb5b26d114aff6bdf5a907f176a007a30b940d3d865b5c2", size = 14382, upload-time = "2025-05-05T19:44:33.502Z" }, ] +[[package]] +name = "pytest-xdist" +version = "3.8.0" +source = { registry = "https://pypi.org/simple" } +dependencies = [ + { name = "execnet" }, + { name = "pytest" }, +] +sdist = { url = "https://files.pythonhosted.org/packages/78/b4/439b179d1ff526791eb921115fca8e44e596a13efeda518b9d845a619450/pytest_xdist-3.8.0.tar.gz", hash = "sha256:7e578125ec9bc6050861aa93f2d59f1d8d085595d6551c2c90b6f4fad8d3a9f1", size = 88069, upload-time = "2025-07-01T13:30:59.346Z" } +wheels = [ + { url = "https://files.pythonhosted.org/packages/ca/31/d4e37e9e550c2b92a9cbc2e4d0b7420a27224968580b5a447f420847c975/pytest_xdist-3.8.0-py3-none-any.whl", hash = "sha256:202ca578cfeb7370784a8c33d6d05bc6e13b4f25b5053c30a152269fd10f0b88", size = 46396, upload-time = "2025-07-01T13:30:56.632Z" }, +] + [[package]] name = "python-calamine" version = "0.6.1" @@ -5819,15 +5841,15 @@ wheels = [ [[package]] name = "rich" -version = "14.3.0" +version = "14.3.1" source = { registry = "https://pypi.org/simple" } dependencies = [ { name = "markdown-it-py" }, { name = "pygments" }, ] -sdist = { url = "https://files.pythonhosted.org/packages/aa/9c/137848452e130e71f3ca9a9876751ddcac99e4b1f248ed297996c8c2d728/rich-14.3.0.tar.gz", hash = "sha256:b75e54d3abbcc49137e83e4db54dc86c5e47687eebc95aa0305363231a36e699", size = 230113, upload-time = "2026-01-24T12:25:46.336Z" } +sdist = { url = "https://files.pythonhosted.org/packages/a1/84/4831f881aa6ff3c976f6d6809b58cdfa350593ffc0dc3c58f5f6586780fb/rich-14.3.1.tar.gz", hash = "sha256:b8c5f568a3a749f9290ec6bddedf835cec33696bfc1e48bcfecb276c7386e4b8", size = 230125, upload-time = "2026-01-24T21:40:44.847Z" } wheels = [ - { url = "https://files.pythonhosted.org/packages/fa/e0/83cbdcb81b5cbbbe355648dd402b410437806544f48ee218a2354798f012/rich-14.3.0-py3-none-any.whl", hash = "sha256:0b8c1e368c1125b9e993c2d2f1342802525f4853fc6dac2e8e9e88bac0f45bce", size = 309950, upload-time = "2026-01-24T12:25:44.679Z" }, + { url = "https://files.pythonhosted.org/packages/87/2a/a1810c8627b9ec8c57ec5ec325d306701ae7be50235e8fd81266e002a3cc/rich-14.3.1-py3-none-any.whl", hash = "sha256:da750b1aebbff0b372557426fb3f35ba56de8ef954b3190315eb64076d6fb54e", size = 309952, upload-time = "2026-01-24T21:40:42.969Z" }, ] [[package]] @@ -6000,11 +6022,11 @@ flask = [ [[package]] name = "setuptools" -version = "80.10.1" +version = "80.10.2" source = { registry = "https://pypi.org/simple" } -sdist = { url = "https://files.pythonhosted.org/packages/86/ff/f75651350db3cf2ef767371307eb163f3cc1ac03e16fdf3ac347607f7edb/setuptools-80.10.1.tar.gz", hash = "sha256:bf2e513eb8144c3298a3bd28ab1a5edb739131ec5c22e045ff93cd7f5319703a", size = 1229650, upload-time = "2026-01-21T09:42:03.061Z" } +sdist = { url = "https://files.pythonhosted.org/packages/76/95/faf61eb8363f26aa7e1d762267a8d602a1b26d4f3a1e758e92cb3cb8b054/setuptools-80.10.2.tar.gz", hash = "sha256:8b0e9d10c784bf7d262c4e5ec5d4ec94127ce206e8738f29a437945fbc219b70", size = 1200343, upload-time = "2026-01-25T22:38:17.252Z" } wheels = [ - { url = "https://files.pythonhosted.org/packages/e0/76/f963c61683a39084aa575f98089253e1e852a4417cb8a3a8a422923a5246/setuptools-80.10.1-py3-none-any.whl", hash = "sha256:fc30c51cbcb8199a219c12cc9c281b5925a4978d212f84229c909636d9f6984e", size = 1099859, upload-time = "2026-01-21T09:42:00.688Z" }, + { url = "https://files.pythonhosted.org/packages/94/b8/f1f62a5e3c0ad2ff1d189590bfa4c46b4f3b6e49cef6f26c6ee4e575394d/setuptools-80.10.2-py3-none-any.whl", hash = "sha256:95b30ddfb717250edb492926c92b5221f7ef3fbcc2b07579bcd4a27da21d0173", size = 1064234, upload-time = "2026-01-25T22:38:15.216Z" }, ] [[package]] @@ -6488,26 +6510,26 @@ wheels = [ [[package]] name = "ty" -version = "0.0.13" +version = "0.0.14" source = { registry = "https://pypi.org/simple" } -sdist = { url = "https://files.pythonhosted.org/packages/5a/dc/b607f00916f5a7c52860b84a66dc17bc6988e8445e96b1d6e175a3837397/ty-0.0.13.tar.gz", hash = "sha256:7a1d135a400ca076407ea30012d1f75419634160ed3b9cad96607bf2956b23b3", size = 4999183, upload-time = "2026-01-21T13:21:16.133Z" } +sdist = { url = "https://files.pythonhosted.org/packages/af/57/22c3d6bf95c2229120c49ffc2f0da8d9e8823755a1c3194da56e51f1cc31/ty-0.0.14.tar.gz", hash = "sha256:a691010565f59dd7f15cf324cdcd1d9065e010c77a04f887e1ea070ba34a7de2", size = 5036573, upload-time = "2026-01-27T00:57:31.427Z" } wheels = [ - { url = "https://files.pythonhosted.org/packages/1a/df/3632f1918f4c0a33184f107efc5d436ab6da147fd3d3b94b3af6461efbf4/ty-0.0.13-py3-none-linux_armv6l.whl", hash = "sha256:1b2b8e02697c3a94c722957d712a0615bcc317c9b9497be116ef746615d892f2", size = 9993501, upload-time = "2026-01-21T13:21:26.628Z" }, - { url = "https://files.pythonhosted.org/packages/92/87/6a473ced5ac280c6ce5b1627c71a8a695c64481b99aabc798718376a441e/ty-0.0.13-py3-none-macosx_10_12_x86_64.whl", hash = "sha256:f15cdb8e233e2b5adfce673bb21f4c5e8eaf3334842f7eea3c70ac6fda8c1de5", size = 9860986, upload-time = "2026-01-21T13:21:24.425Z" }, - { url = "https://files.pythonhosted.org/packages/5d/9b/d89ae375cf0a7cd9360e1164ce017f8c753759be63b6a11ed4c944abe8c6/ty-0.0.13-py3-none-macosx_11_0_arm64.whl", hash = "sha256:0819e89ac9f0d8af7a062837ce197f0461fee2fc14fd07e2c368780d3a397b73", size = 9350748, upload-time = "2026-01-21T13:21:28.502Z" }, - { url = "https://files.pythonhosted.org/packages/a8/a6/9ad58518056fab344b20c0bb2c1911936ebe195318e8acc3bc45ac1c6b6b/ty-0.0.13-py3-none-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:1de79f481084b7cc7a202ba0d7a75e10970d10ffa4f025b23f2e6b7324b74886", size = 9849884, upload-time = "2026-01-21T13:21:21.886Z" }, - { url = "https://files.pythonhosted.org/packages/b1/c3/8add69095fa179f523d9e9afcc15a00818af0a37f2b237a9b59bc0046c34/ty-0.0.13-py3-none-manylinux_2_17_armv7l.manylinux2014_armv7l.whl", hash = "sha256:4fb2154cff7c6e95d46bfaba283c60642616f20d73e5f96d0c89c269f3e1bcec", size = 9822975, upload-time = "2026-01-21T13:21:14.292Z" }, - { url = "https://files.pythonhosted.org/packages/a4/05/4c0927c68a0a6d43fb02f3f0b6c19c64e3461dc8ed6c404dde0efb8058f7/ty-0.0.13-py3-none-manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:00be58d89337c27968a20d58ca553458608c5b634170e2bec82824c2e4cf4d96", size = 10294045, upload-time = "2026-01-21T13:21:30.505Z" }, - { url = "https://files.pythonhosted.org/packages/b4/86/6dc190838aba967557fe0bfd494c595d00b5081315a98aaf60c0e632aaeb/ty-0.0.13-py3-none-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:72435eade1fa58c6218abb4340f43a6c3ff856ae2dc5722a247d3a6dd32e9737", size = 10916460, upload-time = "2026-01-21T13:21:07.788Z" }, - { url = "https://files.pythonhosted.org/packages/04/40/9ead96b7c122e1109dfcd11671184c3506996bf6a649306ec427e81d9544/ty-0.0.13-py3-none-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:77a548742ee8f621d718159e7027c3b555051d096a49bb580249a6c5fc86c271", size = 10597154, upload-time = "2026-01-21T13:21:18.064Z" }, - { url = "https://files.pythonhosted.org/packages/aa/7d/e832a2c081d2be845dc6972d0c7998914d168ccbc0b9c86794419ab7376e/ty-0.0.13-py3-none-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:da067c57c289b7cf914669704b552b6207c2cc7f50da4118c3e12388642e6b3f", size = 10410710, upload-time = "2026-01-21T13:21:12.388Z" }, - { url = "https://files.pythonhosted.org/packages/31/e3/898be3a96237a32f05c4c29b43594dc3b46e0eedfe8243058e46153b324f/ty-0.0.13-py3-none-musllinux_1_2_aarch64.whl", hash = "sha256:d1b50a01fffa140417fca5a24b658fbe0734074a095d5b6f0552484724474343", size = 9826299, upload-time = "2026-01-21T13:21:00.845Z" }, - { url = "https://files.pythonhosted.org/packages/bb/eb/db2d852ce0ed742505ff18ee10d7d252f3acfd6fc60eca7e9c7a0288a6d8/ty-0.0.13-py3-none-musllinux_1_2_armv7l.whl", hash = "sha256:0f33c46f52e5e9378378eca0d8059f026f3c8073ace02f7f2e8d079ddfe5207e", size = 9831610, upload-time = "2026-01-21T13:21:05.842Z" }, - { url = "https://files.pythonhosted.org/packages/9e/61/149f59c8abaddcbcbb0bd13b89c7741ae1c637823c5cf92ed2c644fcadef/ty-0.0.13-py3-none-musllinux_1_2_i686.whl", hash = "sha256:168eda24d9a0b202cf3758c2962cc295878842042b7eca9ed2965259f59ce9f2", size = 9978885, upload-time = "2026-01-21T13:21:10.306Z" }, - { url = "https://files.pythonhosted.org/packages/a0/cd/026d4e4af60a80918a8d73d2c42b8262dd43ab2fa7b28d9743004cb88d57/ty-0.0.13-py3-none-musllinux_1_2_x86_64.whl", hash = "sha256:d4917678b95dc8cb399cc459fab568ba8d5f0f33b7a94bf840d9733043c43f29", size = 10506453, upload-time = "2026-01-21T13:20:56.633Z" }, - { url = "https://files.pythonhosted.org/packages/63/06/8932833a4eca2df49c997a29afb26721612de8078ae79074c8fe87e17516/ty-0.0.13-py3-none-win32.whl", hash = "sha256:c1f2ec40daa405508b053e5b8e440fbae5fdb85c69c9ab0ee078f8bc00eeec3d", size = 9433482, upload-time = "2026-01-21T13:20:58.717Z" }, - { url = "https://files.pythonhosted.org/packages/aa/fd/e8d972d1a69df25c2cecb20ea50e49ad5f27a06f55f1f5f399a563e71645/ty-0.0.13-py3-none-win_amd64.whl", hash = "sha256:8b7b1ab9f187affbceff89d51076038363b14113be29bda2ddfa17116de1d476", size = 10319156, upload-time = "2026-01-21T13:21:03.266Z" }, - { url = "https://files.pythonhosted.org/packages/2d/c2/05fdd64ac003a560d4fbd1faa7d9a31d75df8f901675e5bed1ee2ceeff87/ty-0.0.13-py3-none-win_arm64.whl", hash = "sha256:1c9630333497c77bb9bcabba42971b96ee1f36c601dd3dcac66b4134f9fa38f0", size = 9808316, upload-time = "2026-01-21T13:20:54.053Z" }, + { url = "https://files.pythonhosted.org/packages/99/cb/cc6d1d8de59beb17a41f9a614585f884ec2d95450306c173b3b7cc090d2e/ty-0.0.14-py3-none-linux_armv6l.whl", hash = "sha256:32cf2a7596e693094621d3ae568d7ee16707dce28c34d1762947874060fdddaa", size = 10034228, upload-time = "2026-01-27T00:57:53.133Z" }, + { url = "https://files.pythonhosted.org/packages/f3/96/dd42816a2075a8f31542296ae687483a8d047f86a6538dfba573223eaf9a/ty-0.0.14-py3-none-macosx_10_12_x86_64.whl", hash = "sha256:f971bf9805f49ce8c0968ad53e29624d80b970b9eb597b7cbaba25d8a18ce9a2", size = 9939162, upload-time = "2026-01-27T00:57:43.857Z" }, + { url = "https://files.pythonhosted.org/packages/ff/b4/73c4859004e0f0a9eead9ecb67021438b2e8e5fdd8d03e7f5aca77623992/ty-0.0.14-py3-none-macosx_11_0_arm64.whl", hash = "sha256:45448b9e4806423523268bc15e9208c4f3f2ead7c344f615549d2e2354d6e924", size = 9418661, upload-time = "2026-01-27T00:58:03.411Z" }, + { url = "https://files.pythonhosted.org/packages/58/35/839c4551b94613db4afa20ee555dd4f33bfa7352d5da74c5fa416ffa0fd2/ty-0.0.14-py3-none-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:ee94a9b747ff40114085206bdb3205a631ef19a4d3fb89e302a88754cbbae54c", size = 9837872, upload-time = "2026-01-27T00:57:23.718Z" }, + { url = "https://files.pythonhosted.org/packages/41/2b/bbecf7e2faa20c04bebd35fc478668953ca50ee5847ce23e08acf20ea119/ty-0.0.14-py3-none-manylinux_2_17_armv7l.manylinux2014_armv7l.whl", hash = "sha256:6756715a3c33182e9ab8ffca2bb314d3c99b9c410b171736e145773ee0ae41c3", size = 9848819, upload-time = "2026-01-27T00:57:58.501Z" }, + { url = "https://files.pythonhosted.org/packages/be/60/3c0ba0f19c0f647ad9d2b5b5ac68c0f0b4dc899001bd53b3a7537fb247a2/ty-0.0.14-py3-none-manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:89d0038a2f698ba8b6fec5cf216a4e44e2f95e4a5095a8c0f57fe549f87087c2", size = 10324371, upload-time = "2026-01-27T00:57:29.291Z" }, + { url = "https://files.pythonhosted.org/packages/24/32/99d0a0b37d0397b0a989ffc2682493286aa3bc252b24004a6714368c2c3d/ty-0.0.14-py3-none-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:2c64a83a2d669b77f50a4957039ca1450626fb474619f18f6f8a3eb885bf7544", size = 10865898, upload-time = "2026-01-27T00:57:33.542Z" }, + { url = "https://files.pythonhosted.org/packages/1a/88/30b583a9e0311bb474269cfa91db53350557ebec09002bfc3fb3fc364e8c/ty-0.0.14-py3-none-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:242488bfb547ef080199f6fd81369ab9cb638a778bb161511d091ffd49c12129", size = 10555777, upload-time = "2026-01-27T00:58:05.853Z" }, + { url = "https://files.pythonhosted.org/packages/cd/a2/cb53fb6325dcf3d40f2b1d0457a25d55bfbae633c8e337bde8ec01a190eb/ty-0.0.14-py3-none-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:4790c3866f6c83a4f424fc7d09ebdb225c1f1131647ba8bdc6fcdc28f09ed0ff", size = 10412913, upload-time = "2026-01-27T00:57:38.834Z" }, + { url = "https://files.pythonhosted.org/packages/42/8f/f2f5202d725ed1e6a4e5ffaa32b190a1fe70c0b1a2503d38515da4130b4c/ty-0.0.14-py3-none-musllinux_1_2_aarch64.whl", hash = "sha256:950f320437f96d4ea9a2332bbfb5b68f1c1acd269ebfa4c09b6970cc1565bd9d", size = 9837608, upload-time = "2026-01-27T00:57:55.898Z" }, + { url = "https://files.pythonhosted.org/packages/f7/ba/59a2a0521640c489dafa2c546ae1f8465f92956fede18660653cce73b4c5/ty-0.0.14-py3-none-musllinux_1_2_armv7l.whl", hash = "sha256:4a0ec3ee70d83887f86925bbc1c56f4628bd58a0f47f6f32ddfe04e1f05466df", size = 9884324, upload-time = "2026-01-27T00:57:46.786Z" }, + { url = "https://files.pythonhosted.org/packages/03/95/8d2a49880f47b638743212f011088552ecc454dd7a665ddcbdabea25772a/ty-0.0.14-py3-none-musllinux_1_2_i686.whl", hash = "sha256:a1a4e6b6da0c58b34415955279eff754d6206b35af56a18bb70eb519d8d139ef", size = 10033537, upload-time = "2026-01-27T00:58:01.149Z" }, + { url = "https://files.pythonhosted.org/packages/e9/40/4523b36f2ce69f92ccf783855a9e0ebbbd0f0bb5cdce6211ee1737159ed3/ty-0.0.14-py3-none-musllinux_1_2_x86_64.whl", hash = "sha256:dc04384e874c5de4c5d743369c277c8aa73d1edea3c7fc646b2064b637db4db3", size = 10495910, upload-time = "2026-01-27T00:57:26.691Z" }, + { url = "https://files.pythonhosted.org/packages/08/d5/655beb51224d1bfd4f9ddc0bb209659bfe71ff141bcf05c418ab670698f0/ty-0.0.14-py3-none-win32.whl", hash = "sha256:b20e22cf54c66b3e37e87377635da412d9a552c9bf4ad9fc449fed8b2e19dad2", size = 9507626, upload-time = "2026-01-27T00:57:41.43Z" }, + { url = "https://files.pythonhosted.org/packages/b6/d9/c569c9961760e20e0a4bc008eeb1415754564304fd53997a371b7cf3f864/ty-0.0.14-py3-none-win_amd64.whl", hash = "sha256:e312ff9475522d1a33186657fe74d1ec98e4a13e016d66f5758a452c90ff6409", size = 10437980, upload-time = "2026-01-27T00:57:36.422Z" }, + { url = "https://files.pythonhosted.org/packages/ad/0c/186829654f5bfd9a028f6648e9caeb11271960a61de97484627d24443f91/ty-0.0.14-py3-none-win_arm64.whl", hash = "sha256:b6facdbe9b740cb2c15293a1d178e22ffc600653646452632541d01c36d5e378", size = 9885831, upload-time = "2026-01-27T00:57:49.747Z" }, ] [[package]] @@ -6739,11 +6761,11 @@ wheels = [ [[package]] name = "types-pexpect" -version = "4.9.0.20250916" +version = "4.9.0.20260127" source = { registry = "https://pypi.org/simple" } -sdist = { url = "https://files.pythonhosted.org/packages/0c/e6/cc43e306dc7de14ec7861c24ac4957f688741ae39ae685049695d796b587/types_pexpect-4.9.0.20250916.tar.gz", hash = "sha256:69e5fed6199687a730a572de780a5749248a4c5df2ff1521e194563475c9928d", size = 13322, upload-time = "2025-09-16T02:49:25.61Z" } +sdist = { url = "https://files.pythonhosted.org/packages/2e/32/7e03a07e16f79a404d6200ed6bdfcc320d0fb833436a5c6895a1403dedb7/types_pexpect-4.9.0.20260127.tar.gz", hash = "sha256:f8d43efc24251a8e533c71ea9be03d19bb5d08af096d561611697af9720cba7f", size = 13461, upload-time = "2026-01-27T03:28:30.923Z" } wheels = [ - { url = "https://files.pythonhosted.org/packages/aa/6d/7740e235a9fb2570968da7d386d7feb511ce68cd23472402ff8cdf7fc78f/types_pexpect-4.9.0.20250916-py3-none-any.whl", hash = "sha256:7fa43cb96042ac58bc74f7c28e5d85782be0ee01344149886849e9d90936fe8a", size = 17057, upload-time = "2025-09-16T02:49:24.546Z" }, + { url = "https://files.pythonhosted.org/packages/8a/d9/7ac5c9aa5a89a1a64cd835ae348227f4939406d826e461b85b690a8ba1c2/types_pexpect-4.9.0.20260127-py3-none-any.whl", hash = "sha256:69216c0ebf0fe45ad2900823133959b027e9471e24fc3f2e4c7b00605555da5f", size = 17078, upload-time = "2026-01-27T03:28:29.848Z" }, ] [[package]] @@ -7331,11 +7353,11 @@ wheels = [ [[package]] name = "wcwidth" -version = "0.3.2" +version = "0.5.0" source = { registry = "https://pypi.org/simple" } -sdist = { url = "https://files.pythonhosted.org/packages/05/07/0b5bcc9812b1b2fd331cc88289ef4d47d428afdbbf0216bb7d53942d93d6/wcwidth-0.3.2.tar.gz", hash = "sha256:d469b3059dab6b1077def5923ed0a8bf5738bd4a1a87f686d5e2de455354c4ad", size = 233633, upload-time = "2026-01-23T21:08:52.451Z" } +sdist = { url = "https://files.pythonhosted.org/packages/64/6e/62daec357285b927e82263a81f3b4c1790215bc77c42530ce4a69d501a43/wcwidth-0.5.0.tar.gz", hash = "sha256:f89c103c949a693bf563377b2153082bf58e309919dfb7f27b04d862a0089333", size = 246585, upload-time = "2026-01-27T01:31:44.942Z" } wheels = [ - { url = "https://files.pythonhosted.org/packages/72/c6/1452e716c5af065c018f75d42ca97517a04ac6aae4133722e0424649a07c/wcwidth-0.3.2-py3-none-any.whl", hash = "sha256:817abc6a89e47242a349b5d100cbd244301690d6d8d2ec6335f26fe6640a6315", size = 86280, upload-time = "2026-01-23T21:08:51.362Z" }, + { url = "https://files.pythonhosted.org/packages/f2/3e/45583b67c2ff08ad5a582d316fcb2f11d6cf0a50c7707ac09d212d25bc98/wcwidth-0.5.0-py3-none-any.whl", hash = "sha256:1efe1361b83b0ff7877b81ba57c8562c99cf812158b778988ce17ec061095695", size = 93772, upload-time = "2026-01-27T01:31:43.432Z" }, ] [[package]] diff --git a/dev/pytest/pytest_unit_tests.sh b/dev/pytest/pytest_unit_tests.sh index 496cb40952..7c39a48bf4 100755 --- a/dev/pytest/pytest_unit_tests.sh +++ b/dev/pytest/pytest_unit_tests.sh @@ -5,6 +5,12 @@ SCRIPT_DIR="$(dirname "$(realpath "$0")")" cd "$SCRIPT_DIR/../.." PYTEST_TIMEOUT="${PYTEST_TIMEOUT:-20}" +PYTEST_XDIST_ARGS="${PYTEST_XDIST_ARGS:--n auto}" -# libs -pytest --timeout "${PYTEST_TIMEOUT}" api/tests/unit_tests +# Run most tests in parallel (excluding controllers which have import conflicts with xdist) +# Controller tests have module-level side effects (Flask route registration) that cause +# race conditions when imported concurrently by multiple pytest-xdist workers. +pytest --timeout "${PYTEST_TIMEOUT}" ${PYTEST_XDIST_ARGS} api/tests/unit_tests --ignore=api/tests/unit_tests/controllers + +# Run controller tests sequentially to avoid import race conditions +pytest --timeout "${PYTEST_TIMEOUT}" api/tests/unit_tests/controllers diff --git a/docker/.env.example b/docker/.env.example index c000e6108d..10cd4c1414 100644 --- a/docker/.env.example +++ b/docker/.env.example @@ -1388,6 +1388,7 @@ PLUGIN_DAEMON_PORT=5002 PLUGIN_DAEMON_KEY=lYkiYYT6owG+71oLerGzA7GXCgOT++6ovaezWAjpCjf+Sjc3ZtU+qUEi PLUGIN_DAEMON_URL=http://plugin_daemon:5002 PLUGIN_MAX_PACKAGE_SIZE=52428800 +PLUGIN_MODEL_SCHEMA_CACHE_TTL=3600 PLUGIN_PPROF_ENABLED=false PLUGIN_DEBUGGING_HOST=0.0.0.0 diff --git a/docker/docker-compose.yaml b/docker/docker-compose.yaml index 5fcd3afedf..7f08b39e5c 100644 --- a/docker/docker-compose.yaml +++ b/docker/docker-compose.yaml @@ -591,6 +591,7 @@ x-shared-env: &shared-api-worker-env PLUGIN_DAEMON_KEY: ${PLUGIN_DAEMON_KEY:-lYkiYYT6owG+71oLerGzA7GXCgOT++6ovaezWAjpCjf+Sjc3ZtU+qUEi} PLUGIN_DAEMON_URL: ${PLUGIN_DAEMON_URL:-http://plugin_daemon:5002} PLUGIN_MAX_PACKAGE_SIZE: ${PLUGIN_MAX_PACKAGE_SIZE:-52428800} + PLUGIN_MODEL_SCHEMA_CACHE_TTL: ${PLUGIN_MODEL_SCHEMA_CACHE_TTL:-3600} PLUGIN_PPROF_ENABLED: ${PLUGIN_PPROF_ENABLED:-false} PLUGIN_DEBUGGING_HOST: ${PLUGIN_DEBUGGING_HOST:-0.0.0.0} PLUGIN_DEBUGGING_PORT: ${PLUGIN_DEBUGGING_PORT:-5003} diff --git a/web/AGENTS.md b/web/AGENTS.md index 7362cd51db..5dd41b8a3c 100644 --- a/web/AGENTS.md +++ b/web/AGENTS.md @@ -1,5 +1,9 @@ +## Frontend Workflow + +- Refer to the `./docs/test.md` and `./docs/lint.md` for detailed frontend workflow instructions. + ## Automated Test Generation -- Use `web/testing/testing.md` as the canonical instruction set for generating frontend automated tests. +- Use `./docs/test.md` as the canonical instruction set for generating frontend automated tests. - When proposing or saving tests, re-read that document and follow every requirement. - All frontend tests MUST also comply with the `frontend-testing` skill. Treat the skill as a mandatory constraint, not optional guidance. diff --git a/web/README.md b/web/README.md index aa3a04f1b4..a95ca2d49c 100644 --- a/web/README.md +++ b/web/README.md @@ -109,6 +109,8 @@ Open [http://localhost:6006](http://localhost:6006) with your browser to see the If your IDE is VSCode, rename `web/.vscode/settings.example.json` to `web/.vscode/settings.json` for lint code setting. +Then follow the [Lint Documentation](./docs/lint.md) to lint the code. + ## Test We use [Vitest](https://vitest.dev/) and [React Testing Library](https://testing-library.com/docs/react-testing-library/intro/) for Unit Testing. diff --git a/web/app/components/app-sidebar/dataset-info/dropdown.tsx b/web/app/components/app-sidebar/dataset-info/dropdown.tsx index 4d7c832e04..96127c4210 100644 --- a/web/app/components/app-sidebar/dataset-info/dropdown.tsx +++ b/web/app/components/app-sidebar/dataset-info/dropdown.tsx @@ -11,6 +11,7 @@ import { datasetDetailQueryKeyPrefix, useInvalidDatasetList } from '@/service/kn import { useInvalid } from '@/service/use-base' import { useExportPipelineDSL } from '@/service/use-pipeline' import { cn } from '@/utils/classnames' +import { downloadBlob } from '@/utils/download' import ActionButton from '../../base/action-button' import Confirm from '../../base/confirm' import { PortalToFollowElem, PortalToFollowElemContent, PortalToFollowElemTrigger } from '../../base/portal-to-follow-elem' @@ -64,13 +65,8 @@ const DropDown = ({ pipelineId: pipeline_id, include, }) - const a = document.createElement('a') const file = new Blob([data], { type: 'application/yaml' }) - const url = URL.createObjectURL(file) - a.href = url - a.download = `${name}.pipeline` - a.click() - URL.revokeObjectURL(url) + downloadBlob({ data: file, fileName: `${name}.pipeline` }) } catch { Toast.notify({ type: 'error', message: t('exportFailed', { ns: 'app' }) }) diff --git a/web/app/components/app-sidebar/toggle-button.tsx b/web/app/components/app-sidebar/toggle-button.tsx index a6bdee4f78..cbfbeee452 100644 --- a/web/app/components/app-sidebar/toggle-button.tsx +++ b/web/app/components/app-sidebar/toggle-button.tsx @@ -4,7 +4,7 @@ import { useTranslation } from 'react-i18next' import { cn } from '@/utils/classnames' import Button from '../base/button' import Tooltip from '../base/tooltip' -import { getKeyboardKeyNameBySystem } from '../workflow/utils' +import ShortcutsName from '../workflow/shortcuts-name' type TooltipContentProps = { expand: boolean @@ -20,18 +20,7 @@ const TooltipContent = ({ return (
{expand ? t('sidebar.collapseSidebar', { ns: 'layout' }) : t('sidebar.expandSidebar', { ns: 'layout' })} -
- { - TOGGLE_SHORTCUT.map(key => ( - - {getKeyboardKeyNameBySystem(key)} - - )) - } -
+
) } diff --git a/web/app/components/app/annotation/header-opts/index.tsx b/web/app/components/app/annotation/header-opts/index.tsx index 5add1aed32..4fc1e26007 100644 --- a/web/app/components/app/annotation/header-opts/index.tsx +++ b/web/app/components/app/annotation/header-opts/index.tsx @@ -21,6 +21,7 @@ import { LanguagesSupported } from '@/i18n-config/language' import { clearAllAnnotations, fetchExportAnnotationList } from '@/service/annotation' import { cn } from '@/utils/classnames' +import { downloadBlob } from '@/utils/download' import Button from '../../../base/button' import AddAnnotationModal from '../add-annotation-modal' import BatchAddModal from '../batch-add-annotation-modal' @@ -56,28 +57,23 @@ const HeaderOptions: FC = ({ ) const JSONLOutput = () => { - const a = document.createElement('a') const content = listTransformer(list).join('\n') const file = new Blob([content], { type: 'application/jsonl' }) - const url = URL.createObjectURL(file) - a.href = url - a.download = `annotations-${locale}.jsonl` - a.click() - URL.revokeObjectURL(url) + downloadBlob({ data: file, fileName: `annotations-${locale}.jsonl` }) } - const fetchList = async () => { + const fetchList = React.useCallback(async () => { const { data }: any = await fetchExportAnnotationList(appId) setList(data as AnnotationItemBasic[]) - } + }, [appId]) useEffect(() => { fetchList() - }, []) + }, [fetchList]) useEffect(() => { if (controlUpdateList) fetchList() - }, [controlUpdateList]) + }, [controlUpdateList, fetchList]) const [showBulkImportModal, setShowBulkImportModal] = useState(false) const [showClearConfirm, setShowClearConfirm] = useState(false) diff --git a/web/app/components/app/app-publisher/index.tsx b/web/app/components/app/app-publisher/index.tsx index 06bbad24a4..ab7f442ebf 100644 --- a/web/app/components/app/app-publisher/index.tsx +++ b/web/app/components/app/app-publisher/index.tsx @@ -57,7 +57,8 @@ import Divider from '../../base/divider' import Loading from '../../base/loading' import Toast from '../../base/toast' import Tooltip from '../../base/tooltip' -import { getKeyboardKeyCodeBySystem, getKeyboardKeyNameBySystem } from '../../workflow/utils' +import ShortcutsName from '../../workflow/shortcuts-name' +import { getKeyboardKeyCodeBySystem } from '../../workflow/utils' import AccessControl from '../app-access-control' import PublishWithMultipleModel from './publish-with-multiple-model' import SuggestedAction from './suggested-action' @@ -410,13 +411,7 @@ const AppPublisher = ({ : (
{t('common.publishUpdate', { ns: 'workflow' })} -
- {PUBLISH_SHORTCUT.map(key => ( - - {getKeyboardKeyNameBySystem(key)} - - ))} -
+
) } diff --git a/web/app/components/app/configuration/config-var/index.spec.tsx b/web/app/components/app/configuration/config-var/index.spec.tsx index b5015ed079..490d7b4410 100644 --- a/web/app/components/app/configuration/config-var/index.spec.tsx +++ b/web/app/components/app/configuration/config-var/index.spec.tsx @@ -2,7 +2,7 @@ import type { ReactNode } from 'react' import type { IConfigVarProps } from './index' import type { ExternalDataTool } from '@/models/common' import type { PromptVariable } from '@/models/debug' -import { act, fireEvent, render, screen } from '@testing-library/react' +import { act, fireEvent, render, screen, waitFor } from '@testing-library/react' import * as React from 'react' import { vi } from 'vitest' import Toast from '@/app/components/base/toast' @@ -240,7 +240,9 @@ describe('ConfigVar', () => { const saveButton = await screen.findByRole('button', { name: 'common.operation.save' }) fireEvent.click(saveButton) - expect(onPromptVariablesChange).toHaveBeenCalledTimes(1) + await waitFor(() => { + expect(onPromptVariablesChange).toHaveBeenCalledTimes(1) + }) }) it('should show error when variable key is duplicated', async () => { diff --git a/web/app/components/app/create-app-modal/index.tsx b/web/app/components/app/create-app-modal/index.tsx index 31a0762d27..47008c9f74 100644 --- a/web/app/components/app/create-app-modal/index.tsx +++ b/web/app/components/app/create-app-modal/index.tsx @@ -2,8 +2,7 @@ import type { AppIconSelection } from '../../base/app-icon-picker' import type { RuntimeMode } from '@/types/app' - -import { RiArrowRightLine, RiArrowRightSLine, RiCheckLine, RiCommandLine, RiCornerDownLeftLine, RiExchange2Fill } from '@remixicon/react' +import { RiArrowRightLine, RiArrowRightSLine, RiCheckLine, RiExchange2Fill } from '@remixicon/react' import { useDebounceFn, useKeyPress } from 'ahooks' import Image from 'next/image' import { useRouter } from 'next/navigation' @@ -32,6 +31,7 @@ import { getRedirection } from '@/utils/app-redirection' import { cn } from '@/utils/classnames' import { basePath } from '@/utils/var' import AppIconPicker from '../../base/app-icon-picker' +import ShortcutsName from '../../workflow/shortcuts-name' type CreateAppProps = { onSuccess: () => void @@ -342,10 +342,7 @@ function CreateApp({ onClose, onSuccess, onCreateFromTemplate, defaultAppMode }: diff --git a/web/app/components/app/create-from-dsl-modal/index.tsx b/web/app/components/app/create-from-dsl-modal/index.tsx index 129a54e2a3..3acb2933eb 100644 --- a/web/app/components/app/create-from-dsl-modal/index.tsx +++ b/web/app/components/app/create-from-dsl-modal/index.tsx @@ -29,6 +29,7 @@ import { } from '@/service/apps' import { getRedirection } from '@/utils/app-redirection' import { cn } from '@/utils/classnames' +import ShortcutsName from '../../workflow/shortcuts-name' import DSLConfirmModal from './dsl-confirm-modal' import Uploader from './uploader' @@ -327,8 +328,10 @@ const CreateFromDSLModal = ({ show, onSuccess, onClose, activeTab = CreateFromDS disabled={buttonDisabled} variant="primary" onClick={handleCreateApp} + className="gap-1" > - {t('newApp.import', { ns: 'app' })} + {t('newApp.import', { ns: 'app' })} + diff --git a/web/app/components/apps/app-card.tsx b/web/app/components/apps/app-card.tsx index b096d681e2..7415ba6d29 100644 --- a/web/app/components/apps/app-card.tsx +++ b/web/app/components/apps/app-card.tsx @@ -35,6 +35,7 @@ import { fetchWorkflowDraft } from '@/service/workflow' import { AppModeEnum } from '@/types/app' import { getRedirection } from '@/utils/app-redirection' import { cn } from '@/utils/classnames' +import { downloadBlob } from '@/utils/download' import { formatTime } from '@/utils/time' import { basePath } from '@/utils/var' @@ -172,13 +173,8 @@ const AppCard = ({ app, onRefresh, onlineUsers = [] }: AppCardProps) => { appID: app.id, include, }) - const a = document.createElement('a') const file = new Blob([data], { type: 'application/yaml' }) - const url = URL.createObjectURL(file) - a.href = url - a.download = `${app.name}.${isDownLoadBundle ? 'zip' : 'yaml'}` - a.click() - URL.revokeObjectURL(url) + downloadBlob({ data: file, fileName: `${app.name}.yml` }) } catch { notify({ @@ -363,7 +359,7 @@ const AppCard = ({ app, onRefresh, onlineUsers = [] }: AppCardProps) => { dateFormat: `${t('segment.dateTimeFormat', { ns: 'datasetDocuments' })}`, }) return `${t('segment.editedAt', { ns: 'datasetDocuments' })} ${timeText}` - }, [app.updated_at, app.created_at]) + }, [app.updated_at, app.created_at, t]) const onlineUserAvatars = useMemo(() => { if (!onlineUsers.length) diff --git a/web/app/components/apps/index.tsx b/web/app/components/apps/index.tsx index 255bfbf9c5..3be8492489 100644 --- a/web/app/components/apps/index.tsx +++ b/web/app/components/apps/index.tsx @@ -105,6 +105,7 @@ const Apps = () => { {isShowTryAppPanel && ( { e.stopPropagation() - downloadFile(url || base64Url || '', name) + downloadUrl({ url: url || base64Url || '', fileName: name, target: '_blank' }) }} > diff --git a/web/app/components/base/file-uploader/file-uploader-in-chat-input/file-image-item.tsx b/web/app/components/base/file-uploader/file-uploader-in-chat-input/file-image-item.tsx index 77dc3e35b8..d9118aac4f 100644 --- a/web/app/components/base/file-uploader/file-uploader-in-chat-input/file-image-item.tsx +++ b/web/app/components/base/file-uploader/file-uploader-in-chat-input/file-image-item.tsx @@ -8,9 +8,9 @@ import Button from '@/app/components/base/button' import { ReplayLine } from '@/app/components/base/icons/src/vender/other' import ImagePreview from '@/app/components/base/image-uploader/image-preview' import ProgressCircle from '@/app/components/base/progress-bar/progress-circle' +import { downloadUrl } from '@/utils/download' import FileImageRender from '../file-image-render' import { - downloadFile, fileIsUploaded, } from '../utils' @@ -85,7 +85,7 @@ const FileImageItem = ({ className="absolute bottom-0.5 right-0.5 flex h-6 w-6 items-center justify-center rounded-lg bg-components-actionbar-bg shadow-md" onClick={(e) => { e.stopPropagation() - downloadFile(download_url || '', name) + downloadUrl({ url: download_url || '', fileName: name, target: '_blank' }) }} > diff --git a/web/app/components/base/file-uploader/file-uploader-in-chat-input/file-item.tsx b/web/app/components/base/file-uploader/file-uploader-in-chat-input/file-item.tsx index 828864239a..af32f917b9 100644 --- a/web/app/components/base/file-uploader/file-uploader-in-chat-input/file-item.tsx +++ b/web/app/components/base/file-uploader/file-uploader-in-chat-input/file-item.tsx @@ -12,10 +12,10 @@ import VideoPreview from '@/app/components/base/file-uploader/video-preview' import { ReplayLine } from '@/app/components/base/icons/src/vender/other' import ProgressCircle from '@/app/components/base/progress-bar/progress-circle' import { cn } from '@/utils/classnames' +import { downloadUrl } from '@/utils/download' import { formatFileSize } from '@/utils/format' import FileTypeIcon from '../file-type-icon' import { - downloadFile, fileIsUploaded, getFileAppearanceType, getFileExtension, @@ -100,7 +100,7 @@ const FileItem = ({ className="absolute -right-1 -top-1 hidden group-hover/file-item:flex" onClick={(e) => { e.stopPropagation() - downloadFile(download_url || '', name) + downloadUrl({ url: download_url || '', fileName: name, target: '_blank' }) }} > diff --git a/web/app/components/base/file-uploader/utils.spec.ts b/web/app/components/base/file-uploader/utils.spec.ts index de167a8c25..f69b3c27f5 100644 --- a/web/app/components/base/file-uploader/utils.spec.ts +++ b/web/app/components/base/file-uploader/utils.spec.ts @@ -1,4 +1,3 @@ -import type { MockInstance } from 'vitest' import mime from 'mime' import { SupportUploadFileTypes } from '@/app/components/workflow/types' import { upload } from '@/service/base' @@ -6,7 +5,6 @@ import { TransferMethod } from '@/types/app' import { FILE_EXTS } from '../prompt-editor/constants' import { FileAppearanceTypeEnum } from './types' import { - downloadFile, fileIsUploaded, fileUpload, getFileAppearanceType, @@ -782,74 +780,4 @@ describe('file-uploader utils', () => { } as any)).toBe(true) }) }) - - describe('downloadFile', () => { - let mockAnchor: HTMLAnchorElement - let createElementMock: MockInstance - let appendChildMock: MockInstance - let removeChildMock: MockInstance - - beforeEach(() => { - // Mock createElement and appendChild - mockAnchor = { - href: '', - download: '', - style: { display: '' }, - target: '', - title: '', - click: vi.fn(), - } as unknown as HTMLAnchorElement - - createElementMock = vi.spyOn(document, 'createElement').mockReturnValue(mockAnchor as any) - appendChildMock = vi.spyOn(document.body, 'appendChild').mockImplementation((node: Node) => { - return node - }) - removeChildMock = vi.spyOn(document.body, 'removeChild').mockImplementation((node: Node) => { - return node - }) - }) - - afterEach(() => { - vi.resetAllMocks() - }) - - it('should create and trigger download with correct attributes', () => { - const url = 'https://example.com/test.pdf' - const filename = 'test.pdf' - - downloadFile(url, filename) - - // Verify anchor element was created with correct properties - expect(createElementMock).toHaveBeenCalledWith('a') - expect(mockAnchor.href).toBe(url) - expect(mockAnchor.download).toBe(filename) - expect(mockAnchor.style.display).toBe('none') - expect(mockAnchor.target).toBe('_blank') - expect(mockAnchor.title).toBe(filename) - - // Verify DOM operations - expect(appendChildMock).toHaveBeenCalledWith(mockAnchor) - expect(mockAnchor.click).toHaveBeenCalled() - expect(removeChildMock).toHaveBeenCalledWith(mockAnchor) - }) - - it('should handle empty filename', () => { - const url = 'https://example.com/test.pdf' - const filename = '' - - downloadFile(url, filename) - - expect(mockAnchor.download).toBe('') - expect(mockAnchor.title).toBe('') - }) - - it('should handle empty url', () => { - const url = '' - const filename = 'test.pdf' - - downloadFile(url, filename) - - expect(mockAnchor.href).toBe('') - }) - }) }) diff --git a/web/app/components/base/file-uploader/utils.ts b/web/app/components/base/file-uploader/utils.ts index 5d5754b8fe..23e460db51 100644 --- a/web/app/components/base/file-uploader/utils.ts +++ b/web/app/components/base/file-uploader/utils.ts @@ -249,15 +249,3 @@ export const fileIsUploaded = (file: FileEntity) => { if (file.transferMethod === TransferMethod.remote_url && file.progress === 100) return true } - -export const downloadFile = (url: string, filename: string) => { - const anchor = document.createElement('a') - anchor.href = url - anchor.download = filename - anchor.style.display = 'none' - anchor.target = '_blank' - anchor.title = filename - document.body.appendChild(anchor) - anchor.click() - document.body.removeChild(anchor) -} diff --git a/web/app/components/base/icons/assets/vender/knowledge/search-menu.svg b/web/app/components/base/icons/assets/vender/knowledge/search-menu.svg index 39a2298c0e..2c7be9cd7e 100644 --- a/web/app/components/base/icons/assets/vender/knowledge/search-menu.svg +++ b/web/app/components/base/icons/assets/vender/knowledge/search-menu.svg @@ -1,7 +1,7 @@ - - - - - + + + + + diff --git a/web/app/components/base/icons/assets/vender/line/general/check.svg b/web/app/components/base/icons/assets/vender/line/general/check.svg index 4fe24c808d..2e4367fa75 100644 --- a/web/app/components/base/icons/assets/vender/line/general/check.svg +++ b/web/app/components/base/icons/assets/vender/line/general/check.svg @@ -1,5 +1,5 @@ - + diff --git a/web/app/components/base/icons/assets/vender/line/general/info-circle.svg b/web/app/components/base/icons/assets/vender/line/general/info-circle.svg index 1c0e19c6f6..a7b2ec4378 100644 --- a/web/app/components/base/icons/assets/vender/line/general/info-circle.svg +++ b/web/app/components/base/icons/assets/vender/line/general/info-circle.svg @@ -1,6 +1,6 @@ - + diff --git a/web/app/components/base/icons/assets/vender/solid/development/table-cells.svg b/web/app/components/base/icons/assets/vender/solid/development/table-cells.svg index 7a97eb35d6..6feffd3b1e 100644 --- a/web/app/components/base/icons/assets/vender/solid/development/table-cells.svg +++ b/web/app/components/base/icons/assets/vender/solid/development/table-cells.svg @@ -1,3 +1,3 @@ - + diff --git a/web/app/components/base/icons/assets/vender/workflow/folder-spark.svg b/web/app/components/base/icons/assets/vender/workflow/folder-spark.svg index 6f1beeb5a3..540da135e2 100644 --- a/web/app/components/base/icons/assets/vender/workflow/folder-spark.svg +++ b/web/app/components/base/icons/assets/vender/workflow/folder-spark.svg @@ -1,4 +1,4 @@ - - + + diff --git a/web/app/components/base/icons/assets/vender/workflow/home.svg b/web/app/components/base/icons/assets/vender/workflow/home.svg index f7c6988265..62f7e54cc3 100644 --- a/web/app/components/base/icons/assets/vender/workflow/home.svg +++ b/web/app/components/base/icons/assets/vender/workflow/home.svg @@ -1,5 +1,5 @@ - + diff --git a/web/app/components/base/image-uploader/image-preview.tsx b/web/app/components/base/image-uploader/image-preview.tsx index b6a07c60aa..0641af3d79 100644 --- a/web/app/components/base/image-uploader/image-preview.tsx +++ b/web/app/components/base/image-uploader/image-preview.tsx @@ -8,6 +8,7 @@ import { createPortal } from 'react-dom' import { useHotkeys } from 'react-hotkeys-hook' import Toast from '@/app/components/base/toast' import Tooltip from '@/app/components/base/tooltip' +import { downloadUrl } from '@/utils/download' type ImagePreviewProps = { url: string @@ -60,27 +61,14 @@ const ImagePreview: FC = ({ const downloadImage = () => { // Open in a new window, considering the case when the page is inside an iframe - if (url.startsWith('http') || url.startsWith('https')) { - const a = document.createElement('a') - a.href = url - a.target = '_blank' - a.download = title - a.click() - } - else if (url.startsWith('data:image')) { - // Base64 image - const a = document.createElement('a') - a.href = url - a.target = '_blank' - a.download = title - a.click() - } - else { - Toast.notify({ - type: 'error', - message: `Unable to open image: ${url}`, - }) + if (url.startsWith('http') || url.startsWith('https') || url.startsWith('data:image')) { + downloadUrl({ url, fileName: title, target: '_blank' }) + return } + Toast.notify({ + type: 'error', + message: `Unable to open image: ${url}`, + }) } const zoomIn = () => { @@ -135,12 +123,7 @@ const ImagePreview: FC = ({ catch (err) { console.error('Failed to copy image:', err) - const link = document.createElement('a') - link.href = url - link.download = `${title}.png` - document.body.appendChild(link) - link.click() - document.body.removeChild(link) + downloadUrl({ url, fileName: `${title}.png` }) Toast.notify({ type: 'info', @@ -215,6 +198,7 @@ const ImagePreview: FC = ({ tabIndex={-1} > { } + {/* eslint-disable-next-line next/no-img-element */} {title} { }, [isShow]) const downloadQR = () => { - const canvas = document.getElementsByTagName('canvas')[0] - const link = document.createElement('a') - link.download = 'qrcode.png' - link.href = canvas.toDataURL() - link.click() + const canvas = qrCodeRef.current?.querySelector('canvas') + if (!(canvas instanceof HTMLCanvasElement)) + return + downloadUrl({ url: canvas.toDataURL(), fileName: 'qrcode.png' }) } const handlePanelClick = (event: React.MouseEvent) => { diff --git a/web/app/components/datasets/common/document-status-with-action/index-failed.spec.tsx b/web/app/components/datasets/common/document-status-with-action/index-failed.spec.tsx index 43255ce908..27070aaaed 100644 --- a/web/app/components/datasets/common/document-status-with-action/index-failed.spec.tsx +++ b/web/app/components/datasets/common/document-status-with-action/index-failed.spec.tsx @@ -179,8 +179,10 @@ describe('RetryButton (IndexFailed)', () => { }, false), ) - // Delay the response to test loading state - mockRetryErrorDocs.mockImplementation(() => new Promise(resolve => setTimeout(() => resolve({ result: 'success' }), 100))) + let resolveRetry: ((value: { result: 'success' }) => void) | undefined + mockRetryErrorDocs.mockImplementation(() => new Promise((resolve) => { + resolveRetry = resolve + })) render() @@ -193,6 +195,11 @@ describe('RetryButton (IndexFailed)', () => { expect(button).toHaveClass('cursor-not-allowed') expect(button).toHaveClass('text-text-disabled') }) + + resolveRetry?.({ result: 'success' }) + await waitFor(() => { + expect(mockRefetch).toHaveBeenCalled() + }) }) }) diff --git a/web/app/components/datasets/create-from-pipeline/list/template-card/index.spec.tsx b/web/app/components/datasets/create-from-pipeline/list/template-card/index.spec.tsx index 290f7af99b..036370abd3 100644 --- a/web/app/components/datasets/create-from-pipeline/list/template-card/index.spec.tsx +++ b/web/app/components/datasets/create-from-pipeline/list/template-card/index.spec.tsx @@ -23,9 +23,10 @@ vi.mock('@/app/components/base/toast', () => ({ }, })) -// Mock downloadFile utility -vi.mock('@/utils/format', () => ({ - downloadFile: vi.fn(), +// Mock download utilities +vi.mock('@/utils/download', () => ({ + downloadBlob: vi.fn(), + downloadUrl: vi.fn(), })) // Capture Confirm callbacks @@ -502,8 +503,8 @@ describe('TemplateCard', () => { }) }) - it('should call downloadFile on successful export', async () => { - const { downloadFile } = await import('@/utils/format') + it('should call downloadBlob on successful export', async () => { + const { downloadBlob } = await import('@/utils/download') mockExportPipelineDSL.mockImplementation((_id, callbacks) => { callbacks.onSuccess({ data: 'yaml_content' }) return Promise.resolve() @@ -514,7 +515,7 @@ describe('TemplateCard', () => { fireEvent.click(exportButton) await waitFor(() => { - expect(downloadFile).toHaveBeenCalledWith(expect.objectContaining({ + expect(downloadBlob).toHaveBeenCalledWith(expect.objectContaining({ fileName: 'Test Pipeline.pipeline', })) }) diff --git a/web/app/components/datasets/create-from-pipeline/list/template-card/index.tsx b/web/app/components/datasets/create-from-pipeline/list/template-card/index.tsx index 662ca72080..b3395a83d5 100644 --- a/web/app/components/datasets/create-from-pipeline/list/template-card/index.tsx +++ b/web/app/components/datasets/create-from-pipeline/list/template-card/index.tsx @@ -16,7 +16,7 @@ import { useInvalidCustomizedTemplateList, usePipelineTemplateById, } from '@/service/use-pipeline' -import { downloadFile } from '@/utils/format' +import { downloadBlob } from '@/utils/download' import Actions from './actions' import Content from './content' import Details from './details' @@ -108,10 +108,7 @@ const TemplateCard = ({ await exportPipelineDSL(pipeline.id, { onSuccess: (res) => { const blob = new Blob([res.data], { type: 'application/yaml' }) - downloadFile({ - data: blob, - fileName: `${pipeline.name}.pipeline`, - }) + downloadBlob({ data: blob, fileName: `${pipeline.name}.pipeline` }) Toast.notify({ type: 'success', message: t('exportDSL.successTip', { ns: 'datasetPipeline' }), diff --git a/web/app/components/datasets/create/website/watercrawl/index.tsx b/web/app/components/datasets/create/website/watercrawl/index.tsx index 0df2dbe8a1..e68a89ae5a 100644 --- a/web/app/components/datasets/create/website/watercrawl/index.tsx +++ b/web/app/components/datasets/create/website/watercrawl/index.tsx @@ -125,11 +125,25 @@ const WaterCrawl: FC = ({ await sleep(2500) return await waitForCrawlFinished(jobId) } - catch (e: any) { - const errorBody = await e.json() + catch (error: unknown) { + let errorMessage = '' + + const maybeErrorWithJson = error as { json?: () => Promise, message?: unknown } | null + if (maybeErrorWithJson?.json) { + try { + const errorBody = await maybeErrorWithJson.json() as { message?: unknown } | null + if (typeof errorBody?.message === 'string') + errorMessage = errorBody.message + } + catch {} + } + + if (!errorMessage && typeof maybeErrorWithJson?.message === 'string') + errorMessage = maybeErrorWithJson.message + return { isError: true, - errorMessage: errorBody.message, + errorMessage, data: { data: [], }, diff --git a/web/app/components/datasets/documents/detail/completed/common/action-buttons.tsx b/web/app/components/datasets/documents/detail/completed/common/action-buttons.tsx index efb9848494..a0cbfea147 100644 --- a/web/app/components/datasets/documents/detail/completed/common/action-buttons.tsx +++ b/web/app/components/datasets/documents/detail/completed/common/action-buttons.tsx @@ -4,7 +4,8 @@ import * as React from 'react' import { useMemo } from 'react' import { useTranslation } from 'react-i18next' import Button from '@/app/components/base/button' -import { getKeyboardKeyCodeBySystem, getKeyboardKeyNameBySystem } from '@/app/components/workflow/utils' +import ShortcutsName from '@/app/components/workflow/shortcuts-name' +import { getKeyboardKeyCodeBySystem } from '@/app/components/workflow/utils' import { ChunkingMode } from '@/models/datasets' import { useDocumentContext } from '../../context' @@ -54,7 +55,7 @@ const ActionButtons: FC = ({ >
{t('operation.cancel', { ns: 'common' })} - ESC +
{(isParentChildParagraphMode && actionType === 'edit' && !isChildChunk && showRegenerationButton) @@ -76,10 +77,7 @@ const ActionButtons: FC = ({ >
{t('operation.save', { ns: 'common' })} -
- {getKeyboardKeyNameBySystem('ctrl')} - S -
+
diff --git a/web/app/components/datasets/list/dataset-card/hooks/use-dataset-card-state.ts b/web/app/components/datasets/list/dataset-card/hooks/use-dataset-card-state.ts index ad68a1df1c..4bd8357f1c 100644 --- a/web/app/components/datasets/list/dataset-card/hooks/use-dataset-card-state.ts +++ b/web/app/components/datasets/list/dataset-card/hooks/use-dataset-card-state.ts @@ -5,6 +5,7 @@ import { useTranslation } from 'react-i18next' import Toast from '@/app/components/base/toast' import { useCheckDatasetUsage, useDeleteDataset } from '@/service/use-dataset-card' import { useExportPipelineDSL } from '@/service/use-pipeline' +import { downloadBlob } from '@/utils/download' type ModalState = { showRenameModal: boolean @@ -65,13 +66,8 @@ export const useDatasetCardState = ({ dataset, onSuccess }: UseDatasetCardStateO pipelineId: pipeline_id, include, }) - const a = document.createElement('a') const file = new Blob([data], { type: 'application/yaml' }) - const url = URL.createObjectURL(file) - a.href = url - a.download = `${name}.pipeline` - a.click() - URL.revokeObjectURL(url) + downloadBlob({ data: file, fileName: `${name}.pipeline` }) } catch { Toast.notify({ type: 'error', message: t('exportFailed', { ns: 'app' }) }) diff --git a/web/app/components/explore/app-card/index.tsx b/web/app/components/explore/app-card/index.tsx index 5d82ab65cc..15152e0695 100644 --- a/web/app/components/explore/app-card/index.tsx +++ b/web/app/components/explore/app-card/index.tsx @@ -74,17 +74,15 @@ const AppCard = ({ {isExplore && (canCreate || isTrialApp) && ( diff --git a/web/app/components/explore/try-app/index.tsx b/web/app/components/explore/try-app/index.tsx index b2e2b72140..c6f00ed08e 100644 --- a/web/app/components/explore/try-app/index.tsx +++ b/web/app/components/explore/try-app/index.tsx @@ -1,11 +1,13 @@ /* eslint-disable style/multiline-ternary */ 'use client' import type { FC } from 'react' +import type { App as AppType } from '@/models/explore' import { RiCloseLine } from '@remixicon/react' import * as React from 'react' import { useState } from 'react' import Loading from '@/app/components/base/loading' import Modal from '@/app/components/base/modal/index' +import { useGlobalPublicStore } from '@/context/global-public-context' import { useGetTryAppInfo } from '@/service/use-try-app' import Button from '../../base/button' import App from './app' @@ -15,6 +17,7 @@ import Tab, { TypeEnum } from './tab' type Props = { appId: string + app?: AppType category?: string onClose: () => void onCreate: () => void @@ -22,13 +25,23 @@ type Props = { const TryApp: FC = ({ appId, + app, category, onClose, onCreate, }) => { - const [type, setType] = useState(TypeEnum.TRY) + const { systemFeatures } = useGlobalPublicStore() + const isTrialApp = !!(app && app.can_trial && systemFeatures.enable_trial_app) + const [type, setType] = useState(() => (app && !isTrialApp ? TypeEnum.DETAIL : TypeEnum.TRY)) const { data: appDetail, isLoading } = useGetTryAppInfo(appId) + React.useEffect(() => { + if (app && !isTrialApp && type !== TypeEnum.DETAIL) + // eslint-disable-next-line react-hooks-extra/no-direct-set-state-in-use-effect + setType(TypeEnum.DETAIL) + // eslint-disable-next-line react-hooks/exhaustive-deps + }, [app, isTrialApp]) + return ( = ({ diff --git a/web/app/components/rag-pipeline/hooks/use-DSL.ts b/web/app/components/rag-pipeline/hooks/use-DSL.ts index 1660d555eb..5c0f9def1c 100644 --- a/web/app/components/rag-pipeline/hooks/use-DSL.ts +++ b/web/app/components/rag-pipeline/hooks/use-DSL.ts @@ -11,6 +11,7 @@ import { useWorkflowStore } from '@/app/components/workflow/store' import { useEventEmitterContextContext } from '@/context/event-emitter' import { useExportPipelineDSL } from '@/service/use-pipeline' import { fetchWorkflowDraft } from '@/service/workflow' +import { downloadBlob } from '@/utils/download' import { useNodesSyncDraft } from './use-nodes-sync-draft' export const useDSL = () => { @@ -37,13 +38,8 @@ export const useDSL = () => { pipelineId, include, }) - const a = document.createElement('a') const file = new Blob([data], { type: 'application/yaml' }) - const url = URL.createObjectURL(file) - a.href = url - a.download = `${knowledgeName}.pipeline` - a.click() - URL.revokeObjectURL(url) + downloadBlob({ data: file, fileName: `${knowledgeName}.pipeline` }) } catch { notify({ type: 'error', message: t('exportFailed', { ns: 'app' }) }) diff --git a/web/app/components/workflow-app/components/workflow-onboarding-modal/index.tsx b/web/app/components/workflow-app/components/workflow-onboarding-modal/index.tsx index c483abfb0b..16bae51246 100644 --- a/web/app/components/workflow-app/components/workflow-onboarding-modal/index.tsx +++ b/web/app/components/workflow-app/components/workflow-onboarding-modal/index.tsx @@ -7,6 +7,7 @@ import { } from 'react' import { useTranslation } from 'react-i18next' import Modal from '@/app/components/base/modal' +import ShortcutsName from '@/app/components/workflow/shortcuts-name' import { BlockEnum } from '@/app/components/workflow/types' import StartNodeSelectionPanel from './start-node-selection-panel' @@ -75,9 +76,7 @@ const WorkflowOnboardingModal: FC = ({ {isShow && (
{t('onboarding.escTip.press', { ns: 'workflow' })} - - {t('onboarding.escTip.key', { ns: 'workflow' })} - + {t('onboarding.escTip.toDismiss', { ns: 'workflow' })}
)} diff --git a/web/app/components/workflow/block-selector/market-place-plugin/action.tsx b/web/app/components/workflow/block-selector/market-place-plugin/action.tsx index b8300d6f2b..abdbae1b4c 100644 --- a/web/app/components/workflow/block-selector/market-place-plugin/action.tsx +++ b/web/app/components/workflow/block-selector/market-place-plugin/action.tsx @@ -15,7 +15,7 @@ import { } from '@/app/components/base/portal-to-follow-elem' import { useDownloadPlugin } from '@/service/use-plugins' import { cn } from '@/utils/classnames' -import { downloadFile } from '@/utils/format' +import { downloadBlob } from '@/utils/download' import { getMarketplaceUrl } from '@/utils/var' type Props = { @@ -67,7 +67,7 @@ const OperationDropdown: FC = ({ if (!needDownload || !blob) return const fileName = `${author}-${name}_${version}.zip` - downloadFile({ data: blob, fileName }) + downloadBlob({ data: blob, fileName }) setNeedDownload(false) queryClient.removeQueries({ queryKey: ['plugins', 'downloadPlugin', downloadInfo], diff --git a/web/app/components/workflow/header/run-mode.tsx b/web/app/components/workflow/header/run-mode.tsx index 1a101bc6d2..74bc5bc80a 100644 --- a/web/app/components/workflow/header/run-mode.tsx +++ b/web/app/components/workflow/header/run-mode.tsx @@ -7,9 +7,9 @@ import { trackEvent } from '@/app/components/base/amplitude' import { StopCircle } from '@/app/components/base/icons/src/vender/line/mediaAndDevices' import { useToastContext } from '@/app/components/base/toast' import { useWorkflowRun, useWorkflowRunValidation, useWorkflowStartRun } from '@/app/components/workflow/hooks' +import ShortcutsName from '@/app/components/workflow/shortcuts-name' import { useStore } from '@/app/components/workflow/store' import { WorkflowRunningStatus } from '@/app/components/workflow/types' -import { getKeyboardKeyNameBySystem } from '@/app/components/workflow/utils' import { EVENT_WORKFLOW_STOP } from '@/app/components/workflow/variable-inspect/types' import { useEventEmitterContextContext } from '@/context/event-emitter' import { cn } from '@/utils/classnames' @@ -143,14 +143,7 @@ const RunMode = ({ > {text ?? t('common.run', { ns: 'workflow' })} -
-
- {getKeyboardKeyNameBySystem('alt')} -
-
- R -
-
+ ) diff --git a/web/app/components/workflow/header/version-history-button.tsx b/web/app/components/workflow/header/version-history-button.tsx index 32e72dc184..b98dfeea76 100644 --- a/web/app/components/workflow/header/version-history-button.tsx +++ b/web/app/components/workflow/header/version-history-button.tsx @@ -8,7 +8,8 @@ import useTheme from '@/hooks/use-theme' import { cn } from '@/utils/classnames' import Button from '../../base/button' import Tooltip from '../../base/tooltip' -import { getKeyboardKeyCodeBySystem, getKeyboardKeyNameBySystem } from '../utils' +import ShortcutsName from '../shortcuts-name' +import { getKeyboardKeyCodeBySystem } from '../utils' type VersionHistoryButtonProps = { onClick: () => Promise | unknown @@ -23,16 +24,7 @@ const PopupContent = React.memo(() => {
{t('common.versionHistory', { ns: 'workflow' })}
-
- {VERSION_HISTORY_SHORTCUT.map(key => ( - - {getKeyboardKeyNameBySystem(key)} - - ))} -
+ ) }) diff --git a/web/app/components/workflow/nodes/http/components/curl-panel.tsx b/web/app/components/workflow/nodes/http/components/curl-panel.tsx index aa67a2a0ae..6c809c310f 100644 --- a/web/app/components/workflow/nodes/http/components/curl-panel.tsx +++ b/web/app/components/workflow/nodes/http/components/curl-panel.tsx @@ -41,7 +41,7 @@ const parseCurl = (curlCommand: string): { node: HttpNodeType | null, error: str case '--request': if (i + 1 >= args.length) return { node: null, error: 'Missing HTTP method after -X or --request.' } - node.method = (args[++i].replace(/^['"]|['"]$/g, '') as Method) || Method.get + node.method = (args[++i].replace(/^['"]|['"]$/g, '').toLowerCase() as Method) || Method.get hasData = true break case '-H': diff --git a/web/app/components/workflow/nodes/llm/components/json-schema-config-modal/visual-editor/edit-card/advanced-actions.tsx b/web/app/components/workflow/nodes/llm/components/json-schema-config-modal/visual-editor/edit-card/advanced-actions.tsx index 536277b9e2..8aad824008 100644 --- a/web/app/components/workflow/nodes/llm/components/json-schema-config-modal/visual-editor/edit-card/advanced-actions.tsx +++ b/web/app/components/workflow/nodes/llm/components/json-schema-config-modal/visual-editor/edit-card/advanced-actions.tsx @@ -3,7 +3,8 @@ import { useKeyPress } from 'ahooks' import * as React from 'react' import { useTranslation } from 'react-i18next' import Button from '@/app/components/base/button' -import { getKeyboardKeyCodeBySystem, getKeyboardKeyNameBySystem } from '@/app/components/workflow/utils' +import ShortcutsName from '@/app/components/workflow/shortcuts-name' +import { getKeyboardKeyCodeBySystem } from '@/app/components/workflow/utils' type AdvancedActionsProps = { isConfirmDisabled: boolean @@ -11,15 +12,6 @@ type AdvancedActionsProps = { onConfirm: () => void } -const Key = (props: { keyName: string }) => { - const { keyName } = props - return ( - - {keyName} - - ) -} - const AdvancedActions: FC = ({ isConfirmDisabled, onCancel, @@ -48,10 +40,7 @@ const AdvancedActions: FC = ({ onClick={onConfirm} > {t('operation.confirm', { ns: 'common' })} -
- - -
+ ) diff --git a/web/app/components/workflow/operator/more-actions.tsx b/web/app/components/workflow/operator/more-actions.tsx index e9fc1ea87d..7e6617e84b 100644 --- a/web/app/components/workflow/operator/more-actions.tsx +++ b/web/app/components/workflow/operator/more-actions.tsx @@ -19,6 +19,7 @@ import { } from '@/app/components/base/portal-to-follow-elem' import { useStore } from '@/app/components/workflow/store' import { cn } from '@/utils/classnames' +import { downloadUrl } from '@/utils/download' import { useNodesReadOnly } from '../hooks' import TipPopup from './tip-popup' @@ -146,26 +147,14 @@ const MoreActions: FC = () => { } } + const fileName = `${filename}.${type}` + if (currentWorkflow) { setPreviewUrl(dataUrl) - setPreviewTitle(`${filename}.${type}`) + setPreviewTitle(fileName) + } - const link = document.createElement('a') - link.href = dataUrl - link.download = `${filename}.${type}` - document.body.appendChild(link) - link.click() - document.body.removeChild(link) - } - else { - // For current view, just download - const link = document.createElement('a') - link.href = dataUrl - link.download = `${filename}.${type}` - document.body.appendChild(link) - link.click() - document.body.removeChild(link) - } + downloadUrl({ url: dataUrl, fileName }) } catch (error) { console.error('Export image failed:', error) diff --git a/web/app/components/workflow/shortcuts-name.tsx b/web/app/components/workflow/shortcuts-name.tsx index a31528d00c..d2ee75b341 100644 --- a/web/app/components/workflow/shortcuts-name.tsx +++ b/web/app/components/workflow/shortcuts-name.tsx @@ -6,11 +6,13 @@ type ShortcutsNameProps = { keys: readonly string[] className?: string textColor?: 'default' | 'secondary' + bgColor?: 'gray' | 'white' } const ShortcutsName = ({ keys, className, textColor = 'default', + bgColor = 'gray', }: ShortcutsNameProps) => { return (
diff --git a/web/app/components/workflow/skill/editor/skill-editor/plugins/file-picker-panel.tsx b/web/app/components/workflow/skill/editor/skill-editor/plugins/file-picker-panel.tsx index bc19b18c0c..340b395b48 100644 --- a/web/app/components/workflow/skill/editor/skill-editor/plugins/file-picker-panel.tsx +++ b/web/app/components/workflow/skill/editor/skill-editor/plugins/file-picker-panel.tsx @@ -1,7 +1,6 @@ import type { NodeRendererProps } from 'react-arborist' import type { FileAppearanceType } from '@/app/components/base/file-uploader/types' import type { TreeNodeData } from '@/app/components/workflow/skill/type' -import { RiArrowDownSLine, RiArrowRightSLine, RiFolderLine, RiFolderOpenLine, RiQuestionLine } from '@remixicon/react' import { useSize } from 'ahooks' import * as React from 'react' import { useCallback, useMemo, useRef } from 'react' @@ -66,8 +65,8 @@ const FilePickerTreeNode = ({ node, style, dragHandle, onSelectNode }: FilePicke {isFolder ? ( node.isOpen - ?
@@ -162,7 +161,7 @@ const FilePickerPanel = ({ {t('skillEditor.referenceFiles')} -