Fix/merge fix (#25781)

Jyong 2025-09-16 19:14:33 +08:00 committed by GitHub
commit b37bef44f6
17 changed files with 82 additions and 64 deletions


@@ -61,6 +61,7 @@ from . import (
     init_validate,
     ping,
     setup,
+    spec,
     version,
 )


@@ -741,7 +741,7 @@ class DatasetApiDeleteApi(Resource):
         return {"result": "success"}, 204
 
-@console_ns.route("/datasets/<uuid:dataset_id>/api-keys/<str:status>")
+@console_ns.route("/datasets/<uuid:dataset_id>/api-keys/<string:status>")
 class DatasetEnableApiApi(Resource):
     @setup_required
     @login_required
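The swap of `str` for `string` above is a real fix, not style: Werkzeug only ships converters named `string`, `int`, `float`, `path`, `any`, and `uuid`, so a `<str:...>` rule fails as soon as it is registered. A minimal sketch with plain Flask (not Dify's `console_ns`, and a hypothetical handler):

```python
from flask import Flask

app = Flask(__name__)

# "string" is Werkzeug's built-in text converter; there is no "str" converter.
@app.route("/datasets/<uuid:dataset_id>/api-keys/<string:status>")
def dataset_api_status(dataset_id, status):
    return {"dataset_id": str(dataset_id), "status": status}

# Registering ".../<str:status>" instead raises LookupError
# ("the converter 'str' does not exist") when the URL rule is added.
```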


@@ -1,6 +1,6 @@
 from collections.abc import Sequence
 from enum import StrEnum, auto
-from typing import Any, Literal
+from typing import Any, Literal, Optional
 
 from pydantic import BaseModel, Field, field_validator


@@ -119,7 +119,7 @@ class PipelineGenerator(BaseAppGenerator):
     ) -> Union[Mapping[str, Any], Generator[Mapping | str, None, None], None]:
         # Add null check for dataset
-        with Session(db.engine) as session:
+        with Session(db.engine, expire_on_commit=False) as session:
             dataset = pipeline.retrieve_dataset(session)
             if not dataset:
                 raise ValueError("Pipeline dataset is required")


@@ -1,3 +1,5 @@
+from typing import Optional
+
 from pydantic import BaseModel, ConfigDict
 
 from models.dataset import Document


@@ -2,7 +2,7 @@
 from abc import ABC, abstractmethod
 from collections.abc import Mapping
-from typing import TYPE_CHECKING, Any
+from typing import TYPE_CHECKING, Any, Optional
 
 from configs import dify_config
 from core.rag.extractor.entity.extract_setting import ExtractSetting
@@ -64,7 +64,7 @@ class BaseIndexProcessor(ABC):
         max_tokens: int,
         chunk_overlap: int,
         separator: str,
-        embedding_model_instance: ModelInstance | None,
+        embedding_model_instance: Optional["ModelInstance"],
     ) -> TextSplitter:
         """
         Get the NodeParser object according to the processing rule.
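The `Optional["ModelInstance"]` change is likewise not cosmetic: `ModelInstance` is presumably only imported under `TYPE_CHECKING` here, and without `from __future__ import annotations` the annotation `ModelInstance | None` is evaluated eagerly at import time and raises `NameError`. The quoted form stays a lazy `ForwardRef`. A standalone sketch of the distinction (import path is hypothetical):

```python
from typing import TYPE_CHECKING, Optional

if TYPE_CHECKING:
    # Only visible to type checkers; the name does not exist at runtime.
    from core.model_manager import ModelInstance  # hypothetical import path

def get_splitter(embedding_model_instance: Optional["ModelInstance"]) -> None:
    """Fine: the string is stored as a ForwardRef and never evaluated at import."""

# def get_splitter(embedding_model_instance: ModelInstance | None) -> None: ...
# would raise NameError when this module is imported, because the `|` union
# is built eagerly and ModelInstance is undefined at runtime.
```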


@@ -141,7 +141,7 @@ class KnowledgeIndexNode(Node):
         index_processor = IndexProcessorFactory(dataset.chunk_structure).init_index_processor()
         if original_document_id:
             segments = db.session.scalars(
-                select(DocumentSegment).where(DocumentSegment.document_id == document_id)
+                select(DocumentSegment).where(DocumentSegment.document_id == original_document_id.value)
             ).all()
             if segments:
                 index_node_ids = [segment.index_node_id for segment in segments]
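Read with the retry changes below, this hunk appears to fix stale-index cleanup: the segment lookup had been keyed on the current run's `document_id`, so when a document was re-indexed the original document's segments were never collected for removal; it now uses `original_document_id.value`, the payload of the workflow variable carried in the run's inputs.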


@@ -10,7 +10,7 @@ import re
 import time
 from datetime import datetime
 from json import JSONDecodeError
-from typing import Any, cast
+from typing import Any, Optional, cast
 
 import sqlalchemy as sa
 from sqlalchemy import DateTime, String, func, select


@@ -3,7 +3,7 @@ import logging
 from collections.abc import Mapping, Sequence
 from datetime import datetime
 from enum import StrEnum
-from typing import TYPE_CHECKING, Any, Union, cast
+from typing import TYPE_CHECKING, Any, Optional, Union, cast
 from uuid import uuid4
 
 import sqlalchemy as sa


@@ -7,7 +7,7 @@ import time
 import uuid
 from collections import Counter
 from collections.abc import Sequence
-from typing import Any, Literal
+from typing import Any, Literal, Optional
 
 import sqlalchemy as sa
 from sqlalchemy import exists, func, select


@@ -4,7 +4,8 @@ from typing import Any, Union
 from configs import dify_config
 from core.app.apps.pipeline.pipeline_generator import PipelineGenerator
 from core.app.entities.app_invoke_entities import InvokeFrom
-from models.dataset import Pipeline
+from extensions.ext_database import db
+from models.dataset import Document, Pipeline
 from models.model import Account, App, EndUser
 from models.workflow import Workflow
 from services.rag_pipeline.rag_pipeline import RagPipelineService
@@ -31,6 +32,9 @@ class PipelineGenerateService:
         """
         try:
             workflow = cls._get_workflow(pipeline, invoke_from)
+            if original_document_id := args.get("original_document_id"):
+                # update document status to waiting
+                cls.update_document_status(original_document_id)
             return PipelineGenerator.convert_to_event_stream(
                 PipelineGenerator().generate(
                     pipeline=pipeline,
@@ -97,3 +101,15 @@ class PipelineGenerateService:
             raise ValueError("Workflow not published")
         return workflow
+
+    @classmethod
+    def update_document_status(cls, document_id: str):
+        """
+        Update document status to waiting
+        :param document_id: document id
+        """
+        document = db.session.query(Document).filter(Document.id == document_id).first()
+        if document:
+            document.indexing_status = "waiting"
+            db.session.add(document)
+            db.session.commit()


@@ -1348,11 +1348,11 @@ class RagPipelineService:
                 "start_node_id": document_pipeline_excution_log.datasource_node_id,
                 "datasource_type": document_pipeline_excution_log.datasource_type,
                 "datasource_info_list": [json.loads(document_pipeline_excution_log.datasource_info)],
+                "original_document_id": document.id,
             },
             invoke_from=InvokeFrom.PUBLISHED,
             streaming=False,
             call_depth=0,
             workflow_thread_pool_id=None,
-            is_retry=True,
-            documents=[document],
         )
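Together, these service-layer hunks replace the removed `is_retry`/`documents` parameters: the retry path now puts `original_document_id` into the run's inputs, `PipelineGenerateService` resets that document's `indexing_status` to `"waiting"` before streaming the run, and `KnowledgeIndexNode` (above) uses the same id to find the stale segments to purge before re-indexing.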


@@ -282,7 +282,7 @@ workflow:
       conditions:
       - comparison_operator: is
        id: 9da88d93-3ff6-463f-abfd-6bcafbf2554d
-       value: xlsx
+       value: .xlsx
        varType: file
        variable_selector:
        - '1752479895761'
@@ -290,7 +290,7 @@ workflow:
        - extension
      - comparison_operator: is
        id: d0e88f5e-dfe3-4bae-af0c-dbec267500de
-       value: xls
+       value: .xls
        varType: file
        variable_selector:
        - '1752479895761'
@@ -298,7 +298,7 @@ workflow:
        - extension
      - comparison_operator: is
        id: a957e91e-1ed7-4c6b-9c80-2f0948858f1d
-       value: md
+       value: .md
        varType: file
        variable_selector:
        - '1752479895761'
@@ -306,7 +306,7 @@ workflow:
        - extension
      - comparison_operator: is
        id: 870c3c39-8d3f-474a-ab8b-9c0ccf53db73
-       value: markdown
+       value: .markdown
        varType: file
        variable_selector:
        - '1752479895761'
@@ -314,7 +314,7 @@ workflow:
        - extension
      - comparison_operator: is
        id: f9541513-1e71-4dc1-9db5-35dc84a39e3c
-       value: mdx
+       value: .mdx
        varType: file
        variable_selector:
        - '1752479895761'
@@ -322,7 +322,7 @@ workflow:
        - extension
      - comparison_operator: is
        id: 4c7f455b-ac20-40ca-9495-6cc44ffcb35d
-       value: html
+       value: .html
        varType: file
        variable_selector:
        - '1752479895761'
@@ -330,7 +330,7 @@ workflow:
        - extension
      - comparison_operator: is
        id: 2e12d9c7-8057-4a09-8851-f9fd1d0718d1
-       value: htm
+       value: .htm
        varType: file
        variable_selector:
        - '1752479895761'
@@ -338,7 +338,7 @@ workflow:
        - extension
      - comparison_operator: is
        id: 73a995a9-d8b9-4aef-89f7-306e2ddcbce2
-       value: docx
+       value: .docx
        varType: file
        variable_selector:
        - '1752479895761'
@@ -346,7 +346,7 @@ workflow:
        - extension
      - comparison_operator: is
        id: 8a2e8772-0426-458b-a1f9-9eaaec0f27c8
-       value: csv
+       value: .csv
        varType: file
        variable_selector:
        - '1752479895761'
@@ -354,7 +354,7 @@ workflow:
        - extension
      - comparison_operator: is
        id: aa2cb6b6-a2fc-462a-a9f5-c9c3f33a1602
-       value: txt
+       value: .txt
        varType: file
        variable_selector:
        - '1752479895761'
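All ten conditions in this template (and in the two identical template diffs below) gain a leading dot, which implies the file `extension` variable carries the dot, in the style of what `os.path.splitext` produces. A quick illustration of why the old exact-match (`is`) conditions could never fire:

```python
import os

# splitext-style extensions keep the leading dot, so an exact comparison
# against "xlsx" always fails.
name, ext = os.path.splitext("report.xlsx")
print(ext)             # ".xlsx"
print(ext == "xlsx")   # False -- the old condition value
print(ext == ".xlsx")  # True  -- the corrected value
```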


@@ -282,7 +282,7 @@ workflow:
      conditions:
      - comparison_operator: is
        id: 9da88d93-3ff6-463f-abfd-6bcafbf2554d
-       value: xlsx
+       value: .xlsx
        varType: file
        variable_selector:
        - '1752479895761'
@@ -290,7 +290,7 @@ workflow:
        - extension
      - comparison_operator: is
        id: d0e88f5e-dfe3-4bae-af0c-dbec267500de
-       value: xls
+       value: .xls
        varType: file
        variable_selector:
        - '1752479895761'
@@ -298,7 +298,7 @@ workflow:
        - extension
      - comparison_operator: is
        id: a957e91e-1ed7-4c6b-9c80-2f0948858f1d
-       value: md
+       value: .md
        varType: file
        variable_selector:
        - '1752479895761'
@@ -306,7 +306,7 @@ workflow:
        - extension
      - comparison_operator: is
        id: 870c3c39-8d3f-474a-ab8b-9c0ccf53db73
-       value: markdown
+       value: .markdown
        varType: file
        variable_selector:
        - '1752479895761'
@@ -314,7 +314,7 @@ workflow:
        - extension
      - comparison_operator: is
        id: f9541513-1e71-4dc1-9db5-35dc84a39e3c
-       value: mdx
+       value: .mdx
        varType: file
        variable_selector:
        - '1752479895761'
@@ -322,7 +322,7 @@ workflow:
        - extension
      - comparison_operator: is
        id: 4c7f455b-ac20-40ca-9495-6cc44ffcb35d
-       value: html
+       value: .html
        varType: file
        variable_selector:
        - '1752479895761'
@@ -330,7 +330,7 @@ workflow:
        - extension
      - comparison_operator: is
        id: 2e12d9c7-8057-4a09-8851-f9fd1d0718d1
-       value: htm
+       value: .htm
        varType: file
        variable_selector:
        - '1752479895761'
@@ -338,7 +338,7 @@ workflow:
        - extension
      - comparison_operator: is
        id: 73a995a9-d8b9-4aef-89f7-306e2ddcbce2
-       value: docx
+       value: .docx
        varType: file
        variable_selector:
        - '1752479895761'
@@ -346,7 +346,7 @@ workflow:
        - extension
      - comparison_operator: is
        id: 8a2e8772-0426-458b-a1f9-9eaaec0f27c8
-       value: csv
+       value: .csv
        varType: file
        variable_selector:
        - '1752479895761'
@@ -354,7 +354,7 @@ workflow:
        - extension
      - comparison_operator: is
        id: aa2cb6b6-a2fc-462a-a9f5-c9c3f33a1602
-       value: txt
+       value: .txt
        varType: file
        variable_selector:
        - '1752479895761'


@@ -281,7 +281,7 @@ workflow:
      conditions:
      - comparison_operator: is
        id: 9da88d93-3ff6-463f-abfd-6bcafbf2554d
-       value: xlsx
+       value: .xlsx
        varType: file
        variable_selector:
        - '1752479895761'
@@ -289,7 +289,7 @@ workflow:
        - extension
      - comparison_operator: is
        id: d0e88f5e-dfe3-4bae-af0c-dbec267500de
-       value: xls
+       value: .xls
        varType: file
        variable_selector:
        - '1752479895761'
@@ -297,7 +297,7 @@ workflow:
        - extension
      - comparison_operator: is
        id: a957e91e-1ed7-4c6b-9c80-2f0948858f1d
-       value: md
+       value: .md
        varType: file
        variable_selector:
        - '1752479895761'
@@ -305,7 +305,7 @@ workflow:
        - extension
      - comparison_operator: is
        id: 870c3c39-8d3f-474a-ab8b-9c0ccf53db73
-       value: markdown
+       value: .markdown
        varType: file
        variable_selector:
        - '1752479895761'
@@ -313,7 +313,7 @@ workflow:
        - extension
      - comparison_operator: is
        id: f9541513-1e71-4dc1-9db5-35dc84a39e3c
-       value: mdx
+       value: .mdx
        varType: file
        variable_selector:
        - '1752479895761'
@@ -321,7 +321,7 @@ workflow:
        - extension
      - comparison_operator: is
        id: 4c7f455b-ac20-40ca-9495-6cc44ffcb35d
-       value: html
+       value: .html
        varType: file
        variable_selector:
        - '1752479895761'
@@ -329,7 +329,7 @@ workflow:
        - extension
      - comparison_operator: is
        id: 2e12d9c7-8057-4a09-8851-f9fd1d0718d1
-       value: htm
+       value: .htm
        varType: file
        variable_selector:
        - '1752479895761'
@@ -337,7 +337,7 @@ workflow:
        - extension
      - comparison_operator: is
        id: 73a995a9-d8b9-4aef-89f7-306e2ddcbce2
-       value: docx
+       value: .docx
        varType: file
        variable_selector:
        - '1752479895761'
@@ -345,7 +345,7 @@ workflow:
        - extension
      - comparison_operator: is
        id: 8a2e8772-0426-458b-a1f9-9eaaec0f27c8
-       value: csv
+       value: .csv
        varType: file
        variable_selector:
        - '1752479895761'
@@ -353,7 +353,7 @@ workflow:
        - extension
      - comparison_operator: is
        id: aa2cb6b6-a2fc-462a-a9f5-c9c3f33a1602
-       value: txt
+       value: .txt
        varType: file
        variable_selector:
        - '1752479895761'


@@ -103,7 +103,7 @@ def run_single_rag_pipeline_task(rag_pipeline_invoke_entity: Mapping[str, Any],
         workflow_thread_pool_id = rag_pipeline_invoke_entity_model.workflow_thread_pool_id
         application_generate_entity = rag_pipeline_invoke_entity_model.application_generate_entity
 
-        with Session(db.engine) as session:
+        with Session(db.engine, expire_on_commit=False) as session:
             # Load required entities
             account = session.query(Account).filter(Account.id == user_id).first()
             if not account:
@@ -144,30 +144,30 @@ def run_single_rag_pipeline_task(rag_pipeline_invoke_entity: Mapping[str, Any],
                 triggered_from=WorkflowNodeExecutionTriggeredFrom.RAG_PIPELINE_RUN,
             )
 
-            # Set the user directly in g for preserve_flask_contexts
-            g._login_user = account
+        # Set the user directly in g for preserve_flask_contexts
+        g._login_user = account
 
-            # Copy context for passing to pipeline generator
-            context = contextvars.copy_context()
+        # Copy context for passing to pipeline generator
+        context = contextvars.copy_context()
 
-            # Direct execution without creating another thread
-            # Since we're already in a thread pool, no need for nested threading
-            from core.app.apps.pipeline.pipeline_generator import PipelineGenerator
+        # Direct execution without creating another thread
+        # Since we're already in a thread pool, no need for nested threading
+        from core.app.apps.pipeline.pipeline_generator import PipelineGenerator
 
-            pipeline_generator = PipelineGenerator()
-            pipeline_generator._generate(
-                flask_app=flask_app,
-                context=context,
-                pipeline=pipeline,
-                workflow_id=workflow_id,
-                user=account,
-                application_generate_entity=entity,
-                invoke_from=InvokeFrom.PUBLISHED,
-                workflow_execution_repository=workflow_execution_repository,
-                workflow_node_execution_repository=workflow_node_execution_repository,
-                streaming=streaming,
-                workflow_thread_pool_id=workflow_thread_pool_id,
-            )
+        pipeline_generator = PipelineGenerator()
+        pipeline_generator._generate(
+            flask_app=flask_app,
+            context=context,
+            pipeline=pipeline,
+            workflow_id=workflow_id,
+            user=account,
+            application_generate_entity=entity,
+            invoke_from=InvokeFrom.PUBLISHED,
+            workflow_execution_repository=workflow_execution_repository,
+            workflow_node_execution_repository=workflow_node_execution_repository,
+            streaming=streaming,
+            workflow_thread_pool_id=workflow_thread_pool_id,
+        )
     except Exception:
         logging.exception("Error in priority pipeline task")
         raise


@@ -29,7 +29,6 @@ def retry_document_indexing_task(dataset_id: str, document_ids: list[str], user_
     Usage: retry_document_indexing_task.delay(dataset_id, document_ids, user_id)
     """
     start_at = time.perf_counter()
-    print("sadaddadadaaaadadadadsdsadasdadasdasda")
     try:
         dataset = db.session.query(Dataset).where(Dataset.id == dataset_id).first()
         if not dataset: