diff --git a/api/core/rag/index_processor/processor/parent_child_index_processor.py b/api/core/rag/index_processor/processor/parent_child_index_processor.py index 7e33ef9c02..ccb1c55b72 100644 --- a/api/core/rag/index_processor/processor/parent_child_index_processor.py +++ b/api/core/rag/index_processor/processor/parent_child_index_processor.py @@ -1,11 +1,13 @@ """Paragraph index processor.""" import json +import logging import uuid from collections.abc import Mapping from typing import Any from configs import dify_config +from core.entities.knowledge_entities import PreviewDetail from core.model_manager import ModelInstance from core.rag.cleaner.clean_processor import CleanProcessor from core.rag.datasource.retrieval_service import RetrievalService @@ -27,6 +29,8 @@ from services.account_service import AccountService from services.entities.knowledge_entities.knowledge_entities import ParentMode, Rule from services.summary_index_service import SummaryIndexService +logger = logging.getLogger(__name__) + class ParentChildIndexProcessor(BaseIndexProcessor): def extract(self, extract_setting: ExtractSetting, **kwargs) -> list[Document]: @@ -350,3 +354,57 @@ class ParentChildIndexProcessor(BaseIndexProcessor): "preview": preview, "total_segments": len(parent_childs.parent_child_chunks), } + + def generate_summary_preview( + self, tenant_id: str, preview_texts: list[PreviewDetail], summary_index_setting: dict + ) -> list[PreviewDetail]: + """ + For each parent chunk in preview_texts, concurrently call generate_summary to generate a summary + and write it to the summary attribute of PreviewDetail. + + Note: For parent-child structure, we only generate summaries for parent chunks. + """ + import concurrent.futures + + from flask import current_app + + # Capture Flask app context for worker threads + flask_app = None + try: + flask_app = current_app._get_current_object() # type: ignore + except RuntimeError: + logger.warning("No Flask application context available, summary generation may fail") + + def process(preview: PreviewDetail) -> None: + """Generate summary for a single preview item (parent chunk).""" + try: + if flask_app: + # Ensure Flask app context in worker thread + with flask_app.app_context(): + # Use ParagraphIndexProcessor's generate_summary method + from core.rag.index_processor.processor.paragraph_index_processor import ParagraphIndexProcessor + summary = ParagraphIndexProcessor.generate_summary( + tenant_id=tenant_id, + text=preview.content, + summary_index_setting=summary_index_setting, + ) + if summary: + preview.summary = summary + else: + # Fallback: try without app context (may fail) + from core.rag.index_processor.processor.paragraph_index_processor import ParagraphIndexProcessor + summary = ParagraphIndexProcessor.generate_summary( + tenant_id=tenant_id, + text=preview.content, + summary_index_setting=summary_index_setting, + ) + if summary: + preview.summary = summary + except Exception: + logger.exception("Failed to generate summary for preview") + # Don't fail the entire preview if summary generation fails + preview.summary = None + + with concurrent.futures.ThreadPoolExecutor() as executor: + list(executor.map(process, preview_texts)) + return preview_texts diff --git a/api/core/rag/index_processor/processor/qa_index_processor.py b/api/core/rag/index_processor/processor/qa_index_processor.py index b38af0cacb..1875e8b848 100644 --- a/api/core/rag/index_processor/processor/qa_index_processor.py +++ b/api/core/rag/index_processor/processor/qa_index_processor.py @@ -237,6 +237,17 @@ class QAIndexProcessor(BaseIndexProcessor): "total_segments": len(qa_chunks.qa_chunks), } + def generate_summary_preview( + self, tenant_id: str, preview_texts: list[PreviewDetail], summary_index_setting: dict + ) -> list[PreviewDetail]: + """ + QA model doesn't generate summaries, so this method returns preview_texts unchanged. + + Note: QA model uses question-answer pairs, which don't require summary generation. + """ + # QA model doesn't generate summaries, return as-is + return preview_texts + def _format_qa_document(self, flask_app: Flask, tenant_id: str, document_node, all_qa_documents, document_language): format_documents = [] if document_node.page_content is None or not document_node.page_content.strip():