fix: summary index bug (#31810)

Co-authored-by: autofix-ci[bot] <114827586+autofix-ci[bot]@users.noreply.github.com>
Co-authored-by: Jyong <76649700+JohnJyong@users.noreply.github.com>
Co-authored-by: zxhlyh <jasonapring2015@outlook.com>
Co-authored-by: Yansong Zhang <916125788@qq.com>
Co-authored-by: hj24 <mambahj24@gmail.com>
Co-authored-by: CodingOnStar <hanxujiang@dify.ai>
Co-authored-by: CodingOnStar <hanxujiang@dify.com>
Co-authored-by: gemini-code-assist[bot] <176961590+gemini-code-assist[bot]@users.noreply.github.com>
FFXN 2026-02-02 09:45:17 +08:00 committed by GitHub
parent 4f826b4641
commit 41177757e6
11 changed files with 137 additions and 9 deletions

View File

@@ -1339,6 +1339,18 @@ class DocumentGenerateSummaryApi(Resource):
  missing_ids = set(document_list) - found_ids
  raise NotFound(f"Some documents not found: {list(missing_ids)}")
+ # Update need_summary to True for documents that don't have it set
+ # This handles the case where documents were created when summary_index_setting was disabled
+ documents_to_update = [doc for doc in documents if not doc.need_summary and doc.doc_form != "qa_model"]
+ if documents_to_update:
+     document_ids_to_update = [str(doc.id) for doc in documents_to_update]
+     DocumentService.update_documents_need_summary(
+         dataset_id=dataset_id,
+         document_ids=document_ids_to_update,
+         need_summary=True,
+     )
  # Dispatch async tasks for each document
  for document in documents:
      # Skip qa_model documents as they don't generate summaries
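
For context, the selection rule applied by the new block can be sketched in isolation; Doc below is a simplified stand-in for the ORM Document model, not the real class:

    from dataclasses import dataclass

    @dataclass
    class Doc:
        id: str
        need_summary: bool
        doc_form: str

    # Documents created while summary_index_setting was disabled still carry
    # need_summary=False; only those (and never qa_model documents) are backfilled.
    docs = [
        Doc("d1", need_summary=False, doc_form="text_model"),
        Doc("d2", need_summary=True, doc_form="text_model"),
        Doc("d3", need_summary=False, doc_form="qa_model"),
    ]
    to_update = [d.id for d in docs if not d.need_summary and d.doc_form != "qa_model"]
    assert to_update == ["d1"]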

View File

@@ -369,7 +369,9 @@ class IndexingRunner:
  # Generate summary preview
  summary_index_setting = tmp_processing_rule.get("summary_index_setting")
  if summary_index_setting and summary_index_setting.get("enable") and preview_texts:
-     preview_texts = index_processor.generate_summary_preview(tenant_id, preview_texts, summary_index_setting)
+     preview_texts = index_processor.generate_summary_preview(
+         tenant_id, preview_texts, summary_index_setting, doc_language
+     )
  return IndexingEstimate(total_segments=total_segments, preview=preview_texts)

View File

@@ -441,11 +441,13 @@ DEFAULT_GENERATOR_SUMMARY_PROMPT = (
  Requirements:
  1. Write a concise summary in plain text
- 2. Use the same language as the input content
+ 2. You must write in {language}. No language other than {language} should be used.
  3. Focus on important facts, concepts, and details
  4. If images are included, describe their key information
  5. Do not use words like "好的", "ok", "I understand", "This text discusses", "The content mentions"
  6. Write directly without extra words
  7. If there is not enough content to generate a meaningful summary,
     return an empty string without any explanation or prompt
  Output only the summary text. Start summarizing now:
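
The {language} placeholder is only filled in when the default prompt is used (see the generate_summary change further down). A minimal sketch of that substitution, with a shortened stand-in for the real prompt constant:

    # Shortened stand-in for DEFAULT_GENERATOR_SUMMARY_PROMPT; only the {language}
    # placeholder matters here.
    prompt_template = (
        "Write a concise summary.\n"
        "You must write in {language}. No language other than {language} should be used.\n"
    )

    # Fall back to the generic wording when the document language is unknown.
    doc_language = None
    language = doc_language or "the same language as the input content"
    prompt = prompt_template.format(language=language)
    assert "the same language as the input content" in prompt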

View File

@@ -48,12 +48,22 @@ class BaseIndexProcessor(ABC):
  @abstractmethod
  def generate_summary_preview(
-     self, tenant_id: str, preview_texts: list[PreviewDetail], summary_index_setting: dict
+     self,
+     tenant_id: str,
+     preview_texts: list[PreviewDetail],
+     summary_index_setting: dict,
+     doc_language: str | None = None,
  ) -> list[PreviewDetail]:
      """
      For each segment in preview_texts, generate a summary using LLM and attach it to the segment.
      The summary can be stored in a new attribute, e.g., summary.
      This method should be implemented by subclasses.
+     Args:
+         tenant_id: Tenant ID
+         preview_texts: List of preview details to generate summaries for
+         summary_index_setting: Summary index configuration
+         doc_language: Optional document language to ensure summary is generated in the correct language
      """
      raise NotImplementedError
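
Subclasses keep working as long as they accept the extra keyword. A minimal sketch of a conforming implementation, with PreviewDetail reduced to a plain dataclass stand-in:

    from abc import ABC, abstractmethod
    from dataclasses import dataclass

    @dataclass
    class PreviewDetail:  # simplified stand-in for the real model
        content: str
        summary: str | None = None

    class BaseIndexProcessor(ABC):
        @abstractmethod
        def generate_summary_preview(
            self,
            tenant_id: str,
            preview_texts: list[PreviewDetail],
            summary_index_setting: dict,
            doc_language: str | None = None,
        ) -> list[PreviewDetail]:
            raise NotImplementedError

    class NoOpProcessor(BaseIndexProcessor):
        # Mirrors the QA processor's behaviour: return the previews untouched.
        def generate_summary_preview(self, tenant_id, preview_texts, summary_index_setting, doc_language=None):
            return preview_texts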

View File

@@ -275,7 +275,11 @@ class ParagraphIndexProcessor(BaseIndexProcessor):
  raise ValueError("Chunks is not a list")
  def generate_summary_preview(
-     self, tenant_id: str, preview_texts: list[PreviewDetail], summary_index_setting: dict
+     self,
+     tenant_id: str,
+     preview_texts: list[PreviewDetail],
+     summary_index_setting: dict,
+     doc_language: str | None = None,
  ) -> list[PreviewDetail]:
      """
      For each segment, concurrently call generate_summary to generate a summary
@@ -298,11 +302,15 @@ class ParagraphIndexProcessor(BaseIndexProcessor):
  if flask_app:
      # Ensure Flask app context in worker thread
      with flask_app.app_context():
-         summary, _ = self.generate_summary(tenant_id, preview.content, summary_index_setting)
+         summary, _ = self.generate_summary(
+             tenant_id, preview.content, summary_index_setting, document_language=doc_language
+         )
          preview.summary = summary
  else:
      # Fallback: try without app context (may fail)
-     summary, _ = self.generate_summary(tenant_id, preview.content, summary_index_setting)
+     summary, _ = self.generate_summary(
+         tenant_id, preview.content, summary_index_setting, document_language=doc_language
+     )
      preview.summary = summary
  # Generate summaries concurrently using ThreadPoolExecutor
@@ -356,6 +364,7 @@ class ParagraphIndexProcessor(BaseIndexProcessor):
  text: str,
  summary_index_setting: dict | None = None,
  segment_id: str | None = None,
+ document_language: str | None = None,
  ) -> tuple[str, LLMUsage]:
  """
  Generate summary for the given text using ModelInstance.invoke_llm and the default or custom summary prompt,
@@ -366,6 +375,8 @@ class ParagraphIndexProcessor(BaseIndexProcessor):
  text: Text content to summarize
  summary_index_setting: Summary index configuration
  segment_id: Optional segment ID to fetch attachments from SegmentAttachmentBinding table
+ document_language: Optional document language (e.g., "Chinese", "English")
+     to ensure summary is generated in the correct language
  Returns:
      Tuple of (summary_content, llm_usage) where llm_usage is LLMUsage object
@@ -381,8 +392,22 @@ class ParagraphIndexProcessor(BaseIndexProcessor):
  raise ValueError("model_name and model_provider_name are required in summary_index_setting")
  # Import default summary prompt
+ is_default_prompt = False
  if not summary_prompt:
      summary_prompt = DEFAULT_GENERATOR_SUMMARY_PROMPT
+     is_default_prompt = True
+ # Format prompt with document language only for default prompt
+ # Custom prompts are used as-is to avoid interfering with user-defined templates
+ # If document_language is provided, use it; otherwise, use "the same language as the input content"
+ # This is especially important for image-only chunks where text is empty or minimal
+ if is_default_prompt:
+     language_for_prompt = document_language or "the same language as the input content"
+     try:
+         summary_prompt = summary_prompt.format(language=language_for_prompt)
+     except KeyError:
+         # If default prompt doesn't have {language} placeholder, use it as-is
+         pass
  provider_manager = ProviderManager()
  provider_model_bundle = provider_manager.get_provider_model_bundle(
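
The default-versus-custom distinction matters because str.format would choke on any braces a user put into a custom template. A self-contained sketch of the same guard (the constants here are illustrative, not the real ones):

    DEFAULT_PROMPT = "Summarize in {language}."
    custom_prompt = None  # or a user-supplied template that must not be reformatted

    summary_prompt = custom_prompt
    is_default_prompt = False
    if not summary_prompt:
        summary_prompt = DEFAULT_PROMPT
        is_default_prompt = True

    if is_default_prompt:
        language = "French"  # would come from document_language, when known
        try:
            summary_prompt = summary_prompt.format(language=language)
        except KeyError:
            pass  # a default prompt without a {language} placeholder is used as-is

    assert summary_prompt == "Summarize in French."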

View File

@@ -358,7 +358,11 @@ class ParentChildIndexProcessor(BaseIndexProcessor):
  }
  def generate_summary_preview(
-     self, tenant_id: str, preview_texts: list[PreviewDetail], summary_index_setting: dict
+     self,
+     tenant_id: str,
+     preview_texts: list[PreviewDetail],
+     summary_index_setting: dict,
+     doc_language: str | None = None,
  ) -> list[PreviewDetail]:
      """
      For each parent chunk in preview_texts, concurrently call generate_summary to generate a summary
@@ -389,6 +393,7 @@ class ParentChildIndexProcessor(BaseIndexProcessor):
  tenant_id=tenant_id,
  text=preview.content,
  summary_index_setting=summary_index_setting,
+ document_language=doc_language,
  )
  preview.summary = summary
  else:
@@ -397,6 +402,7 @@ class ParentChildIndexProcessor(BaseIndexProcessor):
  tenant_id=tenant_id,
  text=preview.content,
  summary_index_setting=summary_index_setting,
+ document_language=doc_language,
  )
  preview.summary = summary

View File

@@ -241,7 +241,11 @@ class QAIndexProcessor(BaseIndexProcessor):
  }
  def generate_summary_preview(
-     self, tenant_id: str, preview_texts: list[PreviewDetail], summary_index_setting: dict
+     self,
+     tenant_id: str,
+     preview_texts: list[PreviewDetail],
+     summary_index_setting: dict,
+     doc_language: str | None = None,
  ) -> list[PreviewDetail]:
      """
      QA model doesn't generate summaries, so this method returns preview_texts unchanged.

View File

@@ -78,12 +78,21 @@ class KnowledgeIndexNode(Node[KnowledgeIndexNodeData]):
  indexing_technique = node_data.indexing_technique or dataset.indexing_technique
  summary_index_setting = node_data.summary_index_setting or dataset.summary_index_setting
+ # Try to get document language if document_id is available
+ doc_language = None
+ document_id = variable_pool.get(["sys", SystemVariableKey.DOCUMENT_ID])
+ if document_id:
+     document = db.session.query(Document).filter_by(id=document_id.value).first()
+     if document and document.doc_language:
+         doc_language = document.doc_language
  outputs = self._get_preview_output_with_summaries(
      node_data.chunk_structure,
      chunks,
      dataset=dataset,
      indexing_technique=indexing_technique,
      summary_index_setting=summary_index_setting,
+     doc_language=doc_language,
  )
  return NodeRunResult(
      status=WorkflowNodeExecutionStatus.SUCCEEDED,
@@ -315,6 +324,7 @@ class KnowledgeIndexNode(Node[KnowledgeIndexNodeData]):
  dataset: Dataset,
  indexing_technique: str | None = None,
  summary_index_setting: dict | None = None,
+ doc_language: str | None = None,
  ) -> Mapping[str, Any]:
  """
  Generate preview output with summaries for chunks in preview mode.
@@ -326,6 +336,7 @@ class KnowledgeIndexNode(Node[KnowledgeIndexNodeData]):
  dataset: Dataset object (for tenant_id)
  indexing_technique: Indexing technique from node config or dataset
  summary_index_setting: Summary index setting from node config or dataset
+ doc_language: Optional document language to ensure summary is generated in the correct language
  """
  index_processor = IndexProcessorFactory(chunk_structure).init_index_processor()
  preview_output = index_processor.format_preview(chunks)
@@ -365,6 +376,7 @@ class KnowledgeIndexNode(Node[KnowledgeIndexNodeData]):
  tenant_id=dataset.tenant_id,
  text=preview_item["content"],
  summary_index_setting=summary_index_setting,
+ document_language=doc_language,
  )
  if summary:
      preview_item["summary"] = summary
@@ -374,6 +386,7 @@ class KnowledgeIndexNode(Node[KnowledgeIndexNodeData]):
  tenant_id=dataset.tenant_id,
  text=preview_item["content"],
  summary_index_setting=summary_index_setting,
+ document_language=doc_language,
  )
  if summary:
      preview_item["summary"] = summary

View File

@@ -16,6 +16,7 @@ from sqlalchemy.orm import Session
  from werkzeug.exceptions import Forbidden, NotFound
  from configs import dify_config
+ from core.db.session_factory import session_factory
  from core.errors.error import LLMBadRequestError, ProviderTokenNotInitError
  from core.file import helpers as file_helpers
  from core.helper.name_generator import generate_incremental_name
@@ -1388,6 +1389,46 @@ class DocumentService:
  ).all()
  return documents
+ @staticmethod
+ def update_documents_need_summary(dataset_id: str, document_ids: Sequence[str], need_summary: bool = True) -> int:
+     """
+     Update need_summary field for multiple documents.
+     This method handles the case where documents were created when summary_index_setting was disabled,
+     and need to be updated when summary_index_setting is later enabled.
+     Args:
+         dataset_id: Dataset ID
+         document_ids: List of document IDs to update
+         need_summary: Value to set for need_summary field (default: True)
+     Returns:
+         Number of documents updated
+     """
+     if not document_ids:
+         return 0
+     document_id_list: list[str] = [str(document_id) for document_id in document_ids]
+     with session_factory.create_session() as session:
+         updated_count = (
+             session.query(Document)
+             .filter(
+                 Document.id.in_(document_id_list),
+                 Document.dataset_id == dataset_id,
+                 Document.doc_form != "qa_model",  # Skip qa_model documents
+             )
+             .update({Document.need_summary: need_summary}, synchronize_session=False)
+         )
+         session.commit()
+     logger.info(
+         "Updated need_summary to %s for %d documents in dataset %s",
+         need_summary,
+         updated_count,
+         dataset_id,
+     )
+     return updated_count
  @staticmethod
  def get_document_download_url(document: Document) -> str:
      """

View File

@@ -174,6 +174,10 @@ class RagPipelineTransformService:
  else:
      dataset.retrieval_model = knowledge_configuration.retrieval_model.model_dump()
+ # Copy summary_index_setting from dataset to knowledge_index node configuration
+ if dataset.summary_index_setting:
+     knowledge_configuration.summary_index_setting = dataset.summary_index_setting
  knowledge_configuration_dict.update(knowledge_configuration.model_dump())
  node["data"] = knowledge_configuration_dict
  return node

View File

@@ -49,11 +49,18 @@ class SummaryIndexService:
  # Use lazy import to avoid circular import
  from core.rag.index_processor.processor.paragraph_index_processor import ParagraphIndexProcessor
+ # Get document language to ensure summary is generated in the correct language
+ # This is especially important for image-only chunks where text is empty or minimal
+ document_language = None
+ if segment.document and segment.document.doc_language:
+     document_language = segment.document.doc_language
  summary_content, usage = ParagraphIndexProcessor.generate_summary(
      tenant_id=dataset.tenant_id,
      text=segment.content,
      summary_index_setting=summary_index_setting,
      segment_id=segment.id,
+     document_language=document_language,
  )
  if not summary_content:
@@ -558,6 +565,9 @@ class SummaryIndexService:
  )
  session.add(summary_record)
+ # Commit the batch created records
+ session.commit()
  @staticmethod
  def update_summary_record_error(
      segment: DocumentSegment,
@@ -762,7 +772,6 @@ class SummaryIndexService:
  dataset=dataset,
  status="not_started",
  )
- session.commit()  # Commit initial records
  summary_records = []
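
The last two hunks move the commit out of the per-record path: records are accumulated with session.add(...) inside the loop and written in one session.commit() afterwards. In outline, and with names that are illustrative rather than the service's actual ones:

    # Sketch of the batch-commit pattern the change adopts.
    with session_factory.create_session() as session:
        for segment in segments:
            summary_record = build_summary_record(segment, status="not_started")  # hypothetical helper
            session.add(summary_record)
        # Commit the batch created records once, instead of once per record.
        session.commit()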