mirror of https://github.com/langgenius/dify.git
fix: summary index bug (#31810)
Co-authored-by: autofix-ci[bot] <114827586+autofix-ci[bot]@users.noreply.github.com> Co-authored-by: Jyong <76649700+JohnJyong@users.noreply.github.com> Co-authored-by: zxhlyh <jasonapring2015@outlook.com> Co-authored-by: Yansong Zhang <916125788@qq.com> Co-authored-by: hj24 <mambahj24@gmail.com> Co-authored-by: CodingOnStar <hanxujiang@dify.ai> Co-authored-by: CodingOnStar <hanxujiang@dify.com> Co-authored-by: gemini-code-assist[bot] <176961590+gemini-code-assist[bot]@users.noreply.github.com>
This commit is contained in:
parent
4f826b4641
commit
41177757e6
|
|
@ -1339,6 +1339,18 @@ class DocumentGenerateSummaryApi(Resource):
|
|||
missing_ids = set(document_list) - found_ids
|
||||
raise NotFound(f"Some documents not found: {list(missing_ids)}")
|
||||
|
||||
# Update need_summary to True for documents that don't have it set
|
||||
# This handles the case where documents were created when summary_index_setting was disabled
|
||||
documents_to_update = [doc for doc in documents if not doc.need_summary and doc.doc_form != "qa_model"]
|
||||
|
||||
if documents_to_update:
|
||||
document_ids_to_update = [str(doc.id) for doc in documents_to_update]
|
||||
DocumentService.update_documents_need_summary(
|
||||
dataset_id=dataset_id,
|
||||
document_ids=document_ids_to_update,
|
||||
need_summary=True,
|
||||
)
|
||||
|
||||
# Dispatch async tasks for each document
|
||||
for document in documents:
|
||||
# Skip qa_model documents as they don't generate summaries
|
||||
|
|
|
|||
|
|
@ -369,7 +369,9 @@ class IndexingRunner:
|
|||
# Generate summary preview
|
||||
summary_index_setting = tmp_processing_rule.get("summary_index_setting")
|
||||
if summary_index_setting and summary_index_setting.get("enable") and preview_texts:
|
||||
preview_texts = index_processor.generate_summary_preview(tenant_id, preview_texts, summary_index_setting)
|
||||
preview_texts = index_processor.generate_summary_preview(
|
||||
tenant_id, preview_texts, summary_index_setting, doc_language
|
||||
)
|
||||
|
||||
return IndexingEstimate(total_segments=total_segments, preview=preview_texts)
|
||||
|
||||
|
|
|
|||
|
|
@ -441,11 +441,13 @@ DEFAULT_GENERATOR_SUMMARY_PROMPT = (
|
|||
|
||||
Requirements:
|
||||
1. Write a concise summary in plain text
|
||||
2. Use the same language as the input content
|
||||
2. You must write in {language}. No language other than {language} should be used.
|
||||
3. Focus on important facts, concepts, and details
|
||||
4. If images are included, describe their key information
|
||||
5. Do not use words like "好的", "ok", "I understand", "This text discusses", "The content mentions"
|
||||
6. Write directly without extra words
|
||||
7. If there is not enough content to generate a meaningful summary,
|
||||
return an empty string without any explanation or prompt
|
||||
|
||||
Output only the summary text. Start summarizing now:
|
||||
|
||||
|
|
|
|||
|
|
@ -48,12 +48,22 @@ class BaseIndexProcessor(ABC):
|
|||
|
||||
@abstractmethod
|
||||
def generate_summary_preview(
|
||||
self, tenant_id: str, preview_texts: list[PreviewDetail], summary_index_setting: dict
|
||||
self,
|
||||
tenant_id: str,
|
||||
preview_texts: list[PreviewDetail],
|
||||
summary_index_setting: dict,
|
||||
doc_language: str | None = None,
|
||||
) -> list[PreviewDetail]:
|
||||
"""
|
||||
For each segment in preview_texts, generate a summary using LLM and attach it to the segment.
|
||||
The summary can be stored in a new attribute, e.g., summary.
|
||||
This method should be implemented by subclasses.
|
||||
|
||||
Args:
|
||||
tenant_id: Tenant ID
|
||||
preview_texts: List of preview details to generate summaries for
|
||||
summary_index_setting: Summary index configuration
|
||||
doc_language: Optional document language to ensure summary is generated in the correct language
|
||||
"""
|
||||
raise NotImplementedError
|
||||
|
||||
|
|
|
|||
|
|
@ -275,7 +275,11 @@ class ParagraphIndexProcessor(BaseIndexProcessor):
|
|||
raise ValueError("Chunks is not a list")
|
||||
|
||||
def generate_summary_preview(
|
||||
self, tenant_id: str, preview_texts: list[PreviewDetail], summary_index_setting: dict
|
||||
self,
|
||||
tenant_id: str,
|
||||
preview_texts: list[PreviewDetail],
|
||||
summary_index_setting: dict,
|
||||
doc_language: str | None = None,
|
||||
) -> list[PreviewDetail]:
|
||||
"""
|
||||
For each segment, concurrently call generate_summary to generate a summary
|
||||
|
|
@ -298,11 +302,15 @@ class ParagraphIndexProcessor(BaseIndexProcessor):
|
|||
if flask_app:
|
||||
# Ensure Flask app context in worker thread
|
||||
with flask_app.app_context():
|
||||
summary, _ = self.generate_summary(tenant_id, preview.content, summary_index_setting)
|
||||
summary, _ = self.generate_summary(
|
||||
tenant_id, preview.content, summary_index_setting, document_language=doc_language
|
||||
)
|
||||
preview.summary = summary
|
||||
else:
|
||||
# Fallback: try without app context (may fail)
|
||||
summary, _ = self.generate_summary(tenant_id, preview.content, summary_index_setting)
|
||||
summary, _ = self.generate_summary(
|
||||
tenant_id, preview.content, summary_index_setting, document_language=doc_language
|
||||
)
|
||||
preview.summary = summary
|
||||
|
||||
# Generate summaries concurrently using ThreadPoolExecutor
|
||||
|
|
@ -356,6 +364,7 @@ class ParagraphIndexProcessor(BaseIndexProcessor):
|
|||
text: str,
|
||||
summary_index_setting: dict | None = None,
|
||||
segment_id: str | None = None,
|
||||
document_language: str | None = None,
|
||||
) -> tuple[str, LLMUsage]:
|
||||
"""
|
||||
Generate summary for the given text using ModelInstance.invoke_llm and the default or custom summary prompt,
|
||||
|
|
@ -366,6 +375,8 @@ class ParagraphIndexProcessor(BaseIndexProcessor):
|
|||
text: Text content to summarize
|
||||
summary_index_setting: Summary index configuration
|
||||
segment_id: Optional segment ID to fetch attachments from SegmentAttachmentBinding table
|
||||
document_language: Optional document language (e.g., "Chinese", "English")
|
||||
to ensure summary is generated in the correct language
|
||||
|
||||
Returns:
|
||||
Tuple of (summary_content, llm_usage) where llm_usage is LLMUsage object
|
||||
|
|
@ -381,8 +392,22 @@ class ParagraphIndexProcessor(BaseIndexProcessor):
|
|||
raise ValueError("model_name and model_provider_name are required in summary_index_setting")
|
||||
|
||||
# Import default summary prompt
|
||||
is_default_prompt = False
|
||||
if not summary_prompt:
|
||||
summary_prompt = DEFAULT_GENERATOR_SUMMARY_PROMPT
|
||||
is_default_prompt = True
|
||||
|
||||
# Format prompt with document language only for default prompt
|
||||
# Custom prompts are used as-is to avoid interfering with user-defined templates
|
||||
# If document_language is provided, use it; otherwise, use "the same language as the input content"
|
||||
# This is especially important for image-only chunks where text is empty or minimal
|
||||
if is_default_prompt:
|
||||
language_for_prompt = document_language or "the same language as the input content"
|
||||
try:
|
||||
summary_prompt = summary_prompt.format(language=language_for_prompt)
|
||||
except KeyError:
|
||||
# If default prompt doesn't have {language} placeholder, use it as-is
|
||||
pass
|
||||
|
||||
provider_manager = ProviderManager()
|
||||
provider_model_bundle = provider_manager.get_provider_model_bundle(
|
||||
|
|
|
|||
|
|
@ -358,7 +358,11 @@ class ParentChildIndexProcessor(BaseIndexProcessor):
|
|||
}
|
||||
|
||||
def generate_summary_preview(
|
||||
self, tenant_id: str, preview_texts: list[PreviewDetail], summary_index_setting: dict
|
||||
self,
|
||||
tenant_id: str,
|
||||
preview_texts: list[PreviewDetail],
|
||||
summary_index_setting: dict,
|
||||
doc_language: str | None = None,
|
||||
) -> list[PreviewDetail]:
|
||||
"""
|
||||
For each parent chunk in preview_texts, concurrently call generate_summary to generate a summary
|
||||
|
|
@ -389,6 +393,7 @@ class ParentChildIndexProcessor(BaseIndexProcessor):
|
|||
tenant_id=tenant_id,
|
||||
text=preview.content,
|
||||
summary_index_setting=summary_index_setting,
|
||||
document_language=doc_language,
|
||||
)
|
||||
preview.summary = summary
|
||||
else:
|
||||
|
|
@ -397,6 +402,7 @@ class ParentChildIndexProcessor(BaseIndexProcessor):
|
|||
tenant_id=tenant_id,
|
||||
text=preview.content,
|
||||
summary_index_setting=summary_index_setting,
|
||||
document_language=doc_language,
|
||||
)
|
||||
preview.summary = summary
|
||||
|
||||
|
|
|
|||
|
|
@ -241,7 +241,11 @@ class QAIndexProcessor(BaseIndexProcessor):
|
|||
}
|
||||
|
||||
def generate_summary_preview(
|
||||
self, tenant_id: str, preview_texts: list[PreviewDetail], summary_index_setting: dict
|
||||
self,
|
||||
tenant_id: str,
|
||||
preview_texts: list[PreviewDetail],
|
||||
summary_index_setting: dict,
|
||||
doc_language: str | None = None,
|
||||
) -> list[PreviewDetail]:
|
||||
"""
|
||||
QA model doesn't generate summaries, so this method returns preview_texts unchanged.
|
||||
|
|
|
|||
|
|
@ -78,12 +78,21 @@ class KnowledgeIndexNode(Node[KnowledgeIndexNodeData]):
|
|||
indexing_technique = node_data.indexing_technique or dataset.indexing_technique
|
||||
summary_index_setting = node_data.summary_index_setting or dataset.summary_index_setting
|
||||
|
||||
# Try to get document language if document_id is available
|
||||
doc_language = None
|
||||
document_id = variable_pool.get(["sys", SystemVariableKey.DOCUMENT_ID])
|
||||
if document_id:
|
||||
document = db.session.query(Document).filter_by(id=document_id.value).first()
|
||||
if document and document.doc_language:
|
||||
doc_language = document.doc_language
|
||||
|
||||
outputs = self._get_preview_output_with_summaries(
|
||||
node_data.chunk_structure,
|
||||
chunks,
|
||||
dataset=dataset,
|
||||
indexing_technique=indexing_technique,
|
||||
summary_index_setting=summary_index_setting,
|
||||
doc_language=doc_language,
|
||||
)
|
||||
return NodeRunResult(
|
||||
status=WorkflowNodeExecutionStatus.SUCCEEDED,
|
||||
|
|
@ -315,6 +324,7 @@ class KnowledgeIndexNode(Node[KnowledgeIndexNodeData]):
|
|||
dataset: Dataset,
|
||||
indexing_technique: str | None = None,
|
||||
summary_index_setting: dict | None = None,
|
||||
doc_language: str | None = None,
|
||||
) -> Mapping[str, Any]:
|
||||
"""
|
||||
Generate preview output with summaries for chunks in preview mode.
|
||||
|
|
@ -326,6 +336,7 @@ class KnowledgeIndexNode(Node[KnowledgeIndexNodeData]):
|
|||
dataset: Dataset object (for tenant_id)
|
||||
indexing_technique: Indexing technique from node config or dataset
|
||||
summary_index_setting: Summary index setting from node config or dataset
|
||||
doc_language: Optional document language to ensure summary is generated in the correct language
|
||||
"""
|
||||
index_processor = IndexProcessorFactory(chunk_structure).init_index_processor()
|
||||
preview_output = index_processor.format_preview(chunks)
|
||||
|
|
@ -365,6 +376,7 @@ class KnowledgeIndexNode(Node[KnowledgeIndexNodeData]):
|
|||
tenant_id=dataset.tenant_id,
|
||||
text=preview_item["content"],
|
||||
summary_index_setting=summary_index_setting,
|
||||
document_language=doc_language,
|
||||
)
|
||||
if summary:
|
||||
preview_item["summary"] = summary
|
||||
|
|
@ -374,6 +386,7 @@ class KnowledgeIndexNode(Node[KnowledgeIndexNodeData]):
|
|||
tenant_id=dataset.tenant_id,
|
||||
text=preview_item["content"],
|
||||
summary_index_setting=summary_index_setting,
|
||||
document_language=doc_language,
|
||||
)
|
||||
if summary:
|
||||
preview_item["summary"] = summary
|
||||
|
|
|
|||
|
|
@ -16,6 +16,7 @@ from sqlalchemy.orm import Session
|
|||
from werkzeug.exceptions import Forbidden, NotFound
|
||||
|
||||
from configs import dify_config
|
||||
from core.db.session_factory import session_factory
|
||||
from core.errors.error import LLMBadRequestError, ProviderTokenNotInitError
|
||||
from core.file import helpers as file_helpers
|
||||
from core.helper.name_generator import generate_incremental_name
|
||||
|
|
@ -1388,6 +1389,46 @@ class DocumentService:
|
|||
).all()
|
||||
return documents
|
||||
|
||||
@staticmethod
|
||||
def update_documents_need_summary(dataset_id: str, document_ids: Sequence[str], need_summary: bool = True) -> int:
|
||||
"""
|
||||
Update need_summary field for multiple documents.
|
||||
|
||||
This method handles the case where documents were created when summary_index_setting was disabled,
|
||||
and need to be updated when summary_index_setting is later enabled.
|
||||
|
||||
Args:
|
||||
dataset_id: Dataset ID
|
||||
document_ids: List of document IDs to update
|
||||
need_summary: Value to set for need_summary field (default: True)
|
||||
|
||||
Returns:
|
||||
Number of documents updated
|
||||
"""
|
||||
if not document_ids:
|
||||
return 0
|
||||
|
||||
document_id_list: list[str] = [str(document_id) for document_id in document_ids]
|
||||
|
||||
with session_factory.create_session() as session:
|
||||
updated_count = (
|
||||
session.query(Document)
|
||||
.filter(
|
||||
Document.id.in_(document_id_list),
|
||||
Document.dataset_id == dataset_id,
|
||||
Document.doc_form != "qa_model", # Skip qa_model documents
|
||||
)
|
||||
.update({Document.need_summary: need_summary}, synchronize_session=False)
|
||||
)
|
||||
session.commit()
|
||||
logger.info(
|
||||
"Updated need_summary to %s for %d documents in dataset %s",
|
||||
need_summary,
|
||||
updated_count,
|
||||
dataset_id,
|
||||
)
|
||||
return updated_count
|
||||
|
||||
@staticmethod
|
||||
def get_document_download_url(document: Document) -> str:
|
||||
"""
|
||||
|
|
|
|||
|
|
@ -174,6 +174,10 @@ class RagPipelineTransformService:
|
|||
else:
|
||||
dataset.retrieval_model = knowledge_configuration.retrieval_model.model_dump()
|
||||
|
||||
# Copy summary_index_setting from dataset to knowledge_index node configuration
|
||||
if dataset.summary_index_setting:
|
||||
knowledge_configuration.summary_index_setting = dataset.summary_index_setting
|
||||
|
||||
knowledge_configuration_dict.update(knowledge_configuration.model_dump())
|
||||
node["data"] = knowledge_configuration_dict
|
||||
return node
|
||||
|
|
|
|||
|
|
@ -49,11 +49,18 @@ class SummaryIndexService:
|
|||
# Use lazy import to avoid circular import
|
||||
from core.rag.index_processor.processor.paragraph_index_processor import ParagraphIndexProcessor
|
||||
|
||||
# Get document language to ensure summary is generated in the correct language
|
||||
# This is especially important for image-only chunks where text is empty or minimal
|
||||
document_language = None
|
||||
if segment.document and segment.document.doc_language:
|
||||
document_language = segment.document.doc_language
|
||||
|
||||
summary_content, usage = ParagraphIndexProcessor.generate_summary(
|
||||
tenant_id=dataset.tenant_id,
|
||||
text=segment.content,
|
||||
summary_index_setting=summary_index_setting,
|
||||
segment_id=segment.id,
|
||||
document_language=document_language,
|
||||
)
|
||||
|
||||
if not summary_content:
|
||||
|
|
@ -558,6 +565,9 @@ class SummaryIndexService:
|
|||
)
|
||||
session.add(summary_record)
|
||||
|
||||
# Commit the batch created records
|
||||
session.commit()
|
||||
|
||||
@staticmethod
|
||||
def update_summary_record_error(
|
||||
segment: DocumentSegment,
|
||||
|
|
@ -762,7 +772,6 @@ class SummaryIndexService:
|
|||
dataset=dataset,
|
||||
status="not_started",
|
||||
)
|
||||
session.commit() # Commit initial records
|
||||
|
||||
summary_records = []
|
||||
|
||||
|
|
|
|||
Loading…
Reference in New Issue