dify/api/services/document_indexing_proxy/batch_indexing_base.py

77 lines
2.9 KiB
Python

import logging
from collections.abc import Callable, Sequence
from dataclasses import asdict
from typing import Any
from core.entities.document_task import DocumentTask
from core.rag.pipeline.queue import TenantIsolatedTaskQueue
from .base import DocumentTaskProxyBase
# Module-level logger, named after this module via __name__.
logger = logging.getLogger(__name__)
class BatchDocumentIndexingProxy(DocumentTaskProxyBase):
    """
    Proxy base class for indexing tasks that work on several documents at once.

    On top of DocumentTaskProxyBase it provides:
    - A TenantIsolatedTaskQueue built from the tenant ID and the class's QUEUE_NAME
    - Dispatch helpers that hand the whole batch of document IDs to a task
    """

    def __init__(self, tenant_id: str, dataset_id: str, document_ids: Sequence[str]):
        """
        Remember the batch and create the tenant-scoped task queue.

        Args:
            tenant_id: Tenant identifier
            dataset_id: Dataset identifier
            document_ids: IDs of the documents that make up this batch
        """
        super().__init__(tenant_id, dataset_id)
        self._document_ids = document_ids
        self._tenant_isolated_task_queue = TenantIsolatedTaskQueue(tenant_id, self.QUEUE_NAME)

    def _send_to_direct_queue(self, task_func: Callable[[str, str, Sequence[str]], Any]):
        """
        Dispatch the batch immediately, without going through the tenant queue.

        Args:
            task_func: Task object whose ``delay`` is invoked with the keyword
                arguments tenant_id, dataset_id and document_ids
                (described upstream as a Celery task function)
        """
        logger.info("tenant %s send documents %s to direct queue", self._tenant_id, self._document_ids)
        task_kwargs = {
            "tenant_id": self._tenant_id,
            "dataset_id": self._dataset_id,
            "document_ids": self._document_ids,
        }
        task_func.delay(**task_kwargs)  # type: ignore

    def _send_to_tenant_queue(self, task_func: Callable[[str, str, Sequence[str]], Any]):
        """
        Route the batch through the tenant-isolated queue.

        When the queue reports no task key, the waiting time is set and the task
        is dispatched right away through ``task_func.delay``. Otherwise the batch
        is serialized as a DocumentTask dict and pushed onto the tenant queue.

        Args:
            task_func: Task object whose ``delay`` is invoked with the keyword
                arguments tenant_id, dataset_id and document_ids
                (described upstream as a Celery task function)
        """
        tenant_id = self._tenant_id
        dataset_id = self._dataset_id
        document_ids = self._document_ids
        tenant_queue = self._tenant_isolated_task_queue

        logger.info("tenant %s send documents %s to tenant queue %s", tenant_id, document_ids, self.QUEUE_NAME)

        if not tenant_queue.get_task_key():
            # No task key present (presumably nothing is running for this tenant
            # yet -- confirm against TenantIsolatedTaskQueue): set the flag first,
            # then start the task directly.
            tenant_queue.set_task_waiting_time()
            task_func.delay(  # type: ignore
                tenant_id=tenant_id,
                dataset_id=dataset_id,
                document_ids=document_ids,
            )
            logger.info("tenant %s init tasks: %s - %s", tenant_id, dataset_id, document_ids)
            return

        # A task key exists: park this batch in the tenant's waiting queue.
        pending_task = DocumentTask(tenant_id=tenant_id, dataset_id=dataset_id, document_ids=document_ids)
        tenant_queue.push_tasks([asdict(pending_task)])
        logger.info("tenant %s push tasks: %s - %s", tenant_id, dataset_id, document_ids)