dify/api/services/agent/knowledge_datasets.py
2026-06-23 16:43:10 +08:00

64 lines
2.2 KiB
Python

from __future__ import annotations
from typing import Any
from uuid import UUID
from models.agent_config_entities import AgentSoulConfig
def list_agent_soul_knowledge_dataset_ids(agent_soul: AgentSoulConfig) -> list[str]:
"""Return normalized unique knowledge dataset ids in config order.
Agent v2 knowledge dataset selection is owned by ``knowledge.sets``. This
helper keeps composer, workflow validation, candidates, and runtime
diagnostics aligned on the same normalization rules: strip whitespace, drop
blanks, preserve first-seen order, and deduplicate.
"""
dataset_ids: list[str] = []
seen: set[str] = set()
for knowledge_set in agent_soul.knowledge.sets:
for dataset in knowledge_set.datasets:
dataset_id = (dataset.id or "").strip()
if not dataset_id or dataset_id in seen:
continue
seen.add(dataset_id)
dataset_ids.append(dataset_id)
return dataset_ids
def get_tenant_knowledge_dataset_rows(*, tenant_id: str, dataset_ids: list[str]) -> dict[str, Any]:
"""Return tenant-scoped dataset rows for normalized knowledge dataset ids.
Knowledge ids come from user-editable config. Malformed ids can never match
a dataset row, so they are treated as missing instead of breaking the
UUID-typed dataset lookup.
"""
from services.dataset_service import DatasetService
valid_ids: list[str] = []
for dataset_id in dataset_ids:
try:
UUID(dataset_id)
except (TypeError, ValueError):
continue
valid_ids.append(dataset_id)
if not valid_ids:
return {}
rows, _ = DatasetService.get_datasets_by_ids(valid_ids, tenant_id)
return {str(row.id): row for row in rows}
def list_missing_tenant_knowledge_dataset_ids(*, tenant_id: str, agent_soul: AgentSoulConfig | None) -> list[str]:
"""Return normalized knowledge dataset ids missing from the tenant scope."""
if agent_soul is None:
return []
dataset_ids = list_agent_soul_knowledge_dataset_ids(agent_soul)
if not dataset_ids:
return []
rows = get_tenant_knowledge_dataset_rows(tenant_id=tenant_id, dataset_ids=dataset_ids)
return [dataset_id for dataset_id in dataset_ids if dataset_id not in rows]