From 1fcb05432d648f64b8e8b5a0867f41804b588e92 Mon Sep 17 00:00:00 2001
From: GareArc
Date: Mon, 2 Mar 2026 01:07:10 -0800
Subject: [PATCH 1/3] fix(telemetry): populate missing fields in node execution trace

- Extract model_provider/model_name from process_data (LLM nodes store model info there, not in execution_metadata)
- Add invoke_from to node execution trace metadata dict
- Add credential_id to node execution trace metadata dict
- Add conversation_id to metadata after message_id lookup
- Add tool_name to tool_info dict in tool node
---
 api/core/app/workflow/layers/persistence.py | 7 +++++++
 api/core/ops/ops_trace_manager.py           | 6 +++++-
 api/core/workflow/nodes/tool/tool_node.py   | 1 +
 3 files changed, 13 insertions(+), 1 deletion(-)

diff --git a/api/core/app/workflow/layers/persistence.py b/api/core/app/workflow/layers/persistence.py
index aaa8b4e2dc..fd7c19a71d 100644
--- a/api/core/app/workflow/layers/persistence.py
+++ b/api/core/app/workflow/layers/persistence.py
@@ -464,6 +464,13 @@ class WorkflowPersistenceLayer(GraphEngineLayer):
             node_data["invoke_from"] = self._application_generate_entity.invoke_from.value
             node_data["user_id"] = self._system_variables().get(SystemVariableKey.USER_ID.value)
 
+        # Extract model info from process_data — LLM nodes store provider/model there, not in execution_metadata
+        if domain_execution.process_data:
+            if mp := domain_execution.process_data.get("model_provider"):
+                node_data["model_provider"] = mp
+            if mn := domain_execution.process_data.get("model_name"):
+                node_data["model_name"] = mn
+
         if domain_execution.node_type.value == "knowledge-retrieval" and domain_execution.outputs:
             results = domain_execution.outputs.get("result") or []
             dataset_ids: list[str] = []
diff --git a/api/core/ops/ops_trace_manager.py b/api/core/ops/ops_trace_manager.py
index a4014111ed..30655374ff 100644
--- a/api/core/ops/ops_trace_manager.py
+++ b/api/core/ops/ops_trace_manager.py
@@ -1183,10 +1183,12 @@ class TraceTask:
             "app_name": app_name,
             "workspace_name": workspace_name,
             "user_id": node_data.get("user_id"),
+            "invoke_from": node_data.get("invoke_from"),
+            "credential_id": node_data.get("credential_id"),
+            "credential_name": credential_name,
             "dataset_ids": node_data.get("dataset_ids"),
             "dataset_names": node_data.get("dataset_names"),
             "plugin_name": node_data.get("plugin_name"),
-            "credential_name": credential_name,
         }
 
         parent_trace_context = node_data.get("parent_trace_context")
@@ -1207,6 +1209,8 @@ class TraceTask:
         if msg_id:
             message_id = str(msg_id)
             metadata["message_id"] = message_id
+            if conversation_id:
+                metadata["conversation_id"] = conversation_id
 
         return WorkflowNodeTraceInfo(
             trace_id=self.trace_id,
diff --git a/api/core/workflow/nodes/tool/tool_node.py b/api/core/workflow/nodes/tool/tool_node.py
index f498a23d13..1d88249fc8 100644
--- a/api/core/workflow/nodes/tool/tool_node.py
+++ b/api/core/workflow/nodes/tool/tool_node.py
@@ -60,6 +60,7 @@ class ToolNode(Node[ToolNodeData]):
         tool_info = {
             "provider_type": self.node_data.provider_type.value,
             "provider_id": self.node_data.provider_id,
+            "tool_name": self.node_data.tool_name,
             "plugin_unique_identifier": self.node_data.plugin_unique_identifier,
             "credential_id": self.node_data.credential_id,
         }

From a2a5b02a53dae559fb4ee2691657ff120dc37ce8 Mon Sep 17 00:00:00 2001
From: GareArc
Date: Mon, 2 Mar 2026 01:07:18 -0800
Subject: [PATCH 2/3] docs(telemetry): add token consumption query patterns to data dictionary

Add token hierarchy diagram, common PromQL queries (totals, drill-down,
rates), and app name lookup via trace query.
---
 api/enterprise/telemetry/DATA_DICTIONARY.md | 61 +++++++++++++++++++++
 1 file changed, 61 insertions(+)

diff --git a/api/enterprise/telemetry/DATA_DICTIONARY.md b/api/enterprise/telemetry/DATA_DICTIONARY.md
index c0d07d2550..8c63efbf76 100644
--- a/api/enterprise/telemetry/DATA_DICTIONARY.md
+++ b/api/enterprise/telemetry/DATA_DICTIONARY.md
@@ -80,6 +80,67 @@ All counters are cumulative and emitted at 100% accuracy.
 
 ⚠️ **Warning:** `dify.tokens.total` at workflow level includes all node tokens. Filter by `operation_type` to avoid double-counting.
 
+#### Token Hierarchy & Query Patterns
+
+Token metrics are emitted at multiple layers. Understanding the hierarchy prevents double-counting:
+
+```
+App-level total
+├── workflow            ← sum of all node_execution tokens (DO NOT add both)
+│   └── node_execution  ← per-node breakdown
+├── message             ← independent (non-workflow chat apps only)
+├── rule_generate       ← independent helper LLM call
+├── code_generate       ← independent helper LLM call
+├── structured_output   ← independent helper LLM call
+└── instruction_modify  ← independent helper LLM call
+```
+
+**Key rule:** `workflow` tokens already include all `node_execution` tokens. Never sum both.
+
+**Available labels on token metrics:** `tenant_id`, `app_id`, `operation_type`, `model_provider`, `model_name`, `node_type`.
+App name is only available on span attributes (`dify.app.name`), not metric labels — use `app_id` for metric queries.
+
+**Common queries** (PromQL; `<app_id>` is a placeholder for a real app ID):
+
+```promql
+# ── Totals ────────────────────────────────────────────────────
+# App-level totals (exclude node_execution to avoid double-counting)
+sum by (app_id) (dify_tokens_total{operation_type!="node_execution"})
+
+# Single app total
+sum (dify_tokens_total{app_id="<app_id>", operation_type!="node_execution"})
+
+# Per-tenant totals
+sum by (tenant_id) (dify_tokens_total{operation_type!="node_execution"})
+
+# ── Drill-down ────────────────────────────────────────────────
+# Workflow-level tokens for an app
+sum (dify_tokens_total{app_id="<app_id>", operation_type="workflow"})
+
+# Node-level breakdown within an app
+sum by (node_type) (dify_tokens_total{app_id="<app_id>", operation_type="node_execution"})
+
+# Model breakdown for an app
+sum by (model_provider, model_name) (dify_tokens_total{app_id="<app_id>"})
+
+# Input vs output per model
+sum by (model_name) (dify_tokens_input_total{app_id="<app_id>"})
+sum by (model_name) (dify_tokens_output_total{app_id="<app_id>"})
+
+# ── Rates ─────────────────────────────────────────────────────
+# Token consumption rate (per second, averaged over the last hour)
+sum(rate(dify_tokens_total{operation_type!="node_execution"}[1h]))
+
+# Per-app consumption rate
+sum by (app_id) (rate(dify_tokens_total{operation_type!="node_execution"}[1h]))
+```
+
+**Finding `app_id` from app name** (TraceQL query, e.g. in Grafana Tempo):
+
+```
+{ resource.dify.app.name = "My Chatbot" } | select(resource.dify.app.id)
+```
+
 ### Request Counters
 
 | Metric | Unit | Description |

From 6df00c83ae2e46958ad217d6f7365ae6f2289a28 Mon Sep 17 00:00:00 2001
From: GareArc
Date: Mon, 2 Mar 2026 01:47:39 -0800
Subject: [PATCH 3/3] fix(telemetry): populate LLM credential info in node execution traces

- Add _lookup_llm_credential_info() to query Provider/ProviderModel tables
- Lookup LLM credentials when tool credential_id is null
- Fall back to provider-level credential if no model-specific credential
---
 api/core/ops/ops_trace_manager.py | 67 +++++++++++++++++++++++++++++--
 1 file changed, 64 insertions(+), 3 deletions(-)

diff --git a/api/core/ops/ops_trace_manager.py b/api/core/ops/ops_trace_manager.py
index 30655374ff..207c9f3940 100644
--- a/api/core/ops/ops_trace_manager.py
+++ b/api/core/ops/ops_trace_manager.py
@@ -40,6 +40,7 @@ from models.account import Tenant
 from models.dataset import Dataset
 from models.model import App, AppModelConfig, Conversation, Message, MessageFile, TraceAppConfig
+from models.provider import Provider, ProviderModel, ProviderType
 from models.tools import ApiToolProvider, BuiltinToolProvider, MCPToolProvider, WorkflowToolProvider
 from models.workflow import WorkflowAppLog
 from tasks.ops_trace_task import process_trace_tasks
@@ -87,6 +88,54 @@ def _lookup_credential_name(credential_id: str | None, provider_type: str | None
     return str(name) if name else ""
 
 
+def _lookup_llm_credential_info(
+    tenant_id: str | None, provider: str | None, model: str | None, model_type: str = "llm"
+) -> tuple[str | None, str]:
+    """
+    Look up the LLM credential ID for the given provider and model.
+    Returns (credential_id, credential_name); the name is not resolved here and is always "".
+    """
+    if not tenant_id or not provider:
+        return None, ""
+
+    with Session(db.engine) as session:
+        # Try to find provider-level or model-level configuration
+        provider_record = session.scalar(
+            select(Provider).where(
+                Provider.tenant_id == tenant_id,
+                Provider.provider_name == provider,
+                Provider.provider_type == ProviderType.CUSTOM,
+            )
+        )
+
+        if not provider_record:
+            return None, ""
+
+        # Check if there's a model-specific config
+        credential_id = None
+        credential_name = ""
+
+        if model:
+            # Try model-level first
+            model_record = session.scalar(
+                select(ProviderModel).where(
+                    ProviderModel.tenant_id == tenant_id,
+                    ProviderModel.provider_name == provider,
+                    ProviderModel.model_name == model,
+                    ProviderModel.model_type == model_type,
+                )
+            )
+
+            if model_record and model_record.credential_id:
+                credential_id = model_record.credential_id
+
+        if not credential_id and provider_record.credential_id:
+            # Fall back to provider-level credential
+            credential_id = provider_record.credential_id
+
+        return credential_id, credential_name
+
+
 class OpsTraceProviderConfigMap(collections.UserDict[str, dict[str, Any]]):
     def __getitem__(self, provider: str) -> dict[str, Any]:
         match provider:
@@ -1173,9 +1222,21 @@ class TraceTask:
 
         app_name, workspace_name = _lookup_app_and_workspace_names(node_data.get("app_id"), node_data.get("tenant_id"))
 
-        credential_name = _lookup_credential_name(
-            node_data.get("credential_id"), node_data.get("credential_provider_type")
-        )
+        # Try tool credential lookup first
+        credential_id = node_data.get("credential_id")
+        credential_name = _lookup_credential_name(credential_id, node_data.get("credential_provider_type"))
+
+        # If no credential_id found (e.g., LLM nodes), try LLM credential lookup
+        if not credential_id:
+            llm_cred_id, llm_cred_name = _lookup_llm_credential_info(
+                tenant_id=node_data.get("tenant_id"),
+                provider=node_data.get("model_provider"),
+                model=node_data.get("model_name"),
+                model_type="llm",
+            )
+            if llm_cred_id:
+                credential_id = llm_cred_id
+                credential_name = llm_cred_name
 
         metadata: dict[str, Any] = {
             "tenant_id": node_data.get("tenant_id"),
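
For reference, the credential resolution order established across PATCH 1 and PATCH 3 is: the tool credential carried in `node_data` wins, then a model-scoped credential, then the provider-scoped fallback. Below is a minimal standalone sketch of that ordering; `tool_cred`, `model_cred`, and `provider_cred` are hypothetical stand-ins for `node_data["credential_id"]`, `ProviderModel.credential_id`, and `Provider.credential_id`, which the real code resolves via SQLAlchemy queries:

```python
def resolve_credential_id(
    tool_cred: str | None,
    model_cred: str | None,
    provider_cred: str | None,
) -> str | None:
    """Sketch of the precedence in _lookup_llm_credential_info and its caller."""
    # 1. Tool nodes carry credential_id directly in node_data.
    if tool_cred:
        return tool_cred
    # 2. LLM nodes: prefer a credential bound to the specific model...
    if model_cred:
        return model_cred
    # 3. ...then fall back to the provider-level credential (may be None).
    return provider_cred


# Expected behaviour under this precedence:
assert resolve_credential_id("tool-1", "model-1", "prov-1") == "tool-1"
assert resolve_credential_id(None, "model-1", "prov-1") == "model-1"
assert resolve_credential_id(None, None, "prov-1") == "prov-1"
assert resolve_credential_id(None, None, None) is None
```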