feat(telemetry): add enterprise OTEL telemetry with gateway, traces, metrics, and logs

2026-02-06 01:02:19 -08:00 · 2026-02-06 01:02:19 -08:00 · 4e3112bd7f
parent 576eca2113
commit 4e3112bd7f
4 changed files with 45 additions and 11 deletions
--- a/api/README.md
+++ b/api/README.md
@ -122,6 +122,7 @@ These commands assume you start from the repository root.

   ```bash
   cd api
+   # Note: enterprise_telemetry queue is only used in Enterprise Edition
   uv run celery -A app.celery worker -P threads -c 2 --loglevel INFO -Q dataset,priority_dataset,priority_pipeline,pipeline,mail,ops_trace,app_deletion,plugin,workflow_storage,conversation,workflow,schedule_poller,schedule_executor,triggered_workflow_dispatcher,trigger_refresh_executor,retention,enterprise_telemetry
   ```

--- a/api/core/llm_generator/llm_generator.py
+++ b/api/core/llm_generator/llm_generator.py
@ -25,6 +25,7 @@ from core.model_runtime.entities.llm_entities import LLMResult
 from core.model_runtime.entities.message_entities import PromptMessage, SystemPromptMessage, UserPromptMessage
 from core.model_runtime.entities.model_entities import ModelType
 from core.model_runtime.errors.invoke import InvokeAuthorizationError, InvokeError
+from core.ops.entities.trace_entity import OperationType
 from core.ops.utils import measure_time
 from core.prompt.utils.prompt_template_parser import PromptTemplateParser
 from core.telemetry import TelemetryContext, TelemetryEvent, TraceTaskName
@ -215,7 +216,7 @@ class LLMGenerator:
                    tenant_id=tenant_id,
                    user_id=user_id,
                    app_id=app_id,
-                    operation_type="rule_generate",
+                    operation_type=OperationType.RULE_GENERATE,
                    instruction=instruction,
                    generated_output=generated_output,
                    llm_result=llm_result,
@ -272,7 +273,7 @@ class LLMGenerator:
                            tenant_id=tenant_id,
                            user_id=user_id,
                            app_id=app_id,
-                            operation_type="rule_generate",
+                            operation_type=OperationType.RULE_GENERATE,
                            instruction=instruction,
                            generated_output="",
                            llm_result=llm_result,
@ -338,7 +339,7 @@ class LLMGenerator:
                tenant_id=tenant_id,
                user_id=user_id,
                app_id=app_id,
-                operation_type="rule_generate",
+                operation_type=OperationType.RULE_GENERATE,
                instruction=instruction,
                generated_output=str(generated_output) if generated_output else "",
                llm_result=llm_result,
@ -409,7 +410,7 @@ class LLMGenerator:
                tenant_id=tenant_id,
                user_id=user_id,
                app_id=app_id,
-                operation_type="code_generate",
+                operation_type=OperationType.CODE_GENERATE,
                instruction=instruction,
                generated_output=result.get("code", ""),
                llm_result=llm_result,
@ -504,7 +505,7 @@ class LLMGenerator:
                tenant_id=tenant_id,
                user_id=user_id,
                app_id=app_id,
-                operation_type="structured_output",
+                operation_type=OperationType.STRUCTURED_OUTPUT,
                instruction=instruction,
                generated_output=result.get("output", ""),
                llm_result=llm_result,
@ -736,7 +737,7 @@ class LLMGenerator:
                tenant_id=tenant_id,
                user_id=user_id,
                app_id=app_id,
-                operation_type="instruction_modify",
+                operation_type=OperationType.INSTRUCTION_MODIFY,
                instruction=instruction,
                generated_output=generated_output,
                llm_result=llm_result,
@ -753,7 +754,7 @@ class LLMGenerator:
        tenant_id: str,
        user_id: str,
        app_id: str | None,
-        operation_type: str,
+        operation_type: OperationType,
        instruction: str,
        generated_output: str,
        llm_result: LLMResult | None,
--- a/api/core/ops/entities/trace_entity.py
+++ b/api/core/ops/entities/trace_entity.py
@ -211,6 +211,22 @@ trace_info_info_map = {
 }


+class OperationType(StrEnum):
+    """Operation type for token metric labels.
+
+    Used as a metric attribute on ``dify.tokens.input`` / ``dify.tokens.output``
+    counters so consumers can break down token usage by operation.
+    """
+
+    WORKFLOW = "workflow"
+    NODE_EXECUTION = "node_execution"
+    MESSAGE = "message"
+    RULE_GENERATE = "rule_generate"
+    CODE_GENERATE = "code_generate"
+    STRUCTURED_OUTPUT = "structured_output"
+    INSTRUCTION_MODIFY = "instruction_modify"
+
+
 class TraceTaskName(StrEnum):
    CONVERSATION_TRACE = "conversation"
    WORKFLOW_TRACE = "workflow"
--- a/api/enterprise/telemetry/enterprise_trace.py
+++ b/api/enterprise/telemetry/enterprise_trace.py
@ -23,6 +23,7 @@ from core.ops.entities.trace_entity import (
    GenerateNameTraceInfo,
    MessageTraceInfo,
    ModerationTraceInfo,
+    OperationType,
    PromptGenerationTraceInfo,
    SuggestedQuestionTraceInfo,
    ToolTraceInfo,
@ -216,11 +217,17 @@ class EnterpriseOtelTrace:
            tenant_id=tenant_id or "",
            app_id=app_id or "",
        )
-        self._exporter.increment_counter(EnterpriseTelemetryCounter.TOKENS, info.total_tokens, labels)
+        token_labels = self._labels(
+            **labels,
+            operation_type=OperationType.WORKFLOW,
+        )
+        self._exporter.increment_counter(EnterpriseTelemetryCounter.TOKENS, info.total_tokens, token_labels)
        if info.prompt_tokens is not None and info.prompt_tokens > 0:
-            self._exporter.increment_counter(EnterpriseTelemetryCounter.INPUT_TOKENS, info.prompt_tokens, labels)
+            self._exporter.increment_counter(EnterpriseTelemetryCounter.INPUT_TOKENS, info.prompt_tokens, token_labels)
        if info.completion_tokens is not None and info.completion_tokens > 0:
-            self._exporter.increment_counter(EnterpriseTelemetryCounter.OUTPUT_TOKENS, info.completion_tokens, labels)
+            self._exporter.increment_counter(
+                EnterpriseTelemetryCounter.OUTPUT_TOKENS, info.completion_tokens, token_labels
+            )
        invoke_from = metadata.get("triggered_from", "")
        self._exporter.increment_counter(
            EnterpriseTelemetryCounter.REQUESTS,
@ -365,6 +372,7 @@ class EnterpriseOtelTrace:
            token_labels = self._labels(
                **labels,
                model_name=info.model_name or "",
+                operation_type=OperationType.NODE_EXECUTION,
            )
            self._exporter.increment_counter(EnterpriseTelemetryCounter.TOKENS, info.total_tokens, token_labels)
            if info.prompt_tokens is not None and info.prompt_tokens > 0:
@ -454,7 +462,15 @@ class EnterpriseOtelTrace:
            model_provider=metadata.get("ls_provider", ""),
            model_name=metadata.get("ls_model_name", ""),
        )
-        self._exporter.increment_counter(EnterpriseTelemetryCounter.TOKENS, info.total_tokens, labels)
+        token_labels = self._labels(
+            **labels,
+            operation_type=OperationType.MESSAGE,
+        )
+        self._exporter.increment_counter(EnterpriseTelemetryCounter.TOKENS, info.total_tokens, token_labels)
+        if info.message_tokens > 0:
+            self._exporter.increment_counter(EnterpriseTelemetryCounter.INPUT_TOKENS, info.message_tokens, token_labels)
+        if info.answer_tokens > 0:
+            self._exporter.increment_counter(EnterpriseTelemetryCounter.OUTPUT_TOKENS, info.answer_tokens, token_labels)
        invoke_from = metadata.get("from_source", "")
        self._exporter.increment_counter(
            EnterpriseTelemetryCounter.REQUESTS,