dify/api/services/variable_truncator.py

import dataclasses
import json
from collections.abc import Mapping
from typing import Any, TypeAlias

from configs import dify_config
from core.variables.segments import (
    ArrayFileSegment,
    ArraySegment,
    FileSegment,
    FloatSegment,
    IntegerSegment,
    NoneSegment,
    ObjectSegment,
    Segment,
    StringSegment,
)

# Default value of `VariableTruncator`'s `max_size_bytes` argument.
LARGE_VARIABLE_THRESHOLD = 10 * 1024  # 10KB in bytes
# Upper bound, in characters, for a string value truncated by `_truncate_value_to_budget`.
OBJECT_CHAR_LIMIT = 5000
# Upper bound, in characters, for a string item truncated by `_truncate_item_to_budget`.
ARRAY_CHAR_LIMIT = 1000

# Deepest nesting level `calculate_json_size` accepts before raising `MaxDepthExceededError`.
_MAX_DEPTH = 20


class MaxDepthExceededError(Exception):
    """Raised by `VariableTruncator.calculate_json_size` when a value is nested deeper than `_MAX_DEPTH`."""


class UnknownTypeError(Exception):
    """Raised by `VariableTruncator.calculate_json_size` for a value whose type it cannot size."""


# Union of the plain Python types that `VariableTruncator.calculate_json_size` knows how to size.
JSONTypes: TypeAlias = int | float | str | list | dict | None | bool


@dataclasses.dataclass(frozen=True)
class TruncationResult:
    """Immutable outcome of `VariableTruncator.truncate`."""

    # Segment produced by the truncation pass; callers use it in place of the input segment.
    result: Segment
    # True when content was shortened or dropped while producing `result`.
    truncated: bool


class VariableTruncator:
    """
    Handles variable truncation with structure-preserving strategies.

    This class implements intelligent truncation that prioritizes maintaining data structure
    integrity while ensuring the final size doesn't exceed specified limits.

    Uses recursive size calculation to avoid repeated JSON serialization.
    """

    # Compact separators (no spaces) used for the JSON fallback in `truncate`;
    # `calculate_json_size` likewise counts exactly one byte per "," and ":".
    _JSON_SEPARATORS = (",", ":")

    def __init__(
        self,
        string_length_limit: int = 5000,
        array_element_limit: int = 20,
        max_size_bytes: int = LARGE_VARIABLE_THRESHOLD,
    ):
        """
        Configure the truncation limits.

        Args:
            string_length_limit: Maximum length, in characters, of a string returned by
                `_truncate_string` (the "..." suffix included). Must be greater than 3.
            array_element_limit: Maximum number of leading array items kept by `_truncate_array`.
                Must be greater than 0.
            max_size_bytes: Upper bound for the estimated JSON size of a truncated value.
                Must be greater than 0.

        Raises:
            ValueError: If any limit is outside its allowed range.
        """
        # `_truncate_string` keeps `limit - 3` characters plus "...", so a limit of 3 or less
        # would keep none of the original text.
        if string_length_limit <= 3:
            raise ValueError("string_length_limit should be greater than 3.")
        self._string_length_limit = string_length_limit

        if array_element_limit <= 0:
            raise ValueError("array_element_limit should be greater than 0.")
        self._array_element_limit = array_element_limit

        if max_size_bytes <= 0:
            raise ValueError("max_size_bytes should be greater than 0.")
        self._max_size_bytes = max_size_bytes

    @classmethod
    def default(cls) -> "VariableTruncator":
        """Create a truncator configured from the `WORKFLOW_VARIABLE_TRUNCATION_*` settings of `dify_config`."""
        # Instantiate through `cls` (not the hard-coded class name) so that calling
        # `default()` on a subclass yields an instance of that subclass.
        return cls(
            max_size_bytes=dify_config.WORKFLOW_VARIABLE_TRUNCATION_MAX_SIZE,
            array_element_limit=dify_config.WORKFLOW_VARIABLE_TRUNCATION_ARRAY_LENGTH,
            string_length_limit=dify_config.WORKFLOW_VARIABLE_TRUNCATION_STRING_LENGTH,
        )

    def truncate_io_mapping(self, v: Mapping[str, Any]) -> tuple[Mapping[str, Any], bool]:
        """
        Truncate the `inputs` / `outputs` mapping of a WorkflowNodeExecution record.

        A mapping whose estimated JSON size is below the configured maximum is returned
        unchanged. Otherwise entries are processed in iteration order: each value may use an
        equal share of the budget that is still available, and processing stops (dropping the
        current and all later entries) once even a key no longer fits.

        Args:
            v: The mapping to truncate.

        Returns:
            A `(mapping, truncated)` tuple. `truncated` is True when any value was shortened
            or any entry was dropped.
        """
        if self.calculate_json_size(v) < self._max_size_bytes:
            return v, False

        budget = self._max_size_bytes
        is_truncated = False
        truncated_mapping: dict[str, Any] = {}
        entry_count = len(v)
        for index, (key, value) in enumerate(v.items()):
            budget -= self.calculate_json_size(key)
            if budget < 0:
                # The key itself does not fit any more: this entry and every later one are
                # dropped, which has to be reported as truncation.
                is_truncated = True
                break
            # Share what is left evenly among the entries that still have to be written
            # (the current one included), like `_truncate_array` / `_truncate_object` do.
            remaining = entry_count - index
            truncated_value, value_truncated = self._truncate_value_to_budget(value, budget // remaining)
            if value_truncated:
                is_truncated = True
            truncated_mapping[key] = truncated_value
            # TODO(QuantumGhost): This approach is inefficient. Ideally, the truncation function should directly
            # report the size of the truncated value.
            budget -= self.calculate_json_size(truncated_value) + 2  # ":" and ","
        return truncated_mapping, is_truncated

    def truncate(self, segment: Segment) -> TruncationResult:
        """
        Apply smart truncation to a variable segment.

        Args:
            segment: The segment whose value should be truncated.

        Returns:
            TruncationResult holding the resulting segment and whether anything was truncated.

        Raises:
            AssertionError: If the segment is none of the handled segment kinds.
        """
        if isinstance(segment, IntegerSegment):
            # TODO: booleans are not supported as such yet; a bool wrapped in an IntegerSegment
            # is normalized to a plain int instead.
            normalized = IntegerSegment(value=int(segment.value)) if isinstance(segment.value, bool) else segment
            return TruncationResult(result=normalized, truncated=False)

        # ArrayFileSegment is passed through as well, as the number of files in one variable is relatively small.
        passthrough_types = (NoneSegment, FloatSegment, FileSegment, ArrayFileSegment)
        if isinstance(segment, passthrough_types):
            return TruncationResult(result=segment, truncated=False)

        # Type-specific truncation; arrays and objects aim at the overall size limit.
        if isinstance(segment, ArraySegment):
            shortened, changed = self._truncate_array(segment.value, self._max_size_bytes)
        elif isinstance(segment, StringSegment):
            shortened, changed = self._truncate_string(segment.value)
        elif isinstance(segment, ObjectSegment):
            shortened, changed = self._truncate_object(segment.value, self._max_size_bytes)
        else:
            raise AssertionError("this should be unreachable.")

        if not changed:
            return TruncationResult(result=segment, truncated=False)

        limit = self._max_size_bytes
        if self.calculate_json_size(shortened) <= limit:
            # The truncated value fits: keep the original segment type, swap in the new value.
            return TruncationResult(result=segment.model_copy(update={"value": shortened}), truncated=True)

        # Still over the limit after type-specific truncation: fall back to a plain string.
        if isinstance(shortened, str):
            # NOTE(review): 3 characters are held back here but no "..." is appended — confirm intent.
            return TruncationResult(StringSegment(value=shortened[: limit - 3]), True)

        # Serialize to compact JSON and cut the text; the appended "..." can push the result
        # up to 3 characters past `limit`.
        serialized = json.dumps(shortened, ensure_ascii=False, separators=self._JSON_SEPARATORS)
        if len(serialized) > limit:
            serialized = serialized[:limit] + "..."
        return TruncationResult(result=StringSegment(value=serialized), truncated=True)

    @staticmethod
    def calculate_json_size(value: Any, depth=0) -> int:
        """
        Estimate the size in bytes of `value` as compact JSON, without serializing it.

        The estimate assumes no whitespace between tokens. Strings are counted as their UTF-8
        length plus two quotes; escape sequences are NOT accounted for, so strings containing
        quotes, backslashes or control characters are under-counted.

        Args:
            value: A str, bool, int, float, None, list or dict (lists and dicts may be nested).
            depth: Current nesting level; outside callers leave it at 0.

        Returns:
            The estimated number of bytes.

        Raises:
            MaxDepthExceededError: If the nesting level exceeds `_MAX_DEPTH`.
            UnknownTypeError: If a value of any other type is encountered.
        """
        if depth > _MAX_DEPTH:
            raise MaxDepthExceededError()

        if isinstance(value, str):
            return len(value.encode("utf-8")) + 2  # payload plus the two quotes
        # `bool` is a subclass of `int`, so it has to be tested before the numeric branch;
        # otherwise this branch can never be reached.
        if isinstance(value, bool):
            return 4 if value else 5  # "true" or "false"
        if isinstance(value, (int, float)):
            return len(str(value))
        if value is None:
            return 4  # "null"

        if isinstance(value, list):
            # "[]" plus one "," between neighbouring elements plus the elements themselves.
            total = 2 + max(len(value) - 1, 0)
            for item in value:
                total += VariableTruncator.calculate_json_size(item, depth=depth + 1)
            return total

        if isinstance(value, dict):
            # "{}" plus one "," between neighbouring pairs plus, per pair, key + ":" + value.
            total = 2 + max(len(value) - 1, 0)
            for key, item in value.items():
                total += VariableTruncator.calculate_json_size(str(key), depth=depth + 1)  # Key as string
                total += 1  # ":"
                total += VariableTruncator.calculate_json_size(item, depth=depth + 1)
            return total

        raise UnknownTypeError(f"got unknown type {type(value)}")

    def _truncate_string(self, value: str) -> tuple[str, bool]:
        """Truncate string values."""
        if len(value) <= self._string_length_limit:
            return value, False
        return value[: self._string_length_limit - 3] + "...", True

    def _truncate_array(self, value: list, target_size: int) -> tuple[list, bool]:
        """
        Truncate an array in two stages.

        1. Keep at most `self._array_element_limit` leading items.
        2. If the kept items are still estimated to be larger than `target_size`,
           shrink them one by one, giving each an equal share of the remaining budget.

        Returns:
            `(items, truncated)`. In stage 1 `truncated` tells whether items were cut off;
            once stage 2 runs it is always True.
        """
        head = value[: self._array_element_limit]
        dropped_tail = len(head) < len(value)
        if self.calculate_json_size(head) <= target_size:
            return head, dropped_tail

        # Stage 2: the kept items do not fit, so every item has to live within its share.
        budget_left = target_size - 2  # "[" and "]"
        head_len = len(head)
        shrunk: list = []
        for position, element in enumerate(head):
            if position > 0:
                budget_left -= 1  # "," before this item
            if budget_left <= 0:
                # Nothing left to spend; the remaining items are dropped.
                break

            fair_share = budget_left // (head_len - position)
            fitted, _ = self._truncate_item_to_budget(element, fair_share)
            shrunk.append(fitted)
            budget_left -= self.calculate_json_size(fitted)

        return shrunk, True

    def _truncate_object(self, value: Mapping[str, Any], target_size: int) -> tuple[Mapping[str, Any], bool]:
        """
        Truncate object with key preservation priority.

        Strategy:
        1. Keep all keys, truncate values to fit within budget
        2. If still too large, drop keys starting from the end
        """
        if not value:
            return value, False

        truncated_obj = {}
        was_truncated = False
        remaining_size = target_size - 2  # Account for {}

        # Sort keys to ensure deterministic behavior
        sorted_keys = sorted(value.keys())

        for i, key in enumerate(sorted_keys):
            val = value[key]

            if i > 0:
                remaining_size -= 1  # Account for comma

            if remaining_size <= 0:
                # No more room for additional key-value pairs
                was_truncated = True
                break

            # Calculate budget for this key-value pair
            key_size = self.calculate_json_size(str(key)) + 1  # +1 for ":"
            remaining_pairs = len(sorted_keys) - i
            value_budget = max(0, (remaining_size - key_size) // remaining_pairs)

            if value_budget <= 0:
                was_truncated = True
                break

            # Truncate the value to fit within budget
            truncated_val, val_truncated = self._truncate_value_to_budget(val, value_budget)

            truncated_obj[key] = truncated_val
            if val_truncated:
                was_truncated = True

            # Update remaining size
            pair_size = key_size + self.calculate_json_size(truncated_val)
            remaining_size -= pair_size

        return truncated_obj, was_truncated or len(truncated_obj) < len(value)

    def _truncate_item_to_budget(self, item: Any, budget: int) -> tuple[Any, bool]:
        """
        Truncate one array item so that it fits within `budget` bytes.

        Dicts and lists are truncated recursively. Any other non-string value is returned
        unchanged when its estimated size fits; otherwise it is converted to a string and
        handled like one. Strings keep at most `min(budget - 5, ARRAY_CHAR_LIMIT)` characters.

        Returns:
            `(item, truncated)`.
        """
        if isinstance(item, dict):
            return self._truncate_object(item, budget)
        if isinstance(item, list):
            return self._truncate_array(item, budget)

        if not isinstance(item, str):
            if self.calculate_json_size(item) <= budget:
                return item, False
            # Too large as is: continue with its string form.
            item = str(item)

        # 5 bytes are reserved for the two quotes and a potential "...".
        char_cap = min(max(0, budget - 5), ARRAY_CHAR_LIMIT)
        if len(item) > char_cap:
            return item[:char_cap] + "...", True
        return item, False

    def _truncate_value_to_budget(self, val: Any, budget: int) -> tuple[Any, bool]:
        """
        Truncate one object value so that it fits within `budget` bytes.

        Lists and dicts are truncated recursively. Any other non-string value is kept when
        its estimated size fits; otherwise its string form is truncated instead. Strings keep
        at most `min(OBJECT_CHAR_LIMIT, budget - 5)` characters.

        Returns:
            `(value, truncated)`.
        """
        if isinstance(val, list):
            return self._truncate_array(val, budget)
        if isinstance(val, dict):
            return self._truncate_object(val, budget)

        text = val
        if not isinstance(text, str):
            if self.calculate_json_size(text) <= budget:
                return text, False
            # Does not fit as is: fall back to truncating its string representation.
            text = str(text)

        # Respect OBJECT_CHAR_LIMIT as well as the budget; 5 bytes go to the quotes and "...".
        allowed_chars = min(OBJECT_CHAR_LIMIT, max(0, budget - 5))
        if len(text) <= allowed_chars:
            return text, False
        return text[:allowed_chars] + "...", True