From c6eb18daaec4ed543b6afa5ea355d754977612cd Mon Sep 17 00:00:00 2001
From: wangxiaolei
Date: Tue, 9 Dec 2025 10:22:02 +0800
Subject: [PATCH] =?UTF-8?q?feat:=20charset=5Fnormalizer=20for=20better=20e?=
 =?UTF-8?q?ncoding=20detection=20than=20httpx's=20d=E2=80=A6=20(#29264)?=
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

---
 .../workflow/nodes/http_request/entities.py  | 29 +++++-
 .../nodes/http_request/test_entities.py      | 93 +++++++++++++++++++
 2 files changed, 121 insertions(+), 1 deletion(-)

diff --git a/api/core/workflow/nodes/http_request/entities.py b/api/core/workflow/nodes/http_request/entities.py
index 5a7db6e0e6..e323533835 100644
--- a/api/core/workflow/nodes/http_request/entities.py
+++ b/api/core/workflow/nodes/http_request/entities.py
@@ -3,6 +3,7 @@ from collections.abc import Sequence
 from email.message import Message
 from typing import Any, Literal
 
+import charset_normalizer
 import httpx
 from pydantic import BaseModel, Field, ValidationInfo, field_validator
 
@@ -96,10 +97,12 @@ class HttpRequestNodeData(BaseNodeData):
 class Response:
     headers: dict[str, str]
     response: httpx.Response
+    _cached_text: str | None
 
     def __init__(self, response: httpx.Response):
         self.response = response
         self.headers = dict(response.headers)
+        self._cached_text = None
 
     @property
     def is_file(self):
@@ -159,7 +162,31 @@ class Response:
 
     @property
     def text(self) -> str:
-        return self.response.text
+        """
+        Get response text with robust encoding detection.
+
+        Uses charset_normalizer, whose detection is more robust than httpx's
+        default and handles Chinese and other non-ASCII content correctly.
+        """
+        # Return the cached result if the body has already been decoded
+        if self._cached_text is not None:
+            return self._cached_text
+
+        # Try charset_normalizer first for robust encoding detection
+        best_match = charset_normalizer.from_bytes(self.response.content).best()
+        if best_match and best_match.encoding:
+            try:
+                text = self.response.content.decode(best_match.encoding)
+                self._cached_text = text
+                return text
+            except (UnicodeDecodeError, TypeError, LookupError):
+                # Fall back to httpx if the detected codec cannot decode
+                pass
+
+        # Fall back to httpx's built-in encoding detection
+        text = self.response.text
+        self._cached_text = text
+        return text
 
     @property
     def content(self) -> bytes:
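Note (not part of the patch): the fallback chain added above can be exercised standalone. A minimal sketch, assuming only the real charset_normalizer package; the helper name detect_and_decode and the sample body are illustrative:

    import charset_normalizer

    def detect_and_decode(content: bytes) -> str | None:
        # Mirrors the property's first branch: detect an encoding, then
        # decode with the best candidate codec charset_normalizer proposes.
        best_match = charset_normalizer.from_bytes(content).best()
        if best_match and best_match.encoding:
            try:
                return content.decode(best_match.encoding)
            except (UnicodeDecodeError, TypeError, LookupError):
                return None
        return None

    # UTF-8 CJK content is detected reliably; bodies in legacy encodings
    # with no charset header are where httpx's default tends to misfire.
    body = '{"message": "你好世界"}'.encode("utf-8")
    print(detect_and_decode(body))  # expected: {"message": "你好世界"}

Decoding from self.response.content rather than re-reading the stream keeps the property side-effect free, and the _cached_text field then makes repeated calls cheap.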
"\xec\x95\x88\xeb\x85\x95\xed\x95\x98\xec\x84\xb8\xec\x9a\x94"}', + '{"message": "안녕하세요"}', + "Korean characters UTF-8", + ), + # Arabic UTF-8 + (b'{"text": "\xd9\x85\xd8\xb1\xd8\xad\xd8\xa8\xd8\xa7"}', '{"text": "مرحبا"}', "Arabic characters UTF-8"), + # European characters UTF-8 + (b'{"text": "Caf\xc3\xa9 M\xc3\xbcnchen"}', '{"text": "Café München"}', "European accented characters"), + # Simple ASCII + (b'{"text": "Hello World"}', '{"text": "Hello World"}', "Simple ASCII text"), + ], +) +def test_text_property_utf8_decoding(mock_response, content_bytes, expected_text, description): + """Test that Response.text properly decodes UTF-8 content with charset_normalizer""" + mock_response.headers = {"content-type": "application/json; charset=utf-8"} + type(mock_response).content = PropertyMock(return_value=content_bytes) + # Mock httpx response.text to return something different (simulating potential encoding issues) + mock_response.text = "incorrect-fallback-text" # To ensure we are not falling back to httpx's text property + + response = Response(mock_response) + + # Our enhanced text property should decode properly using charset_normalizer + assert response.text == expected_text, ( + f"Failed for {description}: got {repr(response.text)}, expected {repr(expected_text)}" + ) + + +def test_text_property_fallback_to_httpx(mock_response): + """Test that Response.text falls back to httpx.text when charset_normalizer fails""" + mock_response.headers = {"content-type": "application/json"} + + # Create malformed UTF-8 bytes + malformed_bytes = b'{"text": "\xff\xfe\x00\x00 invalid"}' + type(mock_response).content = PropertyMock(return_value=malformed_bytes) + + # Mock httpx.text to return some fallback value + fallback_text = '{"text": "fallback"}' + mock_response.text = fallback_text + + response = Response(mock_response) + + # Should fall back to httpx's text when charset_normalizer fails + assert response.text == fallback_text + + +@pytest.mark.parametrize( + ("json_content", "description"), + [ + # JSON with escaped Unicode (like Flask jsonify()) + ('{"message": "\\u4f60\\u597d\\u4e16\\u754c"}', "JSON with escaped Unicode"), + # JSON with mixed escape sequences and UTF-8 + ('{"mixed": "Hello \\u4f60\\u597d"}', "Mixed escaped and regular text"), + # JSON with complex escape sequences + ('{"complex": "\\ud83d\\ude00\\u4f60\\u597d"}', "Emoji and Chinese escapes"), + ], +) +def test_text_property_with_escaped_unicode(mock_response, json_content, description): + """Test Response.text with JSON containing Unicode escape sequences""" + mock_response.headers = {"content-type": "application/json"} + + content_bytes = json_content.encode("utf-8") + type(mock_response).content = PropertyMock(return_value=content_bytes) + mock_response.text = json_content # httpx would return the same for valid UTF-8 + + response = Response(mock_response) + + # Should preserve the escape sequences (valid JSON) + assert response.text == json_content, f"Failed for {description}" + + # The text should be valid JSON that can be parsed back to proper Unicode + parsed = json.loads(response.text) + assert isinstance(parsed, dict), f"Invalid JSON for {description}"