From 596559efc9f393b3189db4ead3cf0ff0728441ba Mon Sep 17 00:00:00 2001 From: XHamzaX <90039624+HamzaSwitch@users.noreply.github.com> Date: Mon, 13 Apr 2026 04:11:08 +0100 Subject: [PATCH] fix(rag): include is_summary and original_chunk_id in default vector projection (#34950) Co-authored-by: VFootball Dev --- api/core/rag/datasource/vdb/vector_factory.py | 18 +++++++++++++++++- .../rag/datasource/vdb/test_vector_factory.py | 13 ++++++++++++- 2 files changed, 29 insertions(+), 2 deletions(-) diff --git a/api/core/rag/datasource/vdb/vector_factory.py b/api/core/rag/datasource/vdb/vector_factory.py index 0ef88e1010..5d879ac3ca 100644 --- a/api/core/rag/datasource/vdb/vector_factory.py +++ b/api/core/rag/datasource/vdb/vector_factory.py @@ -41,7 +41,23 @@ class AbstractVectorFactory(ABC): class Vector: def __init__(self, dataset: Dataset, attributes: list | None = None): if attributes is None: - attributes = ["doc_id", "dataset_id", "document_id", "doc_hash", "doc_type"] + # `is_summary` and `original_chunk_id` are stored on summary vectors + # by `SummaryIndexService` and read back by `RetrievalService` to + # route summary hits through their original parent chunks. They + # must be listed here so vector backends that use this list as an + # explicit return-properties projection (notably Weaviate) actually + # return those fields; without them, summary hits silently + # collapse into `is_summary = False` branches and the summary + # retrieval path is a no-op. See #34884. + attributes = [ + "doc_id", + "dataset_id", + "document_id", + "doc_hash", + "doc_type", + "is_summary", + "original_chunk_id", + ] self._dataset = dataset self._embeddings = self._get_embeddings() self._attributes = attributes diff --git a/api/tests/unit_tests/core/rag/datasource/vdb/test_vector_factory.py b/api/tests/unit_tests/core/rag/datasource/vdb/test_vector_factory.py index 4e9ceddda9..5a0e4dcd75 100644 --- a/api/tests/unit_tests/core/rag/datasource/vdb/test_vector_factory.py +++ b/api/tests/unit_tests/core/rag/datasource/vdb/test_vector_factory.py @@ -121,7 +121,18 @@ def test_vector_init_uses_default_and_custom_attributes(vector_factory_module): default_vector = vector_factory_module.Vector(dataset) custom_vector = vector_factory_module.Vector(dataset, attributes=["doc_id"]) - assert default_vector._attributes == ["doc_id", "dataset_id", "document_id", "doc_hash", "doc_type"] + # `is_summary` and `original_chunk_id` must be in the default return-properties + # projection so summary index retrieval works on backends that honor the list + # as an explicit projection (e.g. Weaviate). See #34884. + assert default_vector._attributes == [ + "doc_id", + "dataset_id", + "document_id", + "doc_hash", + "doc_type", + "is_summary", + "original_chunk_id", + ] assert custom_vector._attributes == ["doc_id"] assert default_vector._embeddings == "embeddings" assert default_vector._vector_processor == "processor"