From 4130c506435503cbfc2007662b811d4c9f01f190 Mon Sep 17 00:00:00 2001 From: jyong <718720800@qq.com> Date: Tue, 3 Jun 2025 18:32:39 +0800 Subject: [PATCH] r2 --- api/core/rag/datasource/keyword/jieba/jieba.py | 13 +++++++++---- 1 file changed, 9 insertions(+), 4 deletions(-) diff --git a/api/core/rag/datasource/keyword/jieba/jieba.py b/api/core/rag/datasource/keyword/jieba/jieba.py index d6d0bd88b2..ca54290796 100644 --- a/api/core/rag/datasource/keyword/jieba/jieba.py +++ b/api/core/rag/datasource/keyword/jieba/jieba.py @@ -28,9 +28,11 @@ class Jieba(BaseKeyword): with redis_client.lock(lock_name, timeout=600): keyword_table_handler = JiebaKeywordTableHandler() keyword_table = self._get_dataset_keyword_table() + keyword_number = self.dataset.keyword_number if self.dataset.keyword_number else self._config.max_keywords_per_chunk + for text in texts: keywords = keyword_table_handler.extract_keywords( - text.page_content, self._config.max_keywords_per_chunk + text.page_content, keyword_number ) if text.metadata is not None: self._update_segment_keywords(self.dataset.id, text.metadata["doc_id"], list(keywords)) @@ -49,17 +51,18 @@ class Jieba(BaseKeyword): keyword_table = self._get_dataset_keyword_table() keywords_list = kwargs.get("keywords_list") + keyword_number = self.dataset.keyword_number if self.dataset.keyword_number else self._config.max_keywords_per_chunk for i in range(len(texts)): text = texts[i] if keywords_list: keywords = keywords_list[i] if not keywords: keywords = keyword_table_handler.extract_keywords( - text.page_content, self._config.max_keywords_per_chunk + text.page_content, keyword_number ) else: keywords = keyword_table_handler.extract_keywords( - text.page_content, self._config.max_keywords_per_chunk + text.page_content, keyword_number ) if text.metadata is not None: self._update_segment_keywords(self.dataset.id, text.metadata["doc_id"], list(keywords)) @@ -239,7 +242,9 @@ class Jieba(BaseKeyword): keyword_table or {}, segment.index_node_id, pre_segment_data["keywords"] ) else: - keywords = keyword_table_handler.extract_keywords(segment.content, self._config.max_keywords_per_chunk) + keyword_number = self.dataset.keyword_number if self.dataset.keyword_number else self._config.max_keywords_per_chunk + + keywords = keyword_table_handler.extract_keywords(segment.content, keyword_number) segment.keywords = list(keywords) keyword_table = self._add_text_to_keyword_table( keyword_table or {}, segment.index_node_id, list(keywords)