Fix 500 error in knowledge base when selecting weightedScore and clicking retrieve. (#28586)

Signed-off-by: -LAN- <laipz8200@outlook.com>
Co-authored-by: -LAN- <laipz8200@outlook.com>
Co-authored-by: Copilot <175728472+Copilot@users.noreply.github.com>
This commit is contained in:
Eric Guo 2025-11-26 16:44:00 +08:00 committed by GitHub
parent d696b9f35e
commit d7010f582f
No known key found for this signature in database
GPG Key ID: B5690EEEBB952194
1 changed files with 94 additions and 4 deletions

View File

@ -1,20 +1,110 @@
import re
from operator import itemgetter
from typing import cast
class JiebaKeywordTableHandler:
def __init__(self):
    """Load a TF-IDF extractor, attach the project's stop-word list, and keep it on the instance."""
    from core.rag.datasource.keyword.jieba.stopwords import STOPWORDS

    # The extractor is resolved once per handler and reused by later calls.
    self._tfidf = self._load_tfidf_extractor()
    self._tfidf.stop_words = STOPWORDS  # type: ignore[attr-defined]
def _load_tfidf_extractor(self):
    """
    Return a TF-IDF extractor, preferring jieba's own implementation.

    Resolution order:
      1. ``jieba.analyse.default_tfidf`` when that attribute exists and is not None.
      2. A new instance of ``jieba.analyse.TFIDF`` or, if that attribute is absent,
         of ``jieba.analyse.tfidf.TFIDF``. The instance is cached as
         ``jieba.analyse.default_tfidf`` before being returned.
      3. The object produced by ``_build_fallback_tfidf`` when no TFIDF class
         could be obtained.
    """
    import jieba.analyse  # type: ignore

    analyse = jieba.analyse

    existing = getattr(analyse, "default_tfidf", None)
    if existing is not None:
        return existing

    extractor_cls = getattr(analyse, "TFIDF", None)
    if extractor_cls is None:
        # Any failure while importing the submodule is treated as "class not available".
        try:
            from jieba.analyse.tfidf import TFIDF as extractor_cls  # type: ignore
        except Exception:
            extractor_cls = None

    if extractor_cls is None:
        return self._build_fallback_tfidf()

    extractor = extractor_cls()
    # Cache on the jieba module so the next lookup takes the first branch.
    analyse.default_tfidf = extractor  # type: ignore[attr-defined]
    return extractor
@staticmethod
def _build_fallback_tfidf():
    """
    Build a lightweight keyword extractor for environments missing jieba's TFIDF.

    The returned object exposes a ``stop_words`` attribute and an ``extract_tags``
    method so it can stand in for a jieba TFIDF instance. Keywords are ranked by
    raw term frequency, not by TF-IDF weight.
    """
    import jieba  # type: ignore

    from core.rag.datasource.keyword.jieba.stopwords import STOPWORDS

    # Do not touch ``jieba.analyse.default_tfidf`` here: this fallback is only built
    # when that attribute is unavailable, so dereferencing it raises AttributeError.

    class _SimpleTFIDF:
        def __init__(self):
            self.stop_words = STOPWORDS
            self._lcut = getattr(jieba, "lcut", None)

        def extract_tags(self, sentence: str, top_k: int | None = 20, **kwargs):
            """
            Return the most frequent non-stop-word tokens of ``sentence``.

            ``topK`` (jieba's keyword name) overrides ``top_k`` when given; ``None``
            disables the limit. Other keyword arguments are accepted and ignored.
            Tokens with equal counts keep their first-occurrence order.
            """
            # Basic frequency-based keyword extraction as a fallback when TF-IDF is unavailable.
            top_k = kwargs.pop("topK", top_k)

            cut = getattr(jieba, "cut", None)
            if self._lcut:
                tokens = self._lcut(sentence)
            elif callable(cut):
                tokens = list(cut(sentence))
            else:
                # No jieba tokenizer at all: split on runs of word characters.
                tokens = re.findall(r"\w+", sentence)

            # jieba tokenizers emit whitespace as tokens; those are never keywords.
            freq: dict[str, int] = {}
            for w in tokens:
                if not w.strip() or w in self.stop_words:
                    continue
                freq[w] = freq.get(w, 0) + 1

            # sorted() is stable, so ties stay in insertion (first-seen) order.
            sorted_words = sorted(freq.items(), key=itemgetter(1), reverse=True)
            if top_k is not None:
                sorted_words = sorted_words[:top_k]

            return [word for word, _ in sorted_words]

    return _SimpleTFIDF()
def extract_keywords(self, text: str, max_keywords_per_chunk: int | None = 10) -> set[str]:
"""Extract keywords with JIEBA tfidf."""
import jieba.analyse # type: ignore
keywords = jieba.analyse.extract_tags(
keywords = self._tfidf.extract_tags(
sentence=text,
topK=max_keywords_per_chunk,
)