mirror of https://github.com/langgenius/dify.git
Fix 500 error in the knowledge base when selecting weightedScore and clicking retrieve. (#28586)
Signed-off-by: -LAN- <laipz8200@outlook.com> Co-authored-by: -LAN- <laipz8200@outlook.com> Co-authored-by: Copilot <175728472+Copilot@users.noreply.github.com>
This commit is contained in:
parent
d696b9f35e
commit
d7010f582f
|
|
@ -1,20 +1,110 @@
|
|||
import re
|
||||
from operator import itemgetter
|
||||
from typing import cast
|
||||
|
||||
|
||||
class JiebaKeywordTableHandler:
    """Keyword extraction handler backed by a jieba TF-IDF extractor."""

    def __init__(self):
        # Local import: the stopword list is only needed at construction time.
        from core.rag.datasource.keyword.jieba.stopwords import STOPWORDS

        extractor = self._load_tfidf_extractor()
        # Swap jieba's built-in stopword set for the project-wide one.
        extractor.stop_words = STOPWORDS  # type: ignore[attr-defined]
        self._tfidf = extractor
||||
def _load_tfidf_extractor(self):
    """Return a jieba TF-IDF extractor, trying progressively weaker sources.

    Resolution order:
    1. ``jieba.analyse.default_tfidf`` — reused as-is if jieba already
       built its default extractor.
    2. ``jieba.analyse.TFIDF`` (or ``jieba.analyse.tfidf.TFIDF`` via a
       direct submodule import) — instantiated and cached back onto
       ``jieba.analyse.default_tfidf`` so later calls hit the fast path.
    3. A lightweight pure-Python substitute from ``_build_fallback_tfidf``
       for jieba builds that expose no TFIDF implementation at all.
    """
    import jieba.analyse  # type: ignore

    # Fast path: jieba has already materialised its default extractor.
    existing = getattr(jieba.analyse, "default_tfidf", None)
    if existing is not None:
        return existing

    # Locate the TFIDF class: first as a package attribute, then via a
    # direct submodule import.
    extractor_cls = getattr(jieba.analyse, "TFIDF", None)
    if extractor_cls is None:
        try:
            from jieba.analyse.tfidf import TFIDF  # type: ignore

            extractor_cls = TFIDF
        except Exception:
            # Broad on purpose: any import failure routes to the fallback.
            extractor_cls = None

    if extractor_cls is None:
        # No usable TFIDF implementation in this jieba installation.
        return self._build_fallback_tfidf()

    instance = extractor_cls()
    # Cache the instance so subsequent calls take the fast path above.
    jieba.analyse.default_tfidf = instance  # type: ignore[attr-defined]
    return instance
||||
@staticmethod
def _build_fallback_tfidf():
    """Build a minimal frequency-based stand-in for jieba's TFIDF.

    Used only when the installed jieba exposes neither a default TFIDF
    instance nor a TFIDF class. The stand-in mimics the ``extract_tags``
    interface (including jieba's camelCase ``topK`` keyword) but ranks
    tokens by raw frequency instead of TF-IDF weight.
    """
    import jieba  # type: ignore

    from core.rag.datasource.keyword.jieba.stopwords import STOPWORDS

    class _SimpleTFIDF:
        def __init__(self):
            # Mirror the attribute jieba's TFIDF exposes so callers can
            # overwrite the stopword set after construction.
            self.stop_words = STOPWORDS
            self._lcut = getattr(jieba, "lcut", None)

        def extract_tags(self, sentence: str, top_k: int | None = 20, **kwargs):
            # Basic frequency-based keyword extraction as a fallback when
            # TF-IDF is unavailable. Honour jieba's camelCase keyword if
            # the caller passed it.
            top_k = kwargs.pop("topK", top_k)

            segmenter = getattr(jieba, "cut", None)
            if self._lcut:
                tokens = self._lcut(sentence)
            elif callable(segmenter):
                tokens = list(segmenter(sentence))
            else:
                # Last resort: split on word characters.
                tokens = re.findall(r"\w+", sentence)

            # Count occurrences of non-empty, non-stopword tokens.
            counts: dict[str, int] = {}
            for token in tokens:
                if token and token not in self.stop_words:
                    counts[token] = counts.get(token, 0) + 1

            ranked = sorted(counts.items(), key=itemgetter(1), reverse=True)
            if top_k is not None:
                ranked = ranked[:top_k]

            return [word for word, _ in ranked]

    return _SimpleTFIDF()
||||
def extract_keywords(self, text: str, max_keywords_per_chunk: int | None = 10) -> set[str]:
|
||||
"""Extract keywords with JIEBA tfidf."""
|
||||
import jieba.analyse # type: ignore
|
||||
|
||||
keywords = jieba.analyse.extract_tags(
|
||||
keywords = self._tfidf.extract_tags(
|
||||
sentence=text,
|
||||
topK=max_keywords_per_chunk,
|
||||
)
|
||||
|
|
|
|||
Loading…
Reference in New Issue