diff --git a/.github/workflows/translate-i18n-claude.yml b/.github/workflows/translate-i18n-claude.yml index 0e05913576..003e7ffc6e 100644 --- a/.github/workflows/translate-i18n-claude.yml +++ b/.github/workflows/translate-i18n-claude.yml @@ -1,10 +1,12 @@ name: Translate i18n Files with Claude Code +# Note: claude-code-action doesn't support push events directly. +# Push events are handled by trigger-i18n-sync.yml which sends repository_dispatch. +# See: https://github.com/langgenius/dify/issues/30743 + on: - push: - branches: [main] - paths: - - 'web/i18n/en-US/*.json' + repository_dispatch: + types: [i18n-sync] workflow_dispatch: inputs: files: @@ -87,26 +89,35 @@ jobs: echo "DIFF_AVAILABLE=false" >> $GITHUB_OUTPUT fi fi - else - # Push trigger - detect changed files from the push - BEFORE_SHA="${{ github.event.before }}" - # Handle edge case: first push or force push may have null/zero SHA - if [ -z "$BEFORE_SHA" ] || [ "$BEFORE_SHA" = "0000000000000000000000000000000000000000" ]; then - # Fallback to comparing with parent commit - BEFORE_SHA="HEAD~1" + elif [ "${{ github.event_name }}" == "repository_dispatch" ]; then + # Triggered by push via trigger-i18n-sync.yml workflow + # Validate required payload fields + if [ -z "${{ github.event.client_payload.changed_files }}" ]; then + echo "Error: repository_dispatch payload missing required 'changed_files' field" >&2 + exit 1 fi - changed=$(git diff --name-only "$BEFORE_SHA" ${{ github.sha }} -- 'web/i18n/en-US/*.json' 2>/dev/null | xargs -n1 basename 2>/dev/null | sed 's/.json$//' | tr '\n' ' ' || echo "") - echo "CHANGED_FILES=$changed" >> $GITHUB_OUTPUT + echo "CHANGED_FILES=${{ github.event.client_payload.changed_files }}" >> $GITHUB_OUTPUT echo "TARGET_LANGS=" >> $GITHUB_OUTPUT - echo "SYNC_MODE=incremental" >> $GITHUB_OUTPUT + echo "SYNC_MODE=${{ github.event.client_payload.sync_mode || 'incremental' }}" >> $GITHUB_OUTPUT - # Generate detailed diff for the push - git diff "$BEFORE_SHA"..${{ github.sha }} -- 
'web/i18n/en-US/*.json' > /tmp/i18n-diff.txt 2>/dev/null || echo "" > /tmp/i18n-diff.txt - if [ -s /tmp/i18n-diff.txt ]; then - echo "DIFF_AVAILABLE=true" >> $GITHUB_OUTPUT + # Decode the base64-encoded diff from the trigger workflow + if [ -n "${{ github.event.client_payload.diff_base64 }}" ]; then + if ! echo "${{ github.event.client_payload.diff_base64 }}" | base64 -d > /tmp/i18n-diff.txt 2>&1; then + echo "Warning: Failed to decode base64 diff payload" >&2 + echo "" > /tmp/i18n-diff.txt + echo "DIFF_AVAILABLE=false" >> $GITHUB_OUTPUT + elif [ -s /tmp/i18n-diff.txt ]; then + echo "DIFF_AVAILABLE=true" >> $GITHUB_OUTPUT + else + echo "DIFF_AVAILABLE=false" >> $GITHUB_OUTPUT + fi else + echo "" > /tmp/i18n-diff.txt echo "DIFF_AVAILABLE=false" >> $GITHUB_OUTPUT fi + else + echo "Unsupported event type: ${{ github.event_name }}" + exit 1 fi # Truncate diff if too large (keep first 50KB) diff --git a/.github/workflows/trigger-i18n-sync.yml b/.github/workflows/trigger-i18n-sync.yml new file mode 100644 index 0000000000..de093c9235 --- /dev/null +++ b/.github/workflows/trigger-i18n-sync.yml @@ -0,0 +1,66 @@ +name: Trigger i18n Sync on Push + +# This workflow bridges the push event to repository_dispatch +# because claude-code-action doesn't support push events directly. 
+# See: https://github.com/langgenius/dify/issues/30743 + +on: + push: + branches: [main] + paths: + - 'web/i18n/en-US/*.json' + +permissions: + contents: write + +jobs: + trigger: + if: github.repository == 'langgenius/dify' + runs-on: ubuntu-latest + timeout-minutes: 5 + + steps: + - name: Checkout repository + uses: actions/checkout@v4 + with: + fetch-depth: 0 + + - name: Detect changed files and generate diff + id: detect + run: | + BEFORE_SHA="${{ github.event.before }}" + # Handle edge case: force push may have null/zero SHA + if [ -z "$BEFORE_SHA" ] || [ "$BEFORE_SHA" = "0000000000000000000000000000000000000000" ]; then + BEFORE_SHA="HEAD~1" + fi + + # Detect changed i18n files + changed=$(git diff --name-only "$BEFORE_SHA" "${{ github.sha }}" -- 'web/i18n/en-US/*.json' 2>/dev/null | xargs -n1 basename 2>/dev/null | sed 's/.json$//' | tr '\n' ' ' || echo "") + echo "changed_files=$changed" >> $GITHUB_OUTPUT + + # Generate diff for context + git diff "$BEFORE_SHA" "${{ github.sha }}" -- 'web/i18n/en-US/*.json' > /tmp/i18n-diff.txt 2>/dev/null || echo "" > /tmp/i18n-diff.txt + + # Truncate if too large (keep first 50KB to match receiving workflow) + head -c 50000 /tmp/i18n-diff.txt > /tmp/i18n-diff-truncated.txt + mv /tmp/i18n-diff-truncated.txt /tmp/i18n-diff.txt + + # Base64 encode the diff for safe JSON transport (portable, single-line) + diff_base64=$(base64 < /tmp/i18n-diff.txt | tr -d '\n') + echo "diff_base64=$diff_base64" >> $GITHUB_OUTPUT + + if [ -n "$changed" ]; then + echo "has_changes=true" >> $GITHUB_OUTPUT + echo "Detected changed files: $changed" + else + echo "has_changes=false" >> $GITHUB_OUTPUT + echo "No i18n changes detected" + fi + + - name: Trigger i18n sync workflow + if: steps.detect.outputs.has_changes == 'true' + uses: peter-evans/repository-dispatch@v3 + with: + token: ${{ secrets.GITHUB_TOKEN }} + event-type: i18n-sync + client-payload: '{"changed_files": "${{ steps.detect.outputs.changed_files }}", "diff_base64": "${{ 
steps.detect.outputs.diff_base64 }}", "sync_mode": "incremental", "trigger_sha": "${{ github.sha }}"}' diff --git a/api/configs/feature/__init__.py b/api/configs/feature/__init__.py index cf855b1cc0..99ac618bcb 100644 --- a/api/configs/feature/__init__.py +++ b/api/configs/feature/__init__.py @@ -959,6 +959,16 @@ class MailConfig(BaseSettings): default=None, ) + ENABLE_TRIAL_APP: bool = Field( + description="Enable trial app", + default=False, + ) + + ENABLE_EXPLORE_BANNER: bool = Field( + description="Enable explore banner", + default=False, + ) + class RagEtlConfig(BaseSettings): """ diff --git a/api/controllers/console/__init__.py b/api/controllers/console/__init__.py index ad878fc266..fdc9aabc83 100644 --- a/api/controllers/console/__init__.py +++ b/api/controllers/console/__init__.py @@ -107,10 +107,12 @@ from .datasets.rag_pipeline import ( # Import explore controllers from .explore import ( + banner, installed_app, parameter, recommended_app, saved_message, + trial, ) # Import tag controllers @@ -145,6 +147,7 @@ __all__ = [ "apikey", "app", "audio", + "banner", "billing", "bp", "completion", @@ -198,6 +201,7 @@ __all__ = [ "statistic", "tags", "tool_providers", + "trial", "trigger_providers", "version", "website", diff --git a/api/controllers/console/admin.py b/api/controllers/console/admin.py index a25ca5ef51..978df15cf1 100644 --- a/api/controllers/console/admin.py +++ b/api/controllers/console/admin.py @@ -15,7 +15,7 @@ from controllers.console.wraps import only_edition_cloud from core.db.session_factory import session_factory from extensions.ext_database import db from libs.token import extract_access_token -from models.model import App, InstalledApp, RecommendedApp +from models.model import App, ExporleBanner, InstalledApp, RecommendedApp, TrialApp P = ParamSpec("P") R = TypeVar("R") @@ -32,6 +32,8 @@ class InsertExploreAppPayload(BaseModel): language: str = Field(...) category: str = Field(...) position: int = Field(...) 
+ can_trial: bool = Field(default=False) + trial_limit: int = Field(default=0) @field_validator("language") @classmethod @@ -39,11 +41,33 @@ class InsertExploreAppPayload(BaseModel): return supported_language(value) +class InsertExploreBannerPayload(BaseModel): + category: str = Field(...) + title: str = Field(...) + description: str = Field(...) + img_src: str = Field(..., alias="img-src") + language: str = Field(default="en-US") + link: str = Field(...) + sort: int = Field(...) + + @field_validator("language") + @classmethod + def validate_language(cls, value: str) -> str: + return supported_language(value) + + model_config = {"populate_by_name": True} + + console_ns.schema_model( InsertExploreAppPayload.__name__, InsertExploreAppPayload.model_json_schema(ref_template=DEFAULT_REF_TEMPLATE_SWAGGER_2_0), ) +console_ns.schema_model( + InsertExploreBannerPayload.__name__, + InsertExploreBannerPayload.model_json_schema(ref_template=DEFAULT_REF_TEMPLATE_SWAGGER_2_0), +) + def admin_required(view: Callable[P, R]): @wraps(view) @@ -109,6 +133,20 @@ class InsertExploreAppListApi(Resource): ) db.session.add(recommended_app) + if payload.can_trial: + trial_app = db.session.execute( + select(TrialApp).where(TrialApp.app_id == payload.app_id) + ).scalar_one_or_none() + if not trial_app: + db.session.add( + TrialApp( + app_id=payload.app_id, + tenant_id=app.tenant_id, + trial_limit=payload.trial_limit, + ) + ) + else: + trial_app.trial_limit = payload.trial_limit app.is_public = True db.session.commit() @@ -123,6 +161,20 @@ class InsertExploreAppListApi(Resource): recommended_app.category = payload.category recommended_app.position = payload.position + if payload.can_trial: + trial_app = db.session.execute( + select(TrialApp).where(TrialApp.app_id == payload.app_id) + ).scalar_one_or_none() + if not trial_app: + db.session.add( + TrialApp( + app_id=payload.app_id, + tenant_id=app.tenant_id, + trial_limit=payload.trial_limit, + ) + ) + else: + trial_app.trial_limit = 
payload.trial_limit app.is_public = True db.session.commit() @@ -168,7 +220,62 @@ class InsertExploreAppApi(Resource): for installed_app in installed_apps: session.delete(installed_app) + trial_app = session.execute( + select(TrialApp).where(TrialApp.app_id == recommended_app.app_id) + ).scalar_one_or_none() + if trial_app: + session.delete(trial_app) + db.session.delete(recommended_app) db.session.commit() return {"result": "success"}, 204 + + +@console_ns.route("/admin/insert-explore-banner") +class InsertExploreBannerApi(Resource): + @console_ns.doc("insert_explore_banner") + @console_ns.doc(description="Insert an explore banner") + @console_ns.expect(console_ns.models[InsertExploreBannerPayload.__name__]) + @console_ns.response(201, "Banner inserted successfully") + @only_edition_cloud + @admin_required + def post(self): + payload = InsertExploreBannerPayload.model_validate(console_ns.payload) + + content = { + "category": payload.category, + "title": payload.title, + "description": payload.description, + "img-src": payload.img_src, + } + + banner = ExporleBanner( + content=content, + link=payload.link, + sort=payload.sort, + language=payload.language, + ) + db.session.add(banner) + db.session.commit() + + return {"result": "success"}, 201 + + +@console_ns.route("/admin/insert-explore-banner/") +class DeleteExploreBannerApi(Resource): + @console_ns.doc("delete_explore_banner") + @console_ns.doc(description="Delete an explore banner") + @console_ns.doc(params={"banner_id": "Banner ID to delete"}) + @console_ns.response(204, "Banner deleted successfully") + @only_edition_cloud + @admin_required + def delete(self, banner_id): + banner = db.session.execute(select(ExporleBanner).where(ExporleBanner.id == banner_id)).scalar_one_or_none() + if not banner: + raise NotFound(f"Banner '{banner_id}' is not found") + + db.session.delete(banner) + db.session.commit() + + return {"result": "success"}, 204 diff --git a/api/controllers/console/app/error.py 
b/api/controllers/console/app/error.py index fbd7901646..6b4bd6755a 100644 --- a/api/controllers/console/app/error.py +++ b/api/controllers/console/app/error.py @@ -115,3 +115,9 @@ class InvokeRateLimitError(BaseHTTPException): error_code = "rate_limit_error" description = "Rate Limit Error" code = 429 + + +class NeedAddIdsError(BaseHTTPException): + error_code = "need_add_ids" + description = "Need to add ids." + code = 400 diff --git a/api/controllers/console/app/wraps.py b/api/controllers/console/app/wraps.py index 9bb2718f89..e687d980fa 100644 --- a/api/controllers/console/app/wraps.py +++ b/api/controllers/console/app/wraps.py @@ -23,6 +23,11 @@ def _load_app_model(app_id: str) -> App | None: return app_model +def _load_app_model_with_trial(app_id: str) -> App | None: + app_model = db.session.query(App).where(App.id == app_id, App.status == "normal").first() + return app_model + + def get_app_model(view: Callable[P, R] | None = None, *, mode: Union[AppMode, list[AppMode], None] = None): def decorator(view_func: Callable[P1, R1]): @wraps(view_func) @@ -62,3 +67,44 @@ def get_app_model(view: Callable[P, R] | None = None, *, mode: Union[AppMode, li return decorator else: return decorator(view) + + +def get_app_model_with_trial(view: Callable[P, R] | None = None, *, mode: Union[AppMode, list[AppMode], None] = None): + def decorator(view_func: Callable[P, R]): + @wraps(view_func) + def decorated_view(*args: P.args, **kwargs: P.kwargs): + if not kwargs.get("app_id"): + raise ValueError("missing app_id in path parameters") + + app_id = kwargs.get("app_id") + app_id = str(app_id) + + del kwargs["app_id"] + + app_model = _load_app_model_with_trial(app_id) + + if not app_model: + raise AppNotFoundError() + + app_mode = AppMode.value_of(app_model.mode) + + if mode is not None: + if isinstance(mode, list): + modes = mode + else: + modes = [mode] + + if app_mode not in modes: + mode_values = {m.value for m in modes} + raise AppNotFoundError(f"App mode is not in the 
supported list: {mode_values}") + + kwargs["app_model"] = app_model + + return view_func(*args, **kwargs) + + return decorated_view + + if view is None: + return decorator + else: + return decorator(view) diff --git a/api/controllers/console/explore/banner.py b/api/controllers/console/explore/banner.py new file mode 100644 index 0000000000..da306fbc9d --- /dev/null +++ b/api/controllers/console/explore/banner.py @@ -0,0 +1,43 @@ +from flask import request +from flask_restx import Resource + +from controllers.console import api +from controllers.console.explore.wraps import explore_banner_enabled +from extensions.ext_database import db +from models.model import ExporleBanner + + +class BannerApi(Resource): + """Resource for banner list.""" + + @explore_banner_enabled + def get(self): + """Get banner list.""" + language = request.args.get("language", "en-US") + + # Build base query for enabled banners + base_query = db.session.query(ExporleBanner).where(ExporleBanner.status == "enabled") + + # Try to get banners in the requested language + banners = base_query.where(ExporleBanner.language == language).order_by(ExporleBanner.sort).all() + + # Fallback to en-US if no banners found and language is not en-US + if not banners and language != "en-US": + banners = base_query.where(ExporleBanner.language == "en-US").order_by(ExporleBanner.sort).all() + # Convert banners to serializable format + result = [] + for banner in banners: + banner_data = { + "id": banner.id, + "content": banner.content, # Already parsed as JSON by SQLAlchemy + "link": banner.link, + "sort": banner.sort, + "status": banner.status, + "created_at": banner.created_at.isoformat() if banner.created_at else None, + } + result.append(banner_data) + + return result + + +api.add_resource(BannerApi, "/explore/banners") diff --git a/api/controllers/console/explore/error.py b/api/controllers/console/explore/error.py index 1e05ff4206..e96fa64f84 100644 --- a/api/controllers/console/explore/error.py +++ 
b/api/controllers/console/explore/error.py @@ -29,3 +29,25 @@ class AppAccessDeniedError(BaseHTTPException): error_code = "access_denied" description = "App access denied." code = 403 + + +class TrialAppNotAllowed(BaseHTTPException): + """*403* `Trial App Not Allowed` + + Raise if the user has reached the trial app limit. + """ + + error_code = "trial_app_not_allowed" + code = 403 + description = "the app is not allowed to be trial." + + +class TrialAppLimitExceeded(BaseHTTPException): + """*403* `Trial App Limit Exceeded` + + Raise if the user has exceeded the trial app limit. + """ + + error_code = "trial_app_limit_exceeded" + code = 403 + description = "The user has exceeded the trial app limit." diff --git a/api/controllers/console/explore/recommended_app.py b/api/controllers/console/explore/recommended_app.py index 2b2f807694..362513ec1c 100644 --- a/api/controllers/console/explore/recommended_app.py +++ b/api/controllers/console/explore/recommended_app.py @@ -29,6 +29,7 @@ recommended_app_fields = { "category": fields.String, "position": fields.Integer, "is_listed": fields.Boolean, + "can_trial": fields.Boolean, } recommended_app_list_fields = { diff --git a/api/controllers/console/explore/trial.py b/api/controllers/console/explore/trial.py new file mode 100644 index 0000000000..eb3c22cd0c --- /dev/null +++ b/api/controllers/console/explore/trial.py @@ -0,0 +1,514 @@ +import logging +from typing import Any, cast + +from flask import request +from flask_restx import Resource, marshal, marshal_with, reqparse +from werkzeug.exceptions import Forbidden, InternalServerError, NotFound + +import services +from controllers.common import fields +from controllers.common.fields import build_site_model +from controllers.console import api +from controllers.console.app.error import ( + AppUnavailableError, + AudioTooLargeError, + CompletionRequestError, + ConversationCompletedError, + NeedAddIdsError, + NoAudioUploadedError, + ProviderModelCurrentlyNotSupportError, + 
ProviderNotInitializeError, + ProviderNotSupportSpeechToTextError, + ProviderQuotaExceededError, + UnsupportedAudioTypeError, +) +from controllers.console.app.wraps import get_app_model_with_trial +from controllers.console.explore.error import ( + AppSuggestedQuestionsAfterAnswerDisabledError, + NotChatAppError, + NotCompletionAppError, + NotWorkflowAppError, +) +from controllers.console.explore.wraps import TrialAppResource, trial_feature_enable +from controllers.service_api import service_api_ns +from controllers.web.error import InvokeRateLimitError as InvokeRateLimitHttpError +from core.app.app_config.common.parameters_mapping import get_parameters_from_feature_dict +from core.app.apps.base_app_queue_manager import AppQueueManager +from core.app.entities.app_invoke_entities import InvokeFrom +from core.errors.error import ( + ModelCurrentlyNotSupportError, + ProviderTokenNotInitError, + QuotaExceededError, +) +from core.model_runtime.errors.invoke import InvokeError +from core.workflow.graph_engine.manager import GraphEngineManager +from extensions.ext_database import db +from fields.app_fields import app_detail_fields_with_site +from fields.dataset_fields import dataset_fields +from fields.workflow_fields import workflow_fields +from libs import helper +from libs.helper import uuid_value +from libs.login import current_user +from models import Account +from models.account import TenantStatus +from models.model import AppMode, Site +from models.workflow import Workflow +from services.app_generate_service import AppGenerateService +from services.app_service import AppService +from services.audio_service import AudioService +from services.dataset_service import DatasetService +from services.errors.audio import ( + AudioTooLargeServiceError, + NoAudioUploadedServiceError, + ProviderNotSupportSpeechToTextServiceError, + UnsupportedAudioTypeServiceError, +) +from services.errors.conversation import ConversationNotExistsError +from services.errors.llm import 
InvokeRateLimitError +from services.errors.message import ( + MessageNotExistsError, + SuggestedQuestionsAfterAnswerDisabledError, +) +from services.message_service import MessageService +from services.recommended_app_service import RecommendedAppService + +logger = logging.getLogger(__name__) + + +class TrialAppWorkflowRunApi(TrialAppResource): + def post(self, trial_app): + """ + Run workflow + """ + app_model = trial_app + if not app_model: + raise NotWorkflowAppError() + app_mode = AppMode.value_of(app_model.mode) + if app_mode != AppMode.WORKFLOW: + raise NotWorkflowAppError() + + parser = reqparse.RequestParser() + parser.add_argument("inputs", type=dict, required=True, nullable=False, location="json") + parser.add_argument("files", type=list, required=False, location="json") + args = parser.parse_args() + assert current_user is not None + try: + app_id = app_model.id + user_id = current_user.id + response = AppGenerateService.generate( + app_model=app_model, user=current_user, args=args, invoke_from=InvokeFrom.EXPLORE, streaming=True + ) + RecommendedAppService.add_trial_app_record(app_id, user_id) + return helper.compact_generate_response(response) + except ProviderTokenNotInitError as ex: + raise ProviderNotInitializeError(ex.description) + except QuotaExceededError: + raise ProviderQuotaExceededError() + except ModelCurrentlyNotSupportError: + raise ProviderModelCurrentlyNotSupportError() + except InvokeError as e: + raise CompletionRequestError(e.description) + except InvokeRateLimitError as ex: + raise InvokeRateLimitHttpError(ex.description) + except ValueError as e: + raise e + except Exception: + logger.exception("internal server error.") + raise InternalServerError() + + +class TrialAppWorkflowTaskStopApi(TrialAppResource): + def post(self, trial_app, task_id: str): + """ + Stop workflow task + """ + app_model = trial_app + if not app_model: + raise NotWorkflowAppError() + app_mode = AppMode.value_of(app_model.mode) + if app_mode != 
AppMode.WORKFLOW: + raise NotWorkflowAppError() + assert current_user is not None + + # Stop using both mechanisms for backward compatibility + # Legacy stop flag mechanism (without user check) + AppQueueManager.set_stop_flag_no_user_check(task_id) + + # New graph engine command channel mechanism + GraphEngineManager.send_stop_command(task_id) + + return {"result": "success"} + + +class TrialChatApi(TrialAppResource): + @trial_feature_enable + def post(self, trial_app): + app_model = trial_app + app_mode = AppMode.value_of(app_model.mode) + if app_mode not in {AppMode.CHAT, AppMode.AGENT_CHAT, AppMode.ADVANCED_CHAT}: + raise NotChatAppError() + + parser = reqparse.RequestParser() + parser.add_argument("inputs", type=dict, required=True, location="json") + parser.add_argument("query", type=str, required=True, location="json") + parser.add_argument("files", type=list, required=False, location="json") + parser.add_argument("conversation_id", type=uuid_value, location="json") + parser.add_argument("parent_message_id", type=uuid_value, required=False, location="json") + parser.add_argument("retriever_from", type=str, required=False, default="explore_app", location="json") + args = parser.parse_args() + + args["auto_generate_name"] = False + + try: + if not isinstance(current_user, Account): + raise ValueError("current_user must be an Account instance") + + # Get IDs before they might be detached from session + app_id = app_model.id + user_id = current_user.id + + response = AppGenerateService.generate( + app_model=app_model, user=current_user, args=args, invoke_from=InvokeFrom.EXPLORE, streaming=True + ) + RecommendedAppService.add_trial_app_record(app_id, user_id) + return helper.compact_generate_response(response) + except services.errors.conversation.ConversationNotExistsError: + raise NotFound("Conversation Not Exists.") + except services.errors.conversation.ConversationCompletedError: + raise ConversationCompletedError() + except 
services.errors.app_model_config.AppModelConfigBrokenError: + logger.exception("App model config broken.") + raise AppUnavailableError() + except ProviderTokenNotInitError as ex: + raise ProviderNotInitializeError(ex.description) + except QuotaExceededError: + raise ProviderQuotaExceededError() + except ModelCurrentlyNotSupportError: + raise ProviderModelCurrentlyNotSupportError() + except InvokeError as e: + raise CompletionRequestError(e.description) + except InvokeRateLimitError as ex: + raise InvokeRateLimitHttpError(ex.description) + except ValueError as e: + raise e + except Exception: + logger.exception("internal server error.") + raise InternalServerError() + + +class TrialMessageSuggestedQuestionApi(TrialAppResource): + @trial_feature_enable + def get(self, trial_app, message_id): + app_model = trial_app + app_mode = AppMode.value_of(app_model.mode) + if app_mode not in {AppMode.CHAT, AppMode.AGENT_CHAT, AppMode.ADVANCED_CHAT}: + raise NotChatAppError() + + message_id = str(message_id) + + try: + if not isinstance(current_user, Account): + raise ValueError("current_user must be an Account instance") + questions = MessageService.get_suggested_questions_after_answer( + app_model=app_model, user=current_user, message_id=message_id, invoke_from=InvokeFrom.EXPLORE + ) + except MessageNotExistsError: + raise NotFound("Message not found") + except ConversationNotExistsError: + raise NotFound("Conversation not found") + except SuggestedQuestionsAfterAnswerDisabledError: + raise AppSuggestedQuestionsAfterAnswerDisabledError() + except ProviderTokenNotInitError as ex: + raise ProviderNotInitializeError(ex.description) + except QuotaExceededError: + raise ProviderQuotaExceededError() + except ModelCurrentlyNotSupportError: + raise ProviderModelCurrentlyNotSupportError() + except InvokeError as e: + raise CompletionRequestError(e.description) + except Exception: + logger.exception("internal server error.") + raise InternalServerError() + + return {"data": questions} + 
+ +class TrialChatAudioApi(TrialAppResource): + @trial_feature_enable + def post(self, trial_app): + app_model = trial_app + + file = request.files["file"] + + try: + if not isinstance(current_user, Account): + raise ValueError("current_user must be an Account instance") + + # Get IDs before they might be detached from session + app_id = app_model.id + user_id = current_user.id + + response = AudioService.transcript_asr(app_model=app_model, file=file, end_user=None) + RecommendedAppService.add_trial_app_record(app_id, user_id) + return response + except services.errors.app_model_config.AppModelConfigBrokenError: + logger.exception("App model config broken.") + raise AppUnavailableError() + except NoAudioUploadedServiceError: + raise NoAudioUploadedError() + except AudioTooLargeServiceError as e: + raise AudioTooLargeError(str(e)) + except UnsupportedAudioTypeServiceError: + raise UnsupportedAudioTypeError() + except ProviderNotSupportSpeechToTextServiceError: + raise ProviderNotSupportSpeechToTextError() + except ProviderTokenNotInitError as ex: + raise ProviderNotInitializeError(ex.description) + except QuotaExceededError: + raise ProviderQuotaExceededError() + except ModelCurrentlyNotSupportError: + raise ProviderModelCurrentlyNotSupportError() + except InvokeError as e: + raise CompletionRequestError(e.description) + except ValueError as e: + raise e + except Exception as e: + logger.exception("internal server error.") + raise InternalServerError() + + +class TrialChatTextApi(TrialAppResource): + @trial_feature_enable + def post(self, trial_app): + app_model = trial_app + try: + parser = reqparse.RequestParser() + parser.add_argument("message_id", type=str, required=False, location="json") + parser.add_argument("voice", type=str, location="json") + parser.add_argument("text", type=str, location="json") + parser.add_argument("streaming", type=bool, location="json") + args = parser.parse_args() + + message_id = args.get("message_id", None) + text = 
args.get("text", None) + voice = args.get("voice", None) + if not isinstance(current_user, Account): + raise ValueError("current_user must be an Account instance") + + # Get IDs before they might be detached from session + app_id = app_model.id + user_id = current_user.id + + response = AudioService.transcript_tts(app_model=app_model, text=text, voice=voice, message_id=message_id) + RecommendedAppService.add_trial_app_record(app_id, user_id) + return response + except services.errors.app_model_config.AppModelConfigBrokenError: + logger.exception("App model config broken.") + raise AppUnavailableError() + except NoAudioUploadedServiceError: + raise NoAudioUploadedError() + except AudioTooLargeServiceError as e: + raise AudioTooLargeError(str(e)) + except UnsupportedAudioTypeServiceError: + raise UnsupportedAudioTypeError() + except ProviderNotSupportSpeechToTextServiceError: + raise ProviderNotSupportSpeechToTextError() + except ProviderTokenNotInitError as ex: + raise ProviderNotInitializeError(ex.description) + except QuotaExceededError: + raise ProviderQuotaExceededError() + except ModelCurrentlyNotSupportError: + raise ProviderModelCurrentlyNotSupportError() + except InvokeError as e: + raise CompletionRequestError(e.description) + except ValueError as e: + raise e + except Exception as e: + logger.exception("internal server error.") + raise InternalServerError() + + +class TrialCompletionApi(TrialAppResource): + @trial_feature_enable + def post(self, trial_app): + app_model = trial_app + if app_model.mode != "completion": + raise NotCompletionAppError() + + parser = reqparse.RequestParser() + parser.add_argument("inputs", type=dict, required=True, location="json") + parser.add_argument("query", type=str, location="json", default="") + parser.add_argument("files", type=list, required=False, location="json") + parser.add_argument("response_mode", type=str, choices=["blocking", "streaming"], location="json") + parser.add_argument("retriever_from", type=str, 
required=False, default="explore_app", location="json") + args = parser.parse_args() + + streaming = args["response_mode"] == "streaming" + args["auto_generate_name"] = False + + try: + if not isinstance(current_user, Account): + raise ValueError("current_user must be an Account instance") + + # Get IDs before they might be detached from session + app_id = app_model.id + user_id = current_user.id + + response = AppGenerateService.generate( + app_model=app_model, user=current_user, args=args, invoke_from=InvokeFrom.EXPLORE, streaming=streaming + ) + + RecommendedAppService.add_trial_app_record(app_id, user_id) + return helper.compact_generate_response(response) + except services.errors.conversation.ConversationNotExistsError: + raise NotFound("Conversation Not Exists.") + except services.errors.conversation.ConversationCompletedError: + raise ConversationCompletedError() + except services.errors.app_model_config.AppModelConfigBrokenError: + logger.exception("App model config broken.") + raise AppUnavailableError() + except ProviderTokenNotInitError as ex: + raise ProviderNotInitializeError(ex.description) + except QuotaExceededError: + raise ProviderQuotaExceededError() + except ModelCurrentlyNotSupportError: + raise ProviderModelCurrentlyNotSupportError() + except InvokeError as e: + raise CompletionRequestError(e.description) + except ValueError as e: + raise e + except Exception: + logger.exception("internal server error.") + raise InternalServerError() + + +class TrialSitApi(Resource): + """Resource for trial app sites.""" + + @trial_feature_enable + @get_app_model_with_trial + @service_api_ns.marshal_with(build_site_model(service_api_ns)) + def get(self, app_model): + """Retrieve app site info. + + Returns the site configuration for the application including theme, icons, and text. 
+ """ + site = db.session.query(Site).where(Site.app_id == app_model.id).first() + + if not site: + raise Forbidden() + + assert app_model.tenant + if app_model.tenant.status == TenantStatus.ARCHIVE: + raise Forbidden() + + return site + + +class TrialAppParameterApi(Resource): + """Resource for app variables.""" + + @trial_feature_enable + @get_app_model_with_trial + @marshal_with(fields.parameters_fields) + def get(self, app_model): + """Retrieve app parameters.""" + + if app_model is None: + raise AppUnavailableError() + + if app_model.mode in {AppMode.ADVANCED_CHAT, AppMode.WORKFLOW}: + workflow = app_model.workflow + if workflow is None: + raise AppUnavailableError() + + features_dict = workflow.features_dict + user_input_form = workflow.user_input_form(to_old_structure=True) + else: + app_model_config = app_model.app_model_config + if app_model_config is None: + raise AppUnavailableError() + + features_dict = app_model_config.to_dict() + + user_input_form = features_dict.get("user_input_form", []) + + return get_parameters_from_feature_dict(features_dict=features_dict, user_input_form=user_input_form) + + +class AppApi(Resource): + @trial_feature_enable + @get_app_model_with_trial + @marshal_with(app_detail_fields_with_site) + def get(self, app_model): + """Get app detail""" + + app_service = AppService() + app_model = app_service.get_app(app_model) + + return app_model + + +class AppWorkflowApi(Resource): + @trial_feature_enable + @get_app_model_with_trial + @marshal_with(workflow_fields) + def get(self, app_model): + """Get workflow detail""" + if not app_model.workflow_id: + raise AppUnavailableError() + + workflow = ( + db.session.query(Workflow) + .where( + Workflow.id == app_model.workflow_id, + ) + .first() + ) + return workflow + + +class DatasetListApi(Resource): + @trial_feature_enable + @get_app_model_with_trial + def get(self, app_model): + page = request.args.get("page", default=1, type=int) + limit = request.args.get("limit", default=20, 
type=int) + ids = request.args.getlist("ids") + + tenant_id = app_model.tenant_id + if ids: + datasets, total = DatasetService.get_datasets_by_ids(ids, tenant_id) + else: + raise NeedAddIdsError() + + data = cast(list[dict[str, Any]], marshal(datasets, dataset_fields)) + + response = {"data": data, "has_more": len(datasets) == limit, "limit": limit, "total": total, "page": page} + return response + + +api.add_resource(TrialChatApi, "/trial-apps//chat-messages", endpoint="trial_app_chat_completion") + +api.add_resource( + TrialMessageSuggestedQuestionApi, + "/trial-apps//messages//suggested-questions", + endpoint="trial_app_suggested_question", +) + +api.add_resource(TrialChatAudioApi, "/trial-apps//audio-to-text", endpoint="trial_app_audio") +api.add_resource(TrialChatTextApi, "/trial-apps//text-to-audio", endpoint="trial_app_text") + +api.add_resource(TrialCompletionApi, "/trial-apps//completion-messages", endpoint="trial_app_completion") + +api.add_resource(TrialSitApi, "/trial-apps//site") + +api.add_resource(TrialAppParameterApi, "/trial-apps//parameters", endpoint="trial_app_parameters") + +api.add_resource(AppApi, "/trial-apps/", endpoint="trial_app") + +api.add_resource(TrialAppWorkflowRunApi, "/trial-apps//workflows/run", endpoint="trial_app_workflow_run") +api.add_resource(TrialAppWorkflowTaskStopApi, "/trial-apps//workflows/tasks//stop") + +api.add_resource(AppWorkflowApi, "/trial-apps//workflows", endpoint="trial_app_workflow") +api.add_resource(DatasetListApi, "/trial-apps//datasets", endpoint="trial_app_datasets") diff --git a/api/controllers/console/explore/wraps.py b/api/controllers/console/explore/wraps.py index 2a97d312aa..38f0a04904 100644 --- a/api/controllers/console/explore/wraps.py +++ b/api/controllers/console/explore/wraps.py @@ -2,14 +2,15 @@ from collections.abc import Callable from functools import wraps from typing import Concatenate, ParamSpec, TypeVar +from flask import abort from flask_restx import Resource from werkzeug.exceptions 
import NotFound -from controllers.console.explore.error import AppAccessDeniedError +from controllers.console.explore.error import AppAccessDeniedError, TrialAppLimitExceeded, TrialAppNotAllowed from controllers.console.wraps import account_initialization_required from extensions.ext_database import db from libs.login import current_account_with_tenant, login_required -from models import InstalledApp +from models import AccountTrialAppRecord, App, InstalledApp, TrialApp from services.enterprise.enterprise_service import EnterpriseService from services.feature_service import FeatureService @@ -71,6 +72,61 @@ def user_allowed_to_access_app(view: Callable[Concatenate[InstalledApp, P], R] | return decorator +def trial_app_required(view: Callable[Concatenate[App, P], R] | None = None): + def decorator(view: Callable[Concatenate[App, P], R]): + @wraps(view) + def decorated(app_id: str, *args: P.args, **kwargs: P.kwargs): + current_user, _ = current_account_with_tenant() + + trial_app = db.session.query(TrialApp).where(TrialApp.app_id == str(app_id)).first() + + if trial_app is None: + raise TrialAppNotAllowed() + app = trial_app.app + + if app is None: + raise TrialAppNotAllowed() + + account_trial_app_record = ( + db.session.query(AccountTrialAppRecord) + .where(AccountTrialAppRecord.account_id == current_user.id, AccountTrialAppRecord.app_id == app_id) + .first() + ) + if account_trial_app_record: + if account_trial_app_record.count >= trial_app.trial_limit: + raise TrialAppLimitExceeded() + + return view(app, *args, **kwargs) + + return decorated + + if view: + return decorator(view) + return decorator + + +def trial_feature_enable(view: Callable[..., R]) -> Callable[..., R]: + @wraps(view) + def decorated(*args, **kwargs): + features = FeatureService.get_system_features() + if not features.enable_trial_app: + abort(403, "Trial app feature is not enabled.") + return view(*args, **kwargs) + + return decorated + + +def explore_banner_enabled(view: Callable[..., R]) -> 
Callable[..., R]: + @wraps(view) + def decorated(*args, **kwargs): + features = FeatureService.get_system_features() + if not features.enable_explore_banner: + abort(403, "Explore banner feature is not enabled.") + return view(*args, **kwargs) + + return decorated + + class InstalledAppResource(Resource): # must be reversed if there are multiple decorators @@ -80,3 +136,13 @@ class InstalledAppResource(Resource): account_initialization_required, login_required, ] + + +class TrialAppResource(Resource): + # must be reversed if there are multiple decorators + + method_decorators = [ + trial_app_required, + account_initialization_required, + login_required, + ] diff --git a/api/migrations/versions/2025_10_23_1110-f9f6d18a37f9_add_table_explore_banner_and_trial.py b/api/migrations/versions/2025_10_23_1110-f9f6d18a37f9_add_table_explore_banner_and_trial.py new file mode 100644 index 0000000000..ea2145c2d5 --- /dev/null +++ b/api/migrations/versions/2025_10_23_1110-f9f6d18a37f9_add_table_explore_banner_and_trial.py @@ -0,0 +1,73 @@ +"""add table explore banner and trial + +Revision ID: f9f6d18a37f9 +Revises: ae662b25d9bc +Create Date: 2025-10-23 11:10:18.079355 + +""" +from alembic import op +import models as models +import sqlalchemy as sa +from sqlalchemy.dialects import postgresql + +# revision identifiers, used by Alembic. +revision = 'f9f6d18a37f9' +down_revision = 'ae662b25d9bc' +branch_labels = None +depends_on = None + + +def upgrade(): + # ### commands auto generated by Alembic - please adjust! 
### + op.create_table('account_trial_app_records', + sa.Column('id', models.types.StringUUID(), server_default=sa.text('uuid_generate_v4()'), nullable=False), + sa.Column('account_id', models.types.StringUUID(), nullable=False), + sa.Column('app_id', models.types.StringUUID(), nullable=False), + sa.Column('count', sa.Integer(), nullable=False), + sa.Column('created_at', sa.DateTime(), server_default=sa.text('CURRENT_TIMESTAMP'), nullable=False), + sa.PrimaryKeyConstraint('id', name='user_trial_app_pkey'), + sa.UniqueConstraint('account_id', 'app_id', name='unique_account_trial_app_record') + ) + with op.batch_alter_table('account_trial_app_records', schema=None) as batch_op: + batch_op.create_index('account_trial_app_record_account_id_idx', ['account_id'], unique=False) + batch_op.create_index('account_trial_app_record_app_id_idx', ['app_id'], unique=False) + + op.create_table('exporle_banners', + sa.Column('id', models.types.StringUUID(), server_default=sa.text('uuid_generate_v4()'), nullable=False), + sa.Column('content', sa.JSON(), nullable=False), + sa.Column('link', sa.String(length=255), nullable=False), + sa.Column('sort', sa.Integer(), nullable=False), + sa.Column('status', sa.String(length=255), server_default=sa.text("'enabled'::character varying"), nullable=False), + sa.Column('created_at', sa.DateTime(), server_default=sa.text('CURRENT_TIMESTAMP'), nullable=False), + sa.Column('language', sa.String(length=255), server_default=sa.text("'en-US'::character varying"), nullable=False), + sa.PrimaryKeyConstraint('id', name='exporler_banner_pkey') + ) + op.create_table('trial_apps', + sa.Column('id', models.types.StringUUID(), server_default=sa.text('uuid_generate_v4()'), nullable=False), + sa.Column('app_id', models.types.StringUUID(), nullable=False), + sa.Column('tenant_id', models.types.StringUUID(), nullable=False), + sa.Column('created_at', sa.DateTime(), server_default=sa.text('CURRENT_TIMESTAMP'), nullable=False), + sa.Column('trial_limit', 
sa.Integer(), nullable=False), + sa.PrimaryKeyConstraint('id', name='trial_app_pkey'), + sa.UniqueConstraint('app_id', name='unique_trail_app_id') + ) + with op.batch_alter_table('trial_apps', schema=None) as batch_op: + batch_op.create_index('trial_app_app_id_idx', ['app_id'], unique=False) + batch_op.create_index('trial_app_tenant_id_idx', ['tenant_id'], unique=False) + # ### end Alembic commands ### + + +def downgrade(): + # ### commands auto generated by Alembic - please adjust! ### + with op.batch_alter_table('trial_apps', schema=None) as batch_op: + batch_op.drop_index('trial_app_tenant_id_idx') + batch_op.drop_index('trial_app_app_id_idx') + + op.drop_table('trial_apps') + op.drop_table('exporle_banners') + with op.batch_alter_table('account_trial_app_records', schema=None) as batch_op: + batch_op.drop_index('account_trial_app_record_app_id_idx') + batch_op.drop_index('account_trial_app_record_account_id_idx') + + op.drop_table('account_trial_app_records') + # ### end Alembic commands ### diff --git a/api/models/__init__.py b/api/models/__init__.py index 7b81cea415..4c2ef0fb71 100644 --- a/api/models/__init__.py +++ b/api/models/__init__.py @@ -35,6 +35,7 @@ from .enums import ( WorkflowTriggerStatus, ) from .model import ( + AccountTrialAppRecord, ApiRequest, ApiToken, App, @@ -47,6 +48,7 @@ from .model import ( DatasetRetrieverResource, DifySetup, EndUser, + ExporleBanner, IconType, InstalledApp, LLMGenerationDetail, @@ -63,6 +65,7 @@ from .model import ( TagBinding, TenantCreditPool, TraceAppConfig, + TrialApp, UploadFile, ) from .oauth import DatasourceOauthParamConfig, DatasourceProvider @@ -115,6 +118,7 @@ __all__ = [ "Account", "AccountIntegrate", "AccountStatus", + "AccountTrialAppRecord", "ApiRequest", "ApiToken", "ApiToolProvider", @@ -151,6 +155,7 @@ __all__ = [ "DocumentSegment", "Embedding", "EndUser", + "ExporleBanner", "ExternalKnowledgeApis", "ExternalKnowledgeBindings", "IconType", @@ -190,6 +195,7 @@ __all__ = [ "ToolLabelBinding", 
"ToolModelInvoke", "TraceAppConfig", + "TrialApp", "TriggerOAuthSystemClient", "TriggerOAuthTenantClient", "TriggerSubscription", diff --git a/api/models/model.py b/api/models/model.py index 76a78bdfba..a0e9e6a518 100644 --- a/api/models/model.py +++ b/api/models/model.py @@ -605,6 +605,64 @@ class InstalledApp(TypeBase): return tenant +class TrialApp(Base): + __tablename__ = "trial_apps" + __table_args__ = ( + sa.PrimaryKeyConstraint("id", name="trial_app_pkey"), + sa.Index("trial_app_app_id_idx", "app_id"), + sa.Index("trial_app_tenant_id_idx", "tenant_id"), + sa.UniqueConstraint("app_id", name="unique_trail_app_id"), + ) + + id = mapped_column(StringUUID, server_default=sa.text("uuid_generate_v4()")) + app_id = mapped_column(StringUUID, nullable=False) + tenant_id = mapped_column(StringUUID, nullable=False) + created_at = mapped_column(sa.DateTime, nullable=False, server_default=func.current_timestamp()) + trial_limit = mapped_column(sa.Integer, nullable=False, default=3) + + @property + def app(self) -> App | None: + app = db.session.query(App).where(App.id == self.app_id).first() + return app + + +class AccountTrialAppRecord(Base): + __tablename__ = "account_trial_app_records" + __table_args__ = ( + sa.PrimaryKeyConstraint("id", name="user_trial_app_pkey"), + sa.Index("account_trial_app_record_account_id_idx", "account_id"), + sa.Index("account_trial_app_record_app_id_idx", "app_id"), + sa.UniqueConstraint("account_id", "app_id", name="unique_account_trial_app_record"), + ) + id = mapped_column(StringUUID, server_default=sa.text("uuid_generate_v4()")) + account_id = mapped_column(StringUUID, nullable=False) + app_id = mapped_column(StringUUID, nullable=False) + count = mapped_column(sa.Integer, nullable=False, default=0) + created_at = mapped_column(sa.DateTime, nullable=False, server_default=func.current_timestamp()) + + @property + def app(self) -> App | None: + app = db.session.query(App).where(App.id == self.app_id).first() + return app + + @property + def 
user(self) -> Account | None: + user = db.session.query(Account).where(Account.id == self.account_id).first() + return user + + +class ExporleBanner(Base): + __tablename__ = "exporle_banners" + __table_args__ = (sa.PrimaryKeyConstraint("id", name="exporler_banner_pkey"),) + id = mapped_column(StringUUID, server_default=sa.text("uuid_generate_v4()")) + content = mapped_column(sa.JSON, nullable=False) + link = mapped_column(String(255), nullable=False) + sort = mapped_column(sa.Integer, nullable=False) + status = mapped_column(sa.String(255), nullable=False, server_default=sa.text("'enabled'::character varying")) + created_at = mapped_column(sa.DateTime, nullable=False, server_default=func.current_timestamp()) + language = mapped_column(String(255), nullable=False, server_default=sa.text("'en-US'::character varying")) + + class OAuthProviderApp(TypeBase): """ Globally shared OAuth provider app information. diff --git a/api/services/feature_service.py b/api/services/feature_service.py index 9b853b8337..fc91f450b7 100644 --- a/api/services/feature_service.py +++ b/api/services/feature_service.py @@ -170,6 +170,8 @@ class SystemFeatureModel(BaseModel): plugin_installation_permission: PluginInstallationPermissionModel = PluginInstallationPermissionModel() enable_change_email: bool = True plugin_manager: PluginManagerModel = PluginManagerModel() + enable_trial_app: bool = False + enable_explore_banner: bool = False class FeatureService: @@ -225,6 +227,8 @@ class FeatureService: system_features.is_allow_register = dify_config.ALLOW_REGISTER system_features.is_allow_create_workspace = dify_config.ALLOW_CREATE_WORKSPACE system_features.is_email_setup = dify_config.MAIL_TYPE is not None and dify_config.MAIL_TYPE != "" + system_features.enable_trial_app = dify_config.ENABLE_TRIAL_APP + system_features.enable_explore_banner = dify_config.ENABLE_EXPLORE_BANNER @classmethod def _fulfill_params_from_env(cls, features: FeatureModel): diff --git 
a/api/services/recommended_app_service.py b/api/services/recommended_app_service.py index 544383a106..6b211a5632 100644 --- a/api/services/recommended_app_service.py +++ b/api/services/recommended_app_service.py @@ -1,4 +1,7 @@ from configs import dify_config +from extensions.ext_database import db +from models.model import AccountTrialAppRecord, TrialApp +from services.feature_service import FeatureService from services.recommend_app.recommend_app_factory import RecommendAppRetrievalFactory @@ -20,6 +23,15 @@ class RecommendedAppService: ) ) + if FeatureService.get_system_features().enable_trial_app: + apps = result["recommended_apps"] + for app in apps: + app_id = app["app_id"] + trial_app_model = db.session.query(TrialApp).where(TrialApp.app_id == app_id).first() + if trial_app_model: + app["can_trial"] = True + else: + app["can_trial"] = False return result @classmethod @@ -32,4 +44,30 @@ class RecommendedAppService: mode = dify_config.HOSTED_FETCH_APP_TEMPLATES_MODE retrieval_instance = RecommendAppRetrievalFactory.get_recommend_app_factory(mode)() result: dict = retrieval_instance.get_recommend_app_detail(app_id) + if FeatureService.get_system_features().enable_trial_app: + app_id = result["id"] + trial_app_model = db.session.query(TrialApp).where(TrialApp.app_id == app_id).first() + if trial_app_model: + result["can_trial"] = True + else: + result["can_trial"] = False return result + + @classmethod + def add_trial_app_record(cls, app_id: str, account_id: str): + """ + Add trial app record. 
+ :param app_id: app id + :return: + """ + account_trial_app_record = ( + db.session.query(AccountTrialAppRecord) + .where(AccountTrialAppRecord.app_id == app_id, AccountTrialAppRecord.account_id == account_id) + .first() + ) + if account_trial_app_record: + account_trial_app_record.count += 1 + db.session.commit() + else: + db.session.add(AccountTrialAppRecord(app_id=app_id, count=1, account_id=account_id)) + db.session.commit() diff --git a/web/app/components/app/log/list.spec.tsx b/web/app/components/app/log/list.spec.tsx new file mode 100644 index 0000000000..81901c6cad --- /dev/null +++ b/web/app/components/app/log/list.spec.tsx @@ -0,0 +1,228 @@ +/** + * Tests for race condition prevention logic in chat message loading. + * These tests verify the core algorithms used in fetchData and loadMoreMessages + * to prevent race conditions, infinite loops, and stale state issues. + * See GitHub issue #30259 for context. + */ + +// Test the race condition prevention logic in isolation +describe('Chat Message Loading Race Condition Prevention', () => { + beforeEach(() => { + vi.clearAllMocks() + vi.useFakeTimers() + }) + + afterEach(() => { + vi.useRealTimers() + }) + + describe('Request Deduplication', () => { + it('should deduplicate messages with same IDs when merging responses', async () => { + // Simulate the deduplication logic used in setAllChatItems + const existingItems = [ + { id: 'msg-1', isAnswer: false }, + { id: 'msg-2', isAnswer: true }, + ] + const newItems = [ + { id: 'msg-2', isAnswer: true }, // duplicate + { id: 'msg-3', isAnswer: false }, // new + ] + + const existingIds = new Set(existingItems.map(item => item.id)) + const uniqueNewItems = newItems.filter(item => !existingIds.has(item.id)) + const mergedItems = [...uniqueNewItems, ...existingItems] + + expect(uniqueNewItems).toHaveLength(1) + expect(uniqueNewItems[0].id).toBe('msg-3') + expect(mergedItems).toHaveLength(3) + }) + }) + + describe('Retry Counter Logic', () => { + const MAX_RETRY_COUNT 
= 3 + + it('should increment retry counter when no unique items found', () => { + const state = { retryCount: 0 } + const prevItemsLength = 5 + + // Simulate the retry logic from loadMoreMessages + const uniqueNewItemsLength = 0 + + if (uniqueNewItemsLength === 0) { + if (state.retryCount < MAX_RETRY_COUNT && prevItemsLength > 1) { + state.retryCount++ + } + else { + state.retryCount = 0 + } + } + + expect(state.retryCount).toBe(1) + }) + + it('should reset retry counter after MAX_RETRY_COUNT attempts', () => { + const state = { retryCount: MAX_RETRY_COUNT } + const prevItemsLength = 5 + const uniqueNewItemsLength = 0 + + if (uniqueNewItemsLength === 0) { + if (state.retryCount < MAX_RETRY_COUNT && prevItemsLength > 1) { + state.retryCount++ + } + else { + state.retryCount = 0 + } + } + + expect(state.retryCount).toBe(0) + }) + + it('should reset retry counter when unique items are found', () => { + const state = { retryCount: 2 } + + // Simulate finding unique items (length > 0) + const processRetry = (uniqueCount: number) => { + if (uniqueCount === 0) { + state.retryCount++ + } + else { + state.retryCount = 0 + } + } + + processRetry(3) // Found 3 unique items + + expect(state.retryCount).toBe(0) + }) + }) + + describe('Throttling Logic', () => { + const SCROLL_DEBOUNCE_MS = 200 + + it('should throttle requests within debounce window', () => { + const state = { lastLoadTime: 0 } + const results: boolean[] = [] + + const tryRequest = (now: number): boolean => { + if (now - state.lastLoadTime >= SCROLL_DEBOUNCE_MS) { + state.lastLoadTime = now + return true + } + return false + } + + // First request - should pass + results.push(tryRequest(1000)) + // Second request within debounce - should be blocked + results.push(tryRequest(1100)) + // Third request after debounce - should pass + results.push(tryRequest(1300)) + + expect(results).toEqual([true, false, true]) + }) + }) + + describe('AbortController Cancellation', () => { + it('should abort previous request when 
new request starts', () => { + const state: { controller: AbortController | null } = { controller: null } + const abortedSignals: boolean[] = [] + + // First request + const controller1 = new AbortController() + state.controller = controller1 + + // Second request - should abort first + if (state.controller) { + state.controller.abort() + abortedSignals.push(state.controller.signal.aborted) + } + const controller2 = new AbortController() + state.controller = controller2 + + expect(abortedSignals).toEqual([true]) + expect(controller1.signal.aborted).toBe(true) + expect(controller2.signal.aborted).toBe(false) + }) + }) + + describe('Stale Response Detection', () => { + it('should ignore responses from outdated requests', () => { + const state = { requestId: 0 } + const processedResponses: number[] = [] + + // Simulate concurrent requests - each gets its own captured ID + const request1Id = ++state.requestId + const request2Id = ++state.requestId + + // Request 2 completes first (current requestId is 2) + if (request2Id === state.requestId) { + processedResponses.push(request2Id) + } + + // Request 1 completes later (stale - requestId is still 2) + if (request1Id === state.requestId) { + processedResponses.push(request1Id) + } + + expect(processedResponses).toEqual([2]) + expect(processedResponses).not.toContain(1) + }) + }) + + describe('Pagination Anchor Management', () => { + it('should track oldest answer ID for pagination', () => { + let oldestAnswerIdRef: string | undefined + + const chatItems = [ + { id: 'question-1', isAnswer: false }, + { id: 'answer-1', isAnswer: true }, + { id: 'question-2', isAnswer: false }, + { id: 'answer-2', isAnswer: true }, + ] + + // Update pagination anchor with oldest answer ID + const answerItems = chatItems.filter(item => item.isAnswer) + const oldestAnswer = answerItems[answerItems.length - 1] + if (oldestAnswer?.id) { + oldestAnswerIdRef = oldestAnswer.id + } + + expect(oldestAnswerIdRef).toBe('answer-2') + }) + + it('should 
use pagination anchor in subsequent requests', () => { + const oldestAnswerIdRef = 'answer-123' + const params: { conversation_id: string, limit: number, first_id?: string } = { + conversation_id: 'conv-1', + limit: 10, + } + + if (oldestAnswerIdRef) { + params.first_id = oldestAnswerIdRef + } + + expect(params.first_id).toBe('answer-123') + }) + }) +}) + +describe('Functional State Update Pattern', () => { + it('should use functional update to avoid stale closures', () => { + // Simulate the functional update pattern used in setAllChatItems + let state = [{ id: '1' }, { id: '2' }] + + const newItems = [{ id: '3' }, { id: '2' }] // id '2' is duplicate + + // Functional update pattern + const updater = (prevItems: { id: string }[]) => { + const existingIds = new Set(prevItems.map(item => item.id)) + const uniqueNewItems = newItems.filter(item => !existingIds.has(item.id)) + return [...uniqueNewItems, ...prevItems] + } + + state = updater(state) + + expect(state).toHaveLength(3) + expect(state.map(i => i.id)).toEqual(['3', '1', '2']) + }) +}) diff --git a/web/app/components/app/log/list.tsx b/web/app/components/app/log/list.tsx index a17177bf7e..410953ccf7 100644 --- a/web/app/components/app/log/list.tsx +++ b/web/app/components/app/log/list.tsx @@ -209,7 +209,6 @@ type IDetailPanel = { function DetailPanel({ detail, onFeedback }: IDetailPanel) { const MIN_ITEMS_FOR_SCROLL_LOADING = 8 - const SCROLL_THRESHOLD_PX = 50 const SCROLL_DEBOUNCE_MS = 200 const { userProfile: { timezone } } = useAppContext() const { formatTime } = useTimestamp() @@ -228,69 +227,103 @@ function DetailPanel({ detail, onFeedback }: IDetailPanel) { const [hasMore, setHasMore] = useState(true) const [varValues, setVarValues] = useState>({}) const isLoadingRef = useRef(false) + const abortControllerRef = useRef(null) + const requestIdRef = useRef(0) + const lastLoadTimeRef = useRef(0) + const retryCountRef = useRef(0) + const oldestAnswerIdRef = useRef(undefined) + const MAX_RETRY_COUNT = 3 const 
[allChatItems, setAllChatItems] = useState([]) const [chatItemTree, setChatItemTree] = useState([]) const [threadChatItems, setThreadChatItems] = useState([]) const fetchData = useCallback(async () => { - if (isLoadingRef.current) + if (isLoadingRef.current || !hasMore) return + // Cancel any in-flight request + if (abortControllerRef.current) { + abortControllerRef.current.abort() + } + + const controller = new AbortController() + abortControllerRef.current = controller + const currentRequestId = ++requestIdRef.current + try { isLoadingRef.current = true - if (!hasMore) - return - const params: ChatMessagesRequest = { conversation_id: detail.id, limit: 10, } - // Use the oldest answer item ID for pagination - const answerItems = allChatItems.filter(item => item.isAnswer) - const oldestAnswerItem = answerItems[answerItems.length - 1] - if (oldestAnswerItem?.id) - params.first_id = oldestAnswerItem.id + // Use ref for pagination anchor to avoid stale closure issues + if (oldestAnswerIdRef.current) + params.first_id = oldestAnswerIdRef.current + const messageRes = await fetchChatMessages({ url: `/apps/${appDetail?.id}/chat-messages`, params, }) + + // Ignore stale responses + if (currentRequestId !== requestIdRef.current || controller.signal.aborted) + return if (messageRes.data.length > 0) { const varValues = messageRes.data.at(-1)!.inputs setVarValues(varValues) } setHasMore(messageRes.has_more) - const newAllChatItems = [ - ...getFormattedChatList(messageRes.data, detail.id, timezone!, t('dateTimeFormat', { ns: 'appLog' }) as string), - ...allChatItems, - ] - setAllChatItems(newAllChatItems) + const newItems = getFormattedChatList(messageRes.data, detail.id, timezone!, t('dateTimeFormat', { ns: 'appLog' }) as string) - let tree = buildChatItemTree(newAllChatItems) - if (messageRes.has_more === false && detail?.model_config?.configs?.introduction) { - tree = [{ - id: 'introduction', - isAnswer: true, - isOpeningStatement: true, - content: 
detail?.model_config?.configs?.introduction ?? 'hello', - feedbackDisabled: true, - children: tree, - }] - } - setChatItemTree(tree) - - const lastMessageId = newAllChatItems.length > 0 ? newAllChatItems[newAllChatItems.length - 1].id : undefined - setThreadChatItems(getThreadMessages(tree, lastMessageId)) + // Use functional update to avoid stale state issues + setAllChatItems((prevItems: IChatItem[]) => { + const existingIds = new Set(prevItems.map(item => item.id)) + const uniqueNewItems = newItems.filter(item => !existingIds.has(item.id)) + return [...uniqueNewItems, ...prevItems] + }) } - catch (err) { + catch (err: unknown) { + if (err instanceof Error && err.name === 'AbortError') + return console.error('fetchData execution failed:', err) } finally { isLoadingRef.current = false + if (abortControllerRef.current === controller) + abortControllerRef.current = null } - }, [allChatItems, detail.id, hasMore, timezone, t, appDetail, detail?.model_config?.configs?.introduction]) + }, [detail.id, hasMore, timezone, t, appDetail, detail?.model_config?.configs?.introduction]) + + // Derive chatItemTree, threadChatItems, and oldestAnswerIdRef from allChatItems + useEffect(() => { + if (allChatItems.length === 0) + return + + let tree = buildChatItemTree(allChatItems) + if (!hasMore && detail?.model_config?.configs?.introduction) { + tree = [{ + id: 'introduction', + isAnswer: true, + isOpeningStatement: true, + content: detail?.model_config?.configs?.introduction ?? 'hello', + feedbackDisabled: true, + children: tree, + }] + } + setChatItemTree(tree) + + const lastMessageId = allChatItems.length > 0 ? 
allChatItems[allChatItems.length - 1].id : undefined + setThreadChatItems(getThreadMessages(tree, lastMessageId)) + + // Update pagination anchor ref with the oldest answer ID + const answerItems = allChatItems.filter(item => item.isAnswer) + const oldestAnswer = answerItems[answerItems.length - 1] + if (oldestAnswer?.id) + oldestAnswerIdRef.current = oldestAnswer.id + }, [allChatItems, hasMore, detail?.model_config?.configs?.introduction]) const switchSibling = useCallback((siblingMessageId: string) => { const newThreadChatItems = getThreadMessages(chatItemTree, siblingMessageId) @@ -397,6 +430,12 @@ function DetailPanel({ detail, onFeedback }: IDetailPanel) { if (isLoading || !hasMore || !appDetail?.id || !detail.id) return + // Throttle using ref to persist across re-renders + const now = Date.now() + if (now - lastLoadTimeRef.current < SCROLL_DEBOUNCE_MS) + return + lastLoadTimeRef.current = now + setIsLoading(true) try { @@ -405,15 +444,9 @@ function DetailPanel({ detail, onFeedback }: IDetailPanel) { limit: 10, } - // Use the earliest response item as the first_id - const answerItems = allChatItems.filter(item => item.isAnswer) - const oldestAnswerItem = answerItems[answerItems.length - 1] - if (oldestAnswerItem?.id) { - params.first_id = oldestAnswerItem.id - } - else if (allChatItems.length > 0 && allChatItems[0]?.id) { - const firstId = allChatItems[0].id.replace('question-', '').replace('answer-', '') - params.first_id = firstId + // Use ref for pagination anchor to avoid stale closure issues + if (oldestAnswerIdRef.current) { + params.first_id = oldestAnswerIdRef.current } const messageRes = await fetchChatMessages({ @@ -423,6 +456,7 @@ function DetailPanel({ detail, onFeedback }: IDetailPanel) { if (!messageRes.data || messageRes.data.length === 0) { setHasMore(false) + retryCountRef.current = 0 return } @@ -440,91 +474,36 @@ function DetailPanel({ detail, onFeedback }: IDetailPanel) { t('dateTimeFormat', { ns: 'appLog' }) as string, ) - // Check for 
duplicate messages - const existingIds = new Set(allChatItems.map(item => item.id)) - const uniqueNewItems = newItems.filter(item => !existingIds.has(item.id)) + // Use functional update to get latest state and avoid stale closures + setAllChatItems((prevItems: IChatItem[]) => { + const existingIds = new Set(prevItems.map(item => item.id)) + const uniqueNewItems = newItems.filter(item => !existingIds.has(item.id)) - if (uniqueNewItems.length === 0) { - if (allChatItems.length > 1) { - const nextId = allChatItems[1].id.replace('question-', '').replace('answer-', '') - - const retryParams = { - ...params, - first_id: nextId, + // If no unique items and we haven't exceeded retry limit, signal retry needed + if (uniqueNewItems.length === 0) { + if (retryCountRef.current < MAX_RETRY_COUNT && prevItems.length > 1) { + retryCountRef.current++ + return prevItems } - - const retryRes = await fetchChatMessages({ - url: `/apps/${appDetail.id}/chat-messages`, - params: retryParams, - }) - - if (retryRes.data && retryRes.data.length > 0) { - const retryItems = getFormattedChatList( - retryRes.data, - detail.id, - timezone!, - t('dateTimeFormat', { ns: 'appLog' }) as string, - ) - - const retryUniqueItems = retryItems.filter(item => !existingIds.has(item.id)) - if (retryUniqueItems.length > 0) { - const newAllChatItems = [ - ...retryUniqueItems, - ...allChatItems, - ] - - setAllChatItems(newAllChatItems) - - let tree = buildChatItemTree(newAllChatItems) - if (retryRes.has_more === false && detail?.model_config?.configs?.introduction) { - tree = [{ - id: 'introduction', - isAnswer: true, - isOpeningStatement: true, - content: detail?.model_config?.configs?.introduction ?? 
'hello', - feedbackDisabled: true, - children: tree, - }] - } - setChatItemTree(tree) - setHasMore(retryRes.has_more) - setThreadChatItems(getThreadMessages(tree, newAllChatItems.at(-1)?.id)) - return - } + else { + retryCountRef.current = 0 + return prevItems } } - } - const newAllChatItems = [ - ...uniqueNewItems, - ...allChatItems, - ] - - setAllChatItems(newAllChatItems) - - let tree = buildChatItemTree(newAllChatItems) - if (messageRes.has_more === false && detail?.model_config?.configs?.introduction) { - tree = [{ - id: 'introduction', - isAnswer: true, - isOpeningStatement: true, - content: detail?.model_config?.configs?.introduction ?? 'hello', - feedbackDisabled: true, - children: tree, - }] - } - setChatItemTree(tree) - - setThreadChatItems(getThreadMessages(tree, newAllChatItems.at(-1)?.id)) + retryCountRef.current = 0 + return [...uniqueNewItems, ...prevItems] + }) } catch (error) { console.error(error) setHasMore(false) + retryCountRef.current = 0 } finally { setIsLoading(false) } - }, [allChatItems, detail.id, hasMore, isLoading, timezone, t, appDetail]) + }, [detail.id, hasMore, isLoading, timezone, t, appDetail, detail?.model_config?.configs?.introduction]) useEffect(() => { const scrollableDiv = document.getElementById('scrollableDiv') @@ -556,24 +535,11 @@ function DetailPanel({ detail, onFeedback }: IDetailPanel) { if (!scrollContainer) return - let lastLoadTime = 0 - const throttleDelay = 200 - const handleScroll = () => { const currentScrollTop = scrollContainer!.scrollTop - const scrollHeight = scrollContainer!.scrollHeight - const clientHeight = scrollContainer!.clientHeight + const isNearTop = currentScrollTop < 30 - const distanceFromTop = currentScrollTop - const distanceFromBottom = scrollHeight - currentScrollTop - clientHeight - - const now = Date.now() - - const isNearTop = distanceFromTop < 30 - // eslint-disable-next-line sonarjs/no-unused-vars - const _distanceFromBottom = distanceFromBottom < 30 - if (isNearTop && hasMore && 
!isLoading && (now - lastLoadTime > throttleDelay)) { - lastLoadTime = now + if (isNearTop && hasMore && !isLoading) { loadMoreMessages() } } @@ -619,36 +585,6 @@ function DetailPanel({ detail, onFeedback }: IDetailPanel) { return () => cancelAnimationFrame(raf) }, []) - // Add scroll listener to ensure loading is triggered - useEffect(() => { - if (threadChatItems.length >= MIN_ITEMS_FOR_SCROLL_LOADING && hasMore) { - const scrollableDiv = document.getElementById('scrollableDiv') - - if (scrollableDiv) { - let loadingTimeout: NodeJS.Timeout | null = null - - const handleScroll = () => { - const { scrollTop } = scrollableDiv - - // Trigger loading when scrolling near the top - if (scrollTop < SCROLL_THRESHOLD_PX && !isLoadingRef.current) { - if (loadingTimeout) - clearTimeout(loadingTimeout) - - loadingTimeout = setTimeout(fetchData, SCROLL_DEBOUNCE_MS) // 200ms debounce - } - } - - scrollableDiv.addEventListener('scroll', handleScroll) - return () => { - scrollableDiv.removeEventListener('scroll', handleScroll) - if (loadingTimeout) - clearTimeout(loadingTimeout) - } - } - } - }, [threadChatItems.length, hasMore, fetchData]) - return (
{/* Panel Header */} diff --git a/web/app/components/base/chat/embedded-chatbot/header/index.tsx b/web/app/components/base/chat/embedded-chatbot/header/index.tsx index 869f88efb6..95ba6d212d 100644 --- a/web/app/components/base/chat/embedded-chatbot/header/index.tsx +++ b/web/app/components/base/chat/embedded-chatbot/header/index.tsx @@ -66,7 +66,9 @@ const Header: FC = ({ const listener = (event: MessageEvent) => handleMessageReceived(event) window.addEventListener('message', listener) - window.parent.postMessage({ type: 'dify-chatbot-iframe-ready' }, '*') + // Security: Use document.referrer to get parent origin + const targetOrigin = document.referrer ? new URL(document.referrer).origin : '*' + window.parent.postMessage({ type: 'dify-chatbot-iframe-ready' }, targetOrigin) return () => window.removeEventListener('message', listener) }, [isIframe, handleMessageReceived]) diff --git a/web/app/components/datasets/create/embedding-process/index.spec.tsx b/web/app/components/datasets/create/embedding-process/index.spec.tsx new file mode 100644 index 0000000000..8d2bae03cd --- /dev/null +++ b/web/app/components/datasets/create/embedding-process/index.spec.tsx @@ -0,0 +1,1562 @@ +import type { FullDocumentDetail, IndexingStatusResponse, ProcessRuleResponse } from '@/models/datasets' +import { act, render, renderHook, screen } from '@testing-library/react' +import { DataSourceType, ProcessMode } from '@/models/datasets' +import { RETRIEVE_METHOD } from '@/types/app' +import IndexingProgressItem from './indexing-progress-item' +import RuleDetail from './rule-detail' +import UpgradeBanner from './upgrade-banner' +import { useIndexingStatusPolling } from './use-indexing-status-polling' +import { + createDocumentLookup, + getFileType, + getSourcePercent, + isLegacyDataSourceInfo, + isSourceEmbedding, +} from './utils' + +// ============================================================================= +// Mock External Dependencies +// 
============================================================================= + +// Mock next/navigation +const mockPush = vi.fn() +const mockRouter = { push: mockPush } +vi.mock('next/navigation', () => ({ + useRouter: () => mockRouter, +})) + +// Mock next/image +vi.mock('next/image', () => ({ + default: ({ src, alt, className }: { src: string, alt: string, className?: string }) => ( + // eslint-disable-next-line next/no-img-element + {alt} + ), +})) + +// Mock API service +const mockFetchIndexingStatusBatch = vi.fn() +vi.mock('@/service/datasets', () => ({ + fetchIndexingStatusBatch: (params: { datasetId: string, batchId: string }) => + mockFetchIndexingStatusBatch(params), +})) + +// Mock service hooks +const mockProcessRuleData: ProcessRuleResponse | undefined = undefined +vi.mock('@/service/knowledge/use-dataset', () => ({ + useProcessRule: vi.fn(() => ({ data: mockProcessRuleData })), +})) + +const mockInvalidDocumentList = vi.fn() +vi.mock('@/service/knowledge/use-document', () => ({ + useInvalidDocumentList: () => mockInvalidDocumentList, +})) + +// Mock useDatasetApiAccessUrl hook +vi.mock('@/hooks/use-api-access-url', () => ({ + useDatasetApiAccessUrl: () => 'https://api.example.com/docs', +})) + +// Mock provider context +let mockEnableBilling = false +let mockPlanType = 'sandbox' +vi.mock('@/context/provider-context', () => ({ + useProviderContext: () => ({ + enableBilling: mockEnableBilling, + plan: { type: mockPlanType }, + }), +})) + +// Mock icons +vi.mock('../icons', () => ({ + indexMethodIcon: { + economical: '/icons/economical.svg', + high_quality: '/icons/high-quality.svg', + }, + retrievalIcon: { + fullText: '/icons/full-text.svg', + hybrid: '/icons/hybrid.svg', + vector: '/icons/vector.svg', + }, +})) + +// Mock IndexingType enum from step-two +vi.mock('../step-two', () => ({ + IndexingType: { + QUALIFIED: 'high_quality', + ECONOMICAL: 'economy', + }, +})) + +// ============================================================================= +// 
Factory Functions for Test Data +// ============================================================================= + +/** + * Create a mock IndexingStatusResponse + */ +const createMockIndexingStatus = ( + overrides: Partial = {}, +): IndexingStatusResponse => ({ + id: 'doc-1', + indexing_status: 'completed', + processing_started_at: Date.now(), + parsing_completed_at: Date.now(), + cleaning_completed_at: Date.now(), + splitting_completed_at: Date.now(), + completed_at: Date.now(), + paused_at: null, + error: null, + stopped_at: null, + completed_segments: 10, + total_segments: 10, + ...overrides, +}) + +/** + * Create a mock FullDocumentDetail + */ +const createMockDocument = ( + overrides: Partial = {}, +): FullDocumentDetail => ({ + id: 'doc-1', + name: 'test-document.txt', + data_source_type: DataSourceType.FILE, + data_source_info: { + upload_file: { + id: 'file-1', + name: 'test-document.txt', + extension: 'txt', + mime_type: 'text/plain', + size: 1024, + created_by: 'user-1', + created_at: Date.now(), + }, + }, + batch: 'batch-1', + created_api_request_id: 'req-1', + processing_started_at: Date.now(), + parsing_completed_at: Date.now(), + cleaning_completed_at: Date.now(), + splitting_completed_at: Date.now(), + tokens: 100, + indexing_latency: 5000, + completed_at: Date.now(), + paused_by: '', + paused_at: 0, + stopped_at: 0, + indexing_status: 'completed', + disabled_at: 0, + ...overrides, +} as FullDocumentDetail) + +/** + * Create a mock ProcessRuleResponse + */ +const createMockProcessRule = ( + overrides: Partial = {}, +): ProcessRuleResponse => ({ + mode: ProcessMode.general, + rules: { + segmentation: { + separator: '\n', + max_tokens: 500, + chunk_overlap: 50, + }, + pre_processing_rules: [ + { id: 'remove_extra_spaces', enabled: true }, + { id: 'remove_urls_emails', enabled: false }, + ], + }, + ...overrides, +} as ProcessRuleResponse) + +// ============================================================================= +// Utils Tests +// 
============================================================================= + +describe('utils', () => { + // Test utility functions for document handling + + describe('isLegacyDataSourceInfo', () => { + it('should return true for legacy data source with upload_file object', () => { + // Arrange + const info = { + upload_file: { id: 'file-1', name: 'test.txt' }, + } + + // Act & Assert + expect(isLegacyDataSourceInfo(info as Parameters[0])).toBe(true) + }) + + it('should return false for null', () => { + expect(isLegacyDataSourceInfo(null as unknown as Parameters[0])).toBe(false) + }) + + it('should return false for undefined', () => { + expect(isLegacyDataSourceInfo(undefined as unknown as Parameters[0])).toBe(false) + }) + + it('should return false when upload_file is not an object', () => { + // Arrange + const info = { upload_file: 'string-value' } + + // Act & Assert + expect(isLegacyDataSourceInfo(info as unknown as Parameters[0])).toBe(false) + }) + }) + + describe('isSourceEmbedding', () => { + it.each([ + ['indexing', true], + ['splitting', true], + ['parsing', true], + ['cleaning', true], + ['waiting', true], + ['completed', false], + ['error', false], + ['paused', false], + ])('should return %s for status "%s"', (status, expected) => { + // Arrange + const detail = createMockIndexingStatus({ indexing_status: status as IndexingStatusResponse['indexing_status'] }) + + // Act & Assert + expect(isSourceEmbedding(detail)).toBe(expected) + }) + }) + + describe('getSourcePercent', () => { + it('should return 0 when total_segments is 0', () => { + // Arrange + const detail = createMockIndexingStatus({ + completed_segments: 0, + total_segments: 0, + }) + + // Act & Assert + expect(getSourcePercent(detail)).toBe(0) + }) + + it('should calculate correct percentage', () => { + // Arrange + const detail = createMockIndexingStatus({ + completed_segments: 5, + total_segments: 10, + }) + + // Act & Assert + expect(getSourcePercent(detail)).toBe(50) + }) + + it('should 
cap percentage at 100', () => { + // Arrange + const detail = createMockIndexingStatus({ + completed_segments: 15, + total_segments: 10, + }) + + // Act & Assert + expect(getSourcePercent(detail)).toBe(100) + }) + + it('should handle undefined values', () => { + // Arrange + const detail = { indexing_status: 'indexing' } as IndexingStatusResponse + + // Act & Assert + expect(getSourcePercent(detail)).toBe(0) + }) + + it('should round to nearest integer', () => { + // Arrange + const detail = createMockIndexingStatus({ + completed_segments: 1, + total_segments: 3, + }) + + // Act & Assert + expect(getSourcePercent(detail)).toBe(33) + }) + }) + + describe('getFileType', () => { + it('should extract extension from filename', () => { + expect(getFileType('document.pdf')).toBe('pdf') + expect(getFileType('file.name.txt')).toBe('txt') + expect(getFileType('archive.tar.gz')).toBe('gz') + }) + + it('should return "txt" for undefined', () => { + expect(getFileType(undefined)).toBe('txt') + }) + + it('should return filename without extension', () => { + expect(getFileType('filename')).toBe('filename') + }) + }) + + describe('createDocumentLookup', () => { + it('should create lookup functions for documents', () => { + // Arrange + const documents = [ + createMockDocument({ id: 'doc-1', name: 'file1.txt' }), + createMockDocument({ id: 'doc-2', name: 'file2.pdf', data_source_type: DataSourceType.NOTION }), + ] + + // Act + const lookup = createDocumentLookup(documents) + + // Assert + expect(lookup.getName('doc-1')).toBe('file1.txt') + expect(lookup.getName('doc-2')).toBe('file2.pdf') + expect(lookup.getName('non-existent')).toBeUndefined() + }) + + it('should return source type correctly', () => { + // Arrange + const documents = [ + createMockDocument({ id: 'doc-1', data_source_type: DataSourceType.FILE }), + createMockDocument({ id: 'doc-2', data_source_type: DataSourceType.NOTION }), + ] + const lookup = createDocumentLookup(documents) + + // Assert + 
expect(lookup.getSourceType('doc-1')).toBe(DataSourceType.FILE) + expect(lookup.getSourceType('doc-2')).toBe(DataSourceType.NOTION) + }) + + it('should return notion icon for legacy data source', () => { + // Arrange + const documents = [ + createMockDocument({ + id: 'doc-1', + data_source_info: { + upload_file: { id: 'f1' }, + notion_page_icon: '📄', + } as FullDocumentDetail['data_source_info'], + }), + ] + const lookup = createDocumentLookup(documents) + + // Assert + expect(lookup.getNotionIcon('doc-1')).toBe('📄') + }) + + it('should return undefined for non-legacy notion icon', () => { + // Arrange + const documents = [ + createMockDocument({ + id: 'doc-1', + data_source_info: { some_other_field: 'value' } as unknown as FullDocumentDetail['data_source_info'], + }), + ] + const lookup = createDocumentLookup(documents) + + // Assert + expect(lookup.getNotionIcon('doc-1')).toBeUndefined() + }) + + it('should memoize lookups with Map for performance', () => { + // Arrange + const documents = Array.from({ length: 1000 }, (_, i) => + createMockDocument({ id: `doc-${i}`, name: `file${i}.txt` })) + + // Act + const lookup = createDocumentLookup(documents) + const startTime = performance.now() + for (let i = 0; i < 1000; i++) + lookup.getName(`doc-${i}`) + + const duration = performance.now() - startTime + + // Assert - should be very fast due to Map lookup + expect(duration).toBeLessThan(50) + }) + }) +}) + +// ============================================================================= +// useIndexingStatusPolling Hook Tests +// ============================================================================= + +describe('useIndexingStatusPolling', () => { + // Test the polling hook for indexing status + + beforeEach(() => { + vi.clearAllMocks() + vi.useFakeTimers() + }) + + afterEach(() => { + vi.useRealTimers() + }) + + it('should fetch status on mount', async () => { + // Arrange + const mockStatus = [createMockIndexingStatus({ indexing_status: 'completed' })] + 
mockFetchIndexingStatusBatch.mockResolvedValue({ data: mockStatus }) + + // Act + const { result } = renderHook(() => + useIndexingStatusPolling({ datasetId: 'ds-1', batchId: 'batch-1' }), + ) + + await act(async () => { + await vi.runOnlyPendingTimersAsync() + }) + + // Assert + expect(mockFetchIndexingStatusBatch).toHaveBeenCalledWith({ + datasetId: 'ds-1', + batchId: 'batch-1', + }) + expect(result.current.statusList).toEqual(mockStatus) + }) + + it('should stop polling when all statuses are completed', async () => { + // Arrange + const mockStatus = [createMockIndexingStatus({ indexing_status: 'completed' })] + mockFetchIndexingStatusBatch.mockResolvedValue({ data: mockStatus }) + + // Act + renderHook(() => + useIndexingStatusPolling({ datasetId: 'ds-1', batchId: 'batch-1' }), + ) + + await act(async () => { + await vi.runOnlyPendingTimersAsync() + }) + + // Assert - should only be called once since status is completed + expect(mockFetchIndexingStatusBatch).toHaveBeenCalledTimes(1) + }) + + it('should continue polling when status is indexing', async () => { + // Arrange + const indexingStatus = [createMockIndexingStatus({ indexing_status: 'indexing' })] + const completedStatus = [createMockIndexingStatus({ indexing_status: 'completed' })] + + mockFetchIndexingStatusBatch + .mockResolvedValueOnce({ data: indexingStatus }) + .mockResolvedValueOnce({ data: completedStatus }) + + // Act + renderHook(() => + useIndexingStatusPolling({ datasetId: 'ds-1', batchId: 'batch-1' }), + ) + + // First poll + await act(async () => { + await vi.runOnlyPendingTimersAsync() + }) + + // Advance timer for next poll (2500ms) + await act(async () => { + await vi.advanceTimersByTimeAsync(2500) + }) + + // Assert + expect(mockFetchIndexingStatusBatch).toHaveBeenCalledTimes(2) + }) + + it('should stop polling when status is error', async () => { + // Arrange + const mockStatus = [createMockIndexingStatus({ indexing_status: 'error', error: 'Some error' })] + 
mockFetchIndexingStatusBatch.mockResolvedValue({ data: mockStatus }) + + // Act + const { result } = renderHook(() => + useIndexingStatusPolling({ datasetId: 'ds-1', batchId: 'batch-1' }), + ) + + await act(async () => { + await vi.runOnlyPendingTimersAsync() + }) + + // Assert + expect(result.current.isEmbeddingCompleted).toBe(true) + expect(mockFetchIndexingStatusBatch).toHaveBeenCalledTimes(1) + }) + + it('should stop polling when status is paused', async () => { + // Arrange + const mockStatus = [createMockIndexingStatus({ indexing_status: 'paused' })] + mockFetchIndexingStatusBatch.mockResolvedValue({ data: mockStatus }) + + // Act + const { result } = renderHook(() => + useIndexingStatusPolling({ datasetId: 'ds-1', batchId: 'batch-1' }), + ) + + await act(async () => { + await vi.runOnlyPendingTimersAsync() + }) + + // Assert + expect(result.current.isEmbeddingCompleted).toBe(true) + }) + + it('should continue polling on API error', async () => { + // Arrange + mockFetchIndexingStatusBatch + .mockRejectedValueOnce(new Error('Network error')) + .mockResolvedValueOnce({ data: [createMockIndexingStatus({ indexing_status: 'completed' })] }) + + // Act + renderHook(() => + useIndexingStatusPolling({ datasetId: 'ds-1', batchId: 'batch-1' }), + ) + + await act(async () => { + await vi.runOnlyPendingTimersAsync() + }) + + await act(async () => { + await vi.advanceTimersByTimeAsync(2500) + }) + + // Assert - should retry after error + expect(mockFetchIndexingStatusBatch).toHaveBeenCalledTimes(2) + }) + + it('should return correct isEmbedding state', async () => { + // Arrange + const mockStatus = [createMockIndexingStatus({ indexing_status: 'indexing' })] + mockFetchIndexingStatusBatch.mockResolvedValue({ data: mockStatus }) + + // Act + const { result } = renderHook(() => + useIndexingStatusPolling({ datasetId: 'ds-1', batchId: 'batch-1' }), + ) + + await act(async () => { + await vi.runOnlyPendingTimersAsync() + }) + + // Assert + 
expect(result.current.isEmbedding).toBe(true) + expect(result.current.isEmbeddingCompleted).toBe(false) + }) + + it('should cleanup timeout on unmount', async () => { + // Arrange + const mockStatus = [createMockIndexingStatus({ indexing_status: 'indexing' })] + mockFetchIndexingStatusBatch.mockResolvedValue({ data: mockStatus }) + + // Act + const { unmount } = renderHook(() => + useIndexingStatusPolling({ datasetId: 'ds-1', batchId: 'batch-1' }), + ) + + await act(async () => { + await vi.runOnlyPendingTimersAsync() + }) + + const callCountBeforeUnmount = mockFetchIndexingStatusBatch.mock.calls.length + + unmount() + + // Advance timers - should not trigger more calls after unmount + await act(async () => { + await vi.advanceTimersByTimeAsync(5000) + }) + + // Assert - no additional calls after unmount + expect(mockFetchIndexingStatusBatch).toHaveBeenCalledTimes(callCountBeforeUnmount) + }) + + it('should handle multiple documents with mixed statuses', async () => { + // Arrange + const mockStatus = [ + createMockIndexingStatus({ id: 'doc-1', indexing_status: 'completed' }), + createMockIndexingStatus({ id: 'doc-2', indexing_status: 'indexing' }), + ] + mockFetchIndexingStatusBatch.mockResolvedValue({ data: mockStatus }) + + // Act + const { result } = renderHook(() => + useIndexingStatusPolling({ datasetId: 'ds-1', batchId: 'batch-1' }), + ) + + await act(async () => { + await vi.runOnlyPendingTimersAsync() + }) + + // Assert + expect(result.current.isEmbedding).toBe(true) + expect(result.current.isEmbeddingCompleted).toBe(false) + expect(result.current.statusList).toHaveLength(2) + }) + + it('should return empty statusList initially', () => { + // Arrange & Act + const { result } = renderHook(() => + useIndexingStatusPolling({ datasetId: 'ds-1', batchId: 'batch-1' }), + ) + + // Assert + expect(result.current.statusList).toEqual([]) + expect(result.current.isEmbedding).toBe(false) + expect(result.current.isEmbeddingCompleted).toBe(false) + }) +}) + +// 
============================================================================= +// UpgradeBanner Component Tests +// ============================================================================= + +describe('UpgradeBanner', () => { + // Test the upgrade banner component + + beforeEach(() => { + vi.clearAllMocks() + }) + + it('should render upgrade message', () => { + // Arrange & Act + render() + + // Assert + expect(screen.getByText(/billing\.plansCommon\.documentProcessingPriorityUpgrade/i)).toBeInTheDocument() + }) + + it('should render ZapFast icon', () => { + // Arrange & Act + const { container } = render() + + // Assert + expect(container.querySelector('svg')).toBeInTheDocument() + }) + + it('should render UpgradeBtn component', () => { + // Arrange & Act + render() + + // Assert - UpgradeBtn should be rendered + const upgradeContainer = screen.getByText(/billing\.plansCommon\.documentProcessingPriorityUpgrade/i).parentElement + expect(upgradeContainer).toBeInTheDocument() + }) +}) + +// ============================================================================= +// IndexingProgressItem Component Tests +// ============================================================================= + +describe('IndexingProgressItem', () => { + // Test the progress item component for individual documents + + beforeEach(() => { + vi.clearAllMocks() + }) + + describe('Rendering', () => { + it('should render document name', () => { + // Arrange + const detail = createMockIndexingStatus() + + // Act + render() + + // Assert + expect(screen.getByText('test-document.txt')).toBeInTheDocument() + }) + + it('should render progress percentage when embedding', () => { + // Arrange + const detail = createMockIndexingStatus({ + indexing_status: 'indexing', + completed_segments: 5, + total_segments: 10, + }) + + // Act + render() + + // Assert + expect(screen.getByText('50%')).toBeInTheDocument() + }) + + it('should not render progress percentage when completed', () => { + // Arrange + 
const detail = createMockIndexingStatus({ indexing_status: 'completed' }) + + // Act + render() + + // Assert + expect(screen.queryByText('%')).not.toBeInTheDocument() + }) + }) + + describe('Status Icons', () => { + it('should render success icon for completed status', () => { + // Arrange + const detail = createMockIndexingStatus({ indexing_status: 'completed' }) + + // Act + const { container } = render() + + // Assert + expect(container.querySelector('.text-text-success')).toBeInTheDocument() + }) + + it('should render error icon for error status', () => { + // Arrange + const detail = createMockIndexingStatus({ + indexing_status: 'error', + error: 'Processing failed', + }) + + // Act + const { container } = render() + + // Assert + expect(container.querySelector('.text-text-destructive')).toBeInTheDocument() + }) + + it('should not render status icon for indexing status', () => { + // Arrange + const detail = createMockIndexingStatus({ indexing_status: 'indexing' }) + + // Act + const { container } = render() + + // Assert + expect(container.querySelector('.text-text-success')).not.toBeInTheDocument() + expect(container.querySelector('.text-text-destructive')).not.toBeInTheDocument() + }) + }) + + describe('Source Type Icons', () => { + it('should render file icon for FILE source type', () => { + // Arrange + const detail = createMockIndexingStatus() + + // Act + render( + , + ) + + // Assert - DocumentFileIcon should be rendered + expect(screen.getByText('document.pdf')).toBeInTheDocument() + }) + + // DocumentFileIcon branch coverage: different file extensions + describe('DocumentFileIcon file extensions', () => { + it.each([ + ['document.pdf', 'pdf'], + ['data.json', 'json'], + ['page.html', 'html'], + ['readme.txt', 'txt'], + ['notes.markdown', 'markdown'], + ['readme.md', 'md'], + ['spreadsheet.xlsx', 'xlsx'], + ['legacy.xls', 'xls'], + ['data.csv', 'csv'], + ['letter.doc', 'doc'], + ['report.docx', 'docx'], + ])('should render file icon for %s (%s 
extension)', (filename) => { + // Arrange + const detail = createMockIndexingStatus() + + // Act + render( + , + ) + + // Assert + expect(screen.getByText(filename)).toBeInTheDocument() + }) + + it('should handle unknown file extension with default icon', () => { + // Arrange + const detail = createMockIndexingStatus() + + // Act + render( + , + ) + + // Assert - should still render with default document icon + expect(screen.getByText('archive.zip')).toBeInTheDocument() + }) + + it('should handle uppercase extension', () => { + // Arrange + const detail = createMockIndexingStatus() + + // Act + render( + , + ) + + // Assert + expect(screen.getByText('REPORT.PDF')).toBeInTheDocument() + }) + + it('should handle mixed case extension', () => { + // Arrange + const detail = createMockIndexingStatus() + + // Act + render( + , + ) + + // Assert + expect(screen.getByText('Document.Docx')).toBeInTheDocument() + }) + + it('should handle filename with multiple dots', () => { + // Arrange + const detail = createMockIndexingStatus() + + // Act + render( + , + ) + + // Assert - should extract "pdf" as extension + expect(screen.getByText('my.file.name.pdf')).toBeInTheDocument() + }) + + it('should handle filename without extension', () => { + // Arrange + const detail = createMockIndexingStatus() + + // Act + render( + , + ) + + // Assert - should use filename itself as fallback + expect(screen.getByText('noextension')).toBeInTheDocument() + }) + }) + + it('should render notion icon for NOTION source type', () => { + // Arrange + const detail = createMockIndexingStatus() + + // Act + render( + , + ) + + // Assert + expect(screen.getByText('Notion Page')).toBeInTheDocument() + }) + }) + + describe('Progress Bar', () => { + it('should render progress bar when embedding', () => { + // Arrange + const detail = createMockIndexingStatus({ + indexing_status: 'indexing', + completed_segments: 30, + total_segments: 100, + }) + + // Act + const { container } = render() + + // Assert + 
const progressBar = container.querySelector('[style*="width: 30%"]') + expect(progressBar).toBeInTheDocument() + }) + + it('should not render progress bar when completed', () => { + // Arrange + const detail = createMockIndexingStatus({ indexing_status: 'completed' }) + + // Act + const { container } = render() + + // Assert + const progressBar = container.querySelector('.bg-components-progress-bar-progress') + expect(progressBar).not.toBeInTheDocument() + }) + + it('should apply error styling for error status', () => { + // Arrange + const detail = createMockIndexingStatus({ indexing_status: 'error' }) + + // Act + const { container } = render() + + // Assert + expect(container.querySelector('.bg-state-destructive-hover-alt')).toBeInTheDocument() + }) + }) + + describe('Billing', () => { + it('should render PriorityLabel when enableBilling is true', () => { + // Arrange + const detail = createMockIndexingStatus() + + // Act + render() + + // Assert - PriorityLabel component should be in the DOM + const container = screen.getByText('test.txt').parentElement + expect(container).toBeInTheDocument() + }) + + it('should not render PriorityLabel when enableBilling is false', () => { + // Arrange + const detail = createMockIndexingStatus() + + // Act + render() + + // Assert + expect(screen.getByText('test.txt')).toBeInTheDocument() + }) + }) + + describe('Edge Cases', () => { + it('should handle undefined name', () => { + // Arrange + const detail = createMockIndexingStatus() + + // Act + render() + + // Assert - should not crash + expect(document.body).toBeInTheDocument() + }) + + it('should handle undefined sourceType', () => { + // Arrange + const detail = createMockIndexingStatus() + + // Act + render() + + // Assert - should render without source icon + expect(screen.getByText('test.txt')).toBeInTheDocument() + }) + }) +}) + +// ============================================================================= +// RuleDetail Component Tests +// 
============================================================================= + +describe('RuleDetail', () => { + // Test the rule detail component for process configuration display + + beforeEach(() => { + vi.clearAllMocks() + }) + + describe('Rendering', () => { + it('should render without crashing', () => { + // Arrange & Act + render() + + // Assert + expect(screen.getByText(/datasetDocuments\.embedding\.mode/i)).toBeInTheDocument() + }) + + it('should render all field labels', () => { + // Arrange & Act + render() + + // Assert + expect(screen.getByText(/datasetDocuments\.embedding\.mode/i)).toBeInTheDocument() + expect(screen.getByText(/datasetDocuments\.embedding\.segmentLength/i)).toBeInTheDocument() + expect(screen.getByText(/datasetDocuments\.embedding\.textCleaning/i)).toBeInTheDocument() + expect(screen.getByText(/datasetCreation\.stepTwo\.indexMode/i)).toBeInTheDocument() + expect(screen.getByText(/datasetSettings\.form\.retrievalSetting\.title/i)).toBeInTheDocument() + }) + }) + + describe('Mode Display', () => { + it('should show "-" when sourceData is undefined', () => { + // Arrange & Act + render() + + // Assert + expect(screen.getAllByText('-')).toHaveLength(3) // mode, segmentLength, textCleaning + }) + + it('should show "custom" for general process mode', () => { + // Arrange + const sourceData = createMockProcessRule({ mode: ProcessMode.general }) + + // Act + render() + + // Assert + expect(screen.getByText(/datasetDocuments\.embedding\.custom/i)).toBeInTheDocument() + }) + + it('should show hierarchical mode with paragraph parent', () => { + // Arrange + const sourceData = createMockProcessRule({ + mode: ProcessMode.parentChild, + rules: { + parent_mode: 'paragraph', + segmentation: { max_tokens: 500 }, + }, + } as Partial) + + // Act + render() + + // Assert + expect(screen.getByText(/datasetDocuments\.embedding\.hierarchical/i)).toBeInTheDocument() + }) + }) + + describe('Segment Length Display', () => { + it('should show max_tokens for 
general mode', () => { + // Arrange + const sourceData = createMockProcessRule({ + mode: ProcessMode.general, + rules: { + segmentation: { max_tokens: 500 }, + }, + } as Partial) + + // Act + render() + + // Assert + expect(screen.getByText('500')).toBeInTheDocument() + }) + + it('should show parent and child tokens for hierarchical mode', () => { + // Arrange + const sourceData = createMockProcessRule({ + mode: ProcessMode.parentChild, + rules: { + segmentation: { max_tokens: 1000 }, + subchunk_segmentation: { max_tokens: 200 }, + }, + } as Partial) + + // Act + render() + + // Assert + expect(screen.getByText(/1000/)).toBeInTheDocument() + expect(screen.getByText(/200/)).toBeInTheDocument() + }) + }) + + describe('Text Cleaning Rules', () => { + it('should show enabled rule names', () => { + // Arrange + const sourceData = createMockProcessRule({ + mode: ProcessMode.general, + rules: { + pre_processing_rules: [ + { id: 'remove_extra_spaces', enabled: true }, + { id: 'remove_urls_emails', enabled: true }, + { id: 'remove_stopwords', enabled: false }, + ], + }, + } as Partial) + + // Act + render() + + // Assert + expect(screen.getByText(/removeExtraSpaces/i)).toBeInTheDocument() + expect(screen.getByText(/removeUrlEmails/i)).toBeInTheDocument() + }) + + it('should show "-" when no rules are enabled', () => { + // Arrange + const sourceData = createMockProcessRule({ + mode: ProcessMode.general, + rules: { + pre_processing_rules: [ + { id: 'remove_extra_spaces', enabled: false }, + ], + }, + } as Partial) + + // Act + render() + + // Assert - textCleaning should show "-" + const dashElements = screen.getAllByText('-') + expect(dashElements.length).toBeGreaterThan(0) + }) + }) + + describe('Indexing Type', () => { + it('should show qualified for high_quality indexing', () => { + // Arrange & Act + render() + + // Assert + expect(screen.getByText(/datasetCreation\.stepTwo\.qualified/i)).toBeInTheDocument() + }) + + it('should show economical for economy indexing', () 
=> { + // Arrange & Act + render() + + // Assert + expect(screen.getByText(/datasetCreation\.stepTwo\.economical/i)).toBeInTheDocument() + }) + + it('should render correct icon for indexing type', () => { + // Arrange & Act + render() + + // Assert + const images = screen.getAllByTestId('next-image') + expect(images.length).toBeGreaterThan(0) + }) + }) + + describe('Retrieval Method', () => { + it('should show semantic search by default', () => { + // Arrange & Act + render() + + // Assert + expect(screen.getByText(/dataset\.retrieval\.semantic_search\.title/i)).toBeInTheDocument() + }) + + it('should show keyword search for economical indexing', () => { + // Arrange & Act + render() + + // Assert + expect(screen.getByText(/dataset\.retrieval\.keyword_search\.title/i)).toBeInTheDocument() + }) + + it.each([ + [RETRIEVE_METHOD.fullText, 'full_text_search'], + [RETRIEVE_METHOD.hybrid, 'hybrid_search'], + [RETRIEVE_METHOD.semantic, 'semantic_search'], + ])('should show correct label for %s retrieval method', (method, expectedKey) => { + // Arrange & Act + render() + + // Assert + expect(screen.getByText(new RegExp(`dataset\\.retrieval\\.${expectedKey}\\.title`, 'i'))).toBeInTheDocument() + }) + }) +}) + +// ============================================================================= +// EmbeddingProcess Integration Tests +// ============================================================================= + +describe('EmbeddingProcess', () => { + // Integration tests for the main EmbeddingProcess component + + // Import the main component after mocks are set up + let EmbeddingProcess: typeof import('./index').default + + beforeEach(async () => { + vi.clearAllMocks() + vi.useFakeTimers() + mockEnableBilling = false + mockPlanType = 'sandbox' + + // Dynamically import to get fresh component with mocks + const embeddingModule = await import('./index') + EmbeddingProcess = embeddingModule.default + }) + + afterEach(() => { + vi.useRealTimers() + }) + + describe('Rendering', 
() => { + it('should render without crashing', async () => { + // Arrange + mockFetchIndexingStatusBatch.mockResolvedValue({ data: [] }) + + // Act + render() + + await act(async () => { + await vi.runOnlyPendingTimersAsync() + }) + + // Assert + expect(document.body).toBeInTheDocument() + }) + + it('should render status header', async () => { + // Arrange + const mockStatus = [createMockIndexingStatus({ indexing_status: 'indexing' })] + mockFetchIndexingStatusBatch.mockResolvedValue({ data: mockStatus }) + + // Act + render() + + await act(async () => { + await vi.runOnlyPendingTimersAsync() + }) + + // Assert + expect(screen.getByText(/datasetDocuments\.embedding\.processing/i)).toBeInTheDocument() + }) + + it('should show completed status when all documents are done', async () => { + // Arrange + const mockStatus = [createMockIndexingStatus({ indexing_status: 'completed' })] + mockFetchIndexingStatusBatch.mockResolvedValue({ data: mockStatus }) + + // Act + render() + + await act(async () => { + await vi.runOnlyPendingTimersAsync() + }) + + // Assert + expect(screen.getByText(/datasetDocuments\.embedding\.completed/i)).toBeInTheDocument() + }) + }) + + describe('Progress Items', () => { + it('should render progress items for each document', async () => { + // Arrange + const documents = [ + createMockDocument({ id: 'doc-1', name: 'file1.txt' }), + createMockDocument({ id: 'doc-2', name: 'file2.pdf' }), + ] + const mockStatus = [ + createMockIndexingStatus({ id: 'doc-1' }), + createMockIndexingStatus({ id: 'doc-2' }), + ] + mockFetchIndexingStatusBatch.mockResolvedValue({ data: mockStatus }) + + // Act + render( + , + ) + + await act(async () => { + await vi.runOnlyPendingTimersAsync() + }) + + // Assert + expect(screen.getByText('file1.txt')).toBeInTheDocument() + expect(screen.getByText('file2.pdf')).toBeInTheDocument() + }) + }) + + describe('Upgrade Banner', () => { + it('should show upgrade banner when billing is enabled and not team plan', async () => { + 
// Arrange + mockEnableBilling = true + mockPlanType = 'sandbox' + mockFetchIndexingStatusBatch.mockResolvedValue({ data: [] }) + + // Re-import to get updated mock values + const embeddingModule = await import('./index') + EmbeddingProcess = embeddingModule.default + + // Act + render() + + await act(async () => { + await vi.runOnlyPendingTimersAsync() + }) + + // Assert + expect(screen.getByText(/billing\.plansCommon\.documentProcessingPriorityUpgrade/i)).toBeInTheDocument() + }) + + it('should not show upgrade banner when billing is disabled', async () => { + // Arrange + mockEnableBilling = false + mockFetchIndexingStatusBatch.mockResolvedValue({ data: [] }) + + // Act + render() + + await act(async () => { + await vi.runOnlyPendingTimersAsync() + }) + + // Assert + expect(screen.queryByText(/billing\.plansCommon\.documentProcessingPriorityUpgrade/i)).not.toBeInTheDocument() + }) + + it('should not show upgrade banner for team plan', async () => { + // Arrange + mockEnableBilling = true + mockPlanType = 'team' + mockFetchIndexingStatusBatch.mockResolvedValue({ data: [] }) + + // Re-import to get updated mock values + const embeddingModule = await import('./index') + EmbeddingProcess = embeddingModule.default + + // Act + render() + + await act(async () => { + await vi.runOnlyPendingTimersAsync() + }) + + // Assert + expect(screen.queryByText(/billing\.plansCommon\.documentProcessingPriorityUpgrade/i)).not.toBeInTheDocument() + }) + }) + + describe('Action Buttons', () => { + it('should render API access button with correct link', async () => { + // Arrange + mockFetchIndexingStatusBatch.mockResolvedValue({ data: [] }) + + // Act + render() + + await act(async () => { + await vi.runOnlyPendingTimersAsync() + }) + + // Assert + const apiButton = screen.getByText('Access the API') + expect(apiButton).toBeInTheDocument() + expect(apiButton.closest('a')).toHaveAttribute('href', 'https://api.example.com/docs') + }) + + it('should render navigation button', async () 
=> { + // Arrange + mockFetchIndexingStatusBatch.mockResolvedValue({ data: [] }) + + // Act + render() + + await act(async () => { + await vi.runOnlyPendingTimersAsync() + }) + + // Assert + expect(screen.getByText(/datasetCreation\.stepThree\.navTo/i)).toBeInTheDocument() + }) + + it('should navigate to documents list when nav button clicked', async () => { + // Arrange + mockFetchIndexingStatusBatch.mockResolvedValue({ data: [] }) + + // Act + render() + + await act(async () => { + await vi.runOnlyPendingTimersAsync() + }) + + const navButton = screen.getByText(/datasetCreation\.stepThree\.navTo/i) + + await act(async () => { + navButton.click() + }) + + // Assert + expect(mockInvalidDocumentList).toHaveBeenCalled() + expect(mockPush).toHaveBeenCalledWith('/datasets/ds-1/documents') + }) + }) + + describe('Rule Detail', () => { + it('should render RuleDetail component', async () => { + // Arrange + mockFetchIndexingStatusBatch.mockResolvedValue({ data: [] }) + + // Act + render( + , + ) + + await act(async () => { + await vi.runOnlyPendingTimersAsync() + }) + + // Assert + expect(screen.getByText(/datasetDocuments\.embedding\.mode/i)).toBeInTheDocument() + }) + + it('should pass indexingType to RuleDetail', async () => { + // Arrange + mockFetchIndexingStatusBatch.mockResolvedValue({ data: [] }) + + // Act + render( + , + ) + + await act(async () => { + await vi.runOnlyPendingTimersAsync() + }) + + // Assert + expect(screen.getByText(/datasetCreation\.stepTwo\.economical/i)).toBeInTheDocument() + }) + }) + + describe('Document Lookup Memoization', () => { + it('should memoize document lookup based on documents array', async () => { + // Arrange + const documents = [createMockDocument({ id: 'doc-1', name: 'test.txt' })] + mockFetchIndexingStatusBatch.mockResolvedValue({ + data: [createMockIndexingStatus({ id: 'doc-1' })], + }) + + // Act + const { rerender } = render( + , + ) + + await act(async () => { + await vi.runOnlyPendingTimersAsync() + }) + + // Rerender 
with same documents reference + rerender( + , + ) + + // Assert - component should render without issues + expect(screen.getByText('test.txt')).toBeInTheDocument() + }) + }) + + describe('Edge Cases', () => { + it('should handle empty documents array', async () => { + // Arrange + mockFetchIndexingStatusBatch.mockResolvedValue({ data: [] }) + + // Act + render() + + await act(async () => { + await vi.runOnlyPendingTimersAsync() + }) + + // Assert - should render without crashing + expect(document.body).toBeInTheDocument() + }) + + it('should handle undefined documents', async () => { + // Arrange + mockFetchIndexingStatusBatch.mockResolvedValue({ data: [] }) + + // Act + render() + + await act(async () => { + await vi.runOnlyPendingTimersAsync() + }) + + // Assert - should render without crashing + expect(document.body).toBeInTheDocument() + }) + + it('should handle status with missing document', async () => { + // Arrange + const documents = [createMockDocument({ id: 'doc-1', name: 'test.txt' })] + mockFetchIndexingStatusBatch.mockResolvedValue({ + data: [ + createMockIndexingStatus({ id: 'doc-1' }), + createMockIndexingStatus({ id: 'doc-unknown' }), // No matching document + ], + }) + + // Act + render( + , + ) + + await act(async () => { + await vi.runOnlyPendingTimersAsync() + }) + + // Assert - should render known document and handle unknown gracefully + expect(screen.getByText('test.txt')).toBeInTheDocument() + }) + + it('should handle undefined retrievalMethod', async () => { + // Arrange + mockFetchIndexingStatusBatch.mockResolvedValue({ data: [] }) + + // Act + render( + , + ) + + await act(async () => { + await vi.runOnlyPendingTimersAsync() + }) + + // Assert - should use default semantic search + expect(screen.getByText(/dataset\.retrieval\.semantic_search\.title/i)).toBeInTheDocument() + }) + }) +}) diff --git a/web/app/components/datasets/create/embedding-process/index.tsx b/web/app/components/datasets/create/embedding-process/index.tsx index 
aa1f6cee50..e9cea84f00 100644 --- a/web/app/components/datasets/create/embedding-process/index.tsx +++ b/web/app/components/datasets/create/embedding-process/index.tsx @@ -1,47 +1,29 @@ import type { FC } from 'react' -import type { - DataSourceInfo, - FullDocumentDetail, - IndexingStatusResponse, - LegacyDataSourceInfo, - ProcessRuleResponse, -} from '@/models/datasets' +import type { FullDocumentDetail } from '@/models/datasets' +import type { RETRIEVE_METHOD } from '@/types/app' import { RiArrowRightLine, - RiCheckboxCircleFill, - RiErrorWarningFill, RiLoader2Fill, RiTerminalBoxLine, } from '@remixicon/react' -import Image from 'next/image' import Link from 'next/link' import { useRouter } from 'next/navigation' -import * as React from 'react' -import { useCallback, useEffect, useMemo, useRef, useState } from 'react' +import { useMemo } from 'react' import { useTranslation } from 'react-i18next' import Button from '@/app/components/base/button' import Divider from '@/app/components/base/divider' -import { ZapFast } from '@/app/components/base/icons/src/vender/solid/general' -import NotionIcon from '@/app/components/base/notion-icon' -import Tooltip from '@/app/components/base/tooltip' -import PriorityLabel from '@/app/components/billing/priority-label' import { Plan } from '@/app/components/billing/type' -import UpgradeBtn from '@/app/components/billing/upgrade-btn' -import { FieldInfo } from '@/app/components/datasets/documents/detail/metadata' import { useProviderContext } from '@/context/provider-context' import { useDatasetApiAccessUrl } from '@/hooks/use-api-access-url' -import { DataSourceType, ProcessMode } from '@/models/datasets' -import { fetchIndexingStatusBatch as doFetchIndexingStatus } from '@/service/datasets' import { useProcessRule } from '@/service/knowledge/use-dataset' import { useInvalidDocumentList } from '@/service/knowledge/use-document' -import { RETRIEVE_METHOD } from '@/types/app' -import { sleep } from '@/utils' -import { cn } from 
'@/utils/classnames' -import DocumentFileIcon from '../../common/document-file-icon' -import { indexMethodIcon, retrievalIcon } from '../icons' -import { IndexingType } from '../step-two' +import IndexingProgressItem from './indexing-progress-item' +import RuleDetail from './rule-detail' +import UpgradeBanner from './upgrade-banner' +import { useIndexingStatusPolling } from './use-indexing-status-polling' +import { createDocumentLookup } from './utils' -type Props = { +type EmbeddingProcessProps = { datasetId: string batchId: string documents?: FullDocumentDetail[] @@ -49,333 +31,121 @@ type Props = { retrievalMethod?: RETRIEVE_METHOD } -const RuleDetail: FC<{ - sourceData?: ProcessRuleResponse - indexingType?: string - retrievalMethod?: RETRIEVE_METHOD -}> = ({ sourceData, indexingType, retrievalMethod }) => { +// Status header component +const StatusHeader: FC<{ isEmbedding: boolean, isCompleted: boolean }> = ({ + isEmbedding, + isCompleted, +}) => { const { t } = useTranslation() - const segmentationRuleMap = { - mode: t('embedding.mode', { ns: 'datasetDocuments' }), - segmentLength: t('embedding.segmentLength', { ns: 'datasetDocuments' }), - textCleaning: t('embedding.textCleaning', { ns: 'datasetDocuments' }), - } - - const getRuleName = (key: string) => { - if (key === 'remove_extra_spaces') - return t('stepTwo.removeExtraSpaces', { ns: 'datasetCreation' }) - - if (key === 'remove_urls_emails') - return t('stepTwo.removeUrlEmails', { ns: 'datasetCreation' }) - - if (key === 'remove_stopwords') - return t('stepTwo.removeStopwords', { ns: 'datasetCreation' }) - } - - const isNumber = (value: unknown) => { - return typeof value === 'number' - } - - const getValue = useCallback((field: string) => { - let value: string | number | undefined = '-' - const maxTokens = isNumber(sourceData?.rules?.segmentation?.max_tokens) - ? sourceData.rules.segmentation.max_tokens - : value - const childMaxTokens = isNumber(sourceData?.rules?.subchunk_segmentation?.max_tokens) - ? 
sourceData.rules.subchunk_segmentation.max_tokens - : value - switch (field) { - case 'mode': - value = !sourceData?.mode - ? value - : sourceData.mode === ProcessMode.general - ? (t('embedding.custom', { ns: 'datasetDocuments' }) as string) - : `${t('embedding.hierarchical', { ns: 'datasetDocuments' })} · ${sourceData?.rules?.parent_mode === 'paragraph' - ? t('parentMode.paragraph', { ns: 'dataset' }) - : t('parentMode.fullDoc', { ns: 'dataset' })}` - break - case 'segmentLength': - value = !sourceData?.mode - ? value - : sourceData.mode === ProcessMode.general - ? maxTokens - : `${t('embedding.parentMaxTokens', { ns: 'datasetDocuments' })} ${maxTokens}; ${t('embedding.childMaxTokens', { ns: 'datasetDocuments' })} ${childMaxTokens}` - break - default: - value = !sourceData?.mode - ? value - : sourceData?.rules?.pre_processing_rules?.filter(rule => - rule.enabled).map(rule => getRuleName(rule.id)).join(',') - break - } - return value - }, [sourceData]) - return ( -
- {Object.keys(segmentationRuleMap).map((field) => { - return ( - - ) - })} - - )} - /> - - )} - /> +
+ {isEmbedding && ( + <> + + {t('embedding.processing', { ns: 'datasetDocuments' })} + + )} + {isCompleted && t('embedding.completed', { ns: 'datasetDocuments' })}
) } -const EmbeddingProcess: FC = ({ datasetId, batchId, documents = [], indexingType, retrievalMethod }) => { +// Action buttons component +const ActionButtons: FC<{ + apiReferenceUrl: string + onNavToDocuments: () => void +}> = ({ apiReferenceUrl, onNavToDocuments }) => { const { t } = useTranslation() + + return ( +
+ + + + +
+ ) +} + +const EmbeddingProcess: FC = ({ + datasetId, + batchId, + documents = [], + indexingType, + retrievalMethod, +}) => { const { enableBilling, plan } = useProviderContext() - - const getFirstDocument = documents[0] - - const [indexingStatusBatchDetail, setIndexingStatusDetail] = useState([]) - const fetchIndexingStatus = async () => { - const status = await doFetchIndexingStatus({ datasetId, batchId }) - setIndexingStatusDetail(status.data) - return status.data - } - - const [isStopQuery, setIsStopQuery] = useState(false) - const isStopQueryRef = useRef(isStopQuery) - useEffect(() => { - isStopQueryRef.current = isStopQuery - }, [isStopQuery]) - const stopQueryStatus = () => { - setIsStopQuery(true) - } - - const startQueryStatus = async () => { - if (isStopQueryRef.current) - return - - try { - const indexingStatusBatchDetail = await fetchIndexingStatus() - const isCompleted = indexingStatusBatchDetail.every(indexingStatusDetail => ['completed', 'error', 'paused'].includes(indexingStatusDetail.indexing_status)) - if (isCompleted) { - stopQueryStatus() - return - } - await sleep(2500) - await startQueryStatus() - } - catch { - await sleep(2500) - await startQueryStatus() - } - } - - useEffect(() => { - setIsStopQuery(false) - startQueryStatus() - return () => { - stopQueryStatus() - } - }, []) - - // get rule - const { data: ruleDetail } = useProcessRule(getFirstDocument?.id) - const router = useRouter() const invalidDocumentList = useInvalidDocumentList() - const navToDocumentList = () => { + const apiReferenceUrl = useDatasetApiAccessUrl() + + // Polling hook for indexing status + const { statusList, isEmbedding, isEmbeddingCompleted } = useIndexingStatusPolling({ + datasetId, + batchId, + }) + + // Get process rule for the first document + const firstDocumentId = documents[0]?.id + const { data: ruleDetail } = useProcessRule(firstDocumentId) + + // Document lookup utilities - memoized for performance + const documentLookup = useMemo( + () => 
createDocumentLookup(documents), + [documents], + ) + + const handleNavToDocuments = () => { invalidDocumentList() router.push(`/datasets/${datasetId}/documents`) } - const apiReferenceUrl = useDatasetApiAccessUrl() - const isEmbedding = useMemo(() => { - return indexingStatusBatchDetail.some(indexingStatusDetail => ['indexing', 'splitting', 'parsing', 'cleaning'].includes(indexingStatusDetail?.indexing_status || '')) - }, [indexingStatusBatchDetail]) - const isEmbeddingCompleted = useMemo(() => { - return indexingStatusBatchDetail.every(indexingStatusDetail => ['completed', 'error', 'paused'].includes(indexingStatusDetail?.indexing_status || '')) - }, [indexingStatusBatchDetail]) - - const getSourceName = (id: string) => { - const doc = documents.find(document => document.id === id) - return doc?.name - } - const getFileType = (name?: string) => name?.split('.').pop() || 'txt' - const getSourcePercent = (detail: IndexingStatusResponse) => { - const completedCount = detail.completed_segments || 0 - const totalCount = detail.total_segments || 0 - if (totalCount === 0) - return 0 - const percent = Math.round(completedCount * 100 / totalCount) - return percent > 100 ? 
100 : percent - } - const getSourceType = (id: string) => { - const doc = documents.find(document => document.id === id) - return doc?.data_source_type as DataSourceType - } - - const isLegacyDataSourceInfo = (info: DataSourceInfo): info is LegacyDataSourceInfo => { - return info != null && typeof (info as LegacyDataSourceInfo).upload_file === 'object' - } - - const getIcon = (id: string) => { - const doc = documents.find(document => document.id === id) - const info = doc?.data_source_info - if (info && isLegacyDataSourceInfo(info)) - return info.notion_page_icon - return undefined - } - const isSourceEmbedding = (detail: IndexingStatusResponse) => - ['indexing', 'splitting', 'parsing', 'cleaning', 'waiting'].includes(detail.indexing_status || '') + const showUpgradeBanner = enableBilling && plan.type !== Plan.team return ( <>
-
- {isEmbedding && ( - <> - - {t('embedding.processing', { ns: 'datasetDocuments' })} - - )} - {isEmbeddingCompleted && t('embedding.completed', { ns: 'datasetDocuments' })} -
- { - enableBilling && plan.type !== Plan.team && ( -
-
- -
-
- {t('plansCommon.documentProcessingPriorityUpgrade', { ns: 'billing' })} -
- -
- ) - } + + + {showUpgradeBanner && } +
- {indexingStatusBatchDetail.map(indexingStatusDetail => ( -
- {isSourceEmbedding(indexingStatusDetail) && ( -
- )} -
- {getSourceType(indexingStatusDetail.id) === DataSourceType.FILE && ( - - )} - {getSourceType(indexingStatusDetail.id) === DataSourceType.NOTION && ( - - )} -
-
- {getSourceName(indexingStatusDetail.id)} -
- { - enableBilling && ( - - ) - } -
- {isSourceEmbedding(indexingStatusDetail) && ( -
{`${getSourcePercent(indexingStatusDetail)}%`}
- )} - {indexingStatusDetail.indexing_status === 'error' && ( - - - - - - )} - {indexingStatusDetail.indexing_status === 'completed' && ( - - )} -
-
+ {statusList.map(detail => ( + ))}
+ +
-
- - - - -
+ + ) } diff --git a/web/app/components/datasets/create/embedding-process/indexing-progress-item.tsx b/web/app/components/datasets/create/embedding-process/indexing-progress-item.tsx new file mode 100644 index 0000000000..b7c085cff9 --- /dev/null +++ b/web/app/components/datasets/create/embedding-process/indexing-progress-item.tsx @@ -0,0 +1,120 @@ +import type { FC } from 'react' +import type { IndexingStatusResponse } from '@/models/datasets' +import { + RiCheckboxCircleFill, + RiErrorWarningFill, +} from '@remixicon/react' +import NotionIcon from '@/app/components/base/notion-icon' +import Tooltip from '@/app/components/base/tooltip' +import PriorityLabel from '@/app/components/billing/priority-label' +import { DataSourceType } from '@/models/datasets' +import { cn } from '@/utils/classnames' +import DocumentFileIcon from '../../common/document-file-icon' +import { getFileType, getSourcePercent, isSourceEmbedding } from './utils' + +type IndexingProgressItemProps = { + detail: IndexingStatusResponse + name?: string + sourceType?: DataSourceType + notionIcon?: string + enableBilling?: boolean +} + +// Status icon component for completed/error states +const StatusIcon: FC<{ status: string, error?: string }> = ({ status, error }) => { + if (status === 'completed') + return + + if (status === 'error') { + return ( + + + + + + ) + } + + return null +} + +// Source type icon component +const SourceTypeIcon: FC<{ + sourceType?: DataSourceType + name?: string + notionIcon?: string +}> = ({ sourceType, name, notionIcon }) => { + if (sourceType === DataSourceType.FILE) { + return ( + + ) + } + + if (sourceType === DataSourceType.NOTION) { + return ( + + ) + } + + return null +} + +const IndexingProgressItem: FC = ({ + detail, + name, + sourceType, + notionIcon, + enableBilling, +}) => { + const isEmbedding = isSourceEmbedding(detail) + const percent = getSourcePercent(detail) + const isError = detail.indexing_status === 'error' + + return ( +
+ {isEmbedding && ( +
+ )} +
+ +
+
+ {name} +
+ {enableBilling && } +
+ {isEmbedding && ( +
{`${percent}%`}
+ )} + +
+
+ ) +} + +export default IndexingProgressItem diff --git a/web/app/components/datasets/create/embedding-process/rule-detail.tsx b/web/app/components/datasets/create/embedding-process/rule-detail.tsx new file mode 100644 index 0000000000..dff35100cb --- /dev/null +++ b/web/app/components/datasets/create/embedding-process/rule-detail.tsx @@ -0,0 +1,133 @@ +import type { FC } from 'react' +import type { ProcessRuleResponse } from '@/models/datasets' +import Image from 'next/image' +import { useCallback } from 'react' +import { useTranslation } from 'react-i18next' +import { FieldInfo } from '@/app/components/datasets/documents/detail/metadata' +import { ProcessMode } from '@/models/datasets' +import { RETRIEVE_METHOD } from '@/types/app' +import { indexMethodIcon, retrievalIcon } from '../icons' +import { IndexingType } from '../step-two' + +type RuleDetailProps = { + sourceData?: ProcessRuleResponse + indexingType?: string + retrievalMethod?: RETRIEVE_METHOD +} + +// Lookup table for pre-processing rule names +const PRE_PROCESSING_RULE_KEYS = { + remove_extra_spaces: 'stepTwo.removeExtraSpaces', + remove_urls_emails: 'stepTwo.removeUrlEmails', + remove_stopwords: 'stepTwo.removeStopwords', +} as const + +// Lookup table for retrieval method icons +const RETRIEVAL_ICON_MAP: Partial> = { + [RETRIEVE_METHOD.fullText]: retrievalIcon.fullText, + [RETRIEVE_METHOD.hybrid]: retrievalIcon.hybrid, + [RETRIEVE_METHOD.semantic]: retrievalIcon.vector, + [RETRIEVE_METHOD.invertedIndex]: retrievalIcon.fullText, + [RETRIEVE_METHOD.keywordSearch]: retrievalIcon.fullText, +} + +const isNumber = (value: unknown): value is number => typeof value === 'number' + +const RuleDetail: FC = ({ sourceData, indexingType, retrievalMethod }) => { + const { t } = useTranslation() + + const segmentationRuleLabels = { + mode: t('embedding.mode', { ns: 'datasetDocuments' }), + segmentLength: t('embedding.segmentLength', { ns: 'datasetDocuments' }), + textCleaning: t('embedding.textCleaning', { ns: 
'datasetDocuments' }), + } + + const getRuleName = useCallback((key: string): string | undefined => { + const translationKey = PRE_PROCESSING_RULE_KEYS[key as keyof typeof PRE_PROCESSING_RULE_KEYS] + return translationKey ? t(translationKey, { ns: 'datasetCreation' }) : undefined + }, [t]) + + const getModeValue = useCallback((): string => { + if (!sourceData?.mode) + return '-' + + if (sourceData.mode === ProcessMode.general) + return t('embedding.custom', { ns: 'datasetDocuments' }) + + const parentModeLabel = sourceData.rules?.parent_mode === 'paragraph' + ? t('parentMode.paragraph', { ns: 'dataset' }) + : t('parentMode.fullDoc', { ns: 'dataset' }) + + return `${t('embedding.hierarchical', { ns: 'datasetDocuments' })} · ${parentModeLabel}` + }, [sourceData, t]) + + const getSegmentLengthValue = useCallback((): string | number => { + if (!sourceData?.mode) + return '-' + + const maxTokens = isNumber(sourceData.rules?.segmentation?.max_tokens) + ? sourceData.rules.segmentation.max_tokens + : '-' + + if (sourceData.mode === ProcessMode.general) + return maxTokens + + const childMaxTokens = isNumber(sourceData.rules?.subchunk_segmentation?.max_tokens) + ? sourceData.rules.subchunk_segmentation.max_tokens + : '-' + + return `${t('embedding.parentMaxTokens', { ns: 'datasetDocuments' })} ${maxTokens}; ${t('embedding.childMaxTokens', { ns: 'datasetDocuments' })} ${childMaxTokens}` + }, [sourceData, t]) + + const getTextCleaningValue = useCallback((): string => { + if (!sourceData?.mode) + return '-' + + const enabledRules = sourceData.rules?.pre_processing_rules?.filter(rule => rule.enabled) || [] + const ruleNames = enabledRules + .map((rule) => { + const name = getRuleName(rule.id) + return typeof name === 'string' ? name : '' + }) + .filter(name => name) + return ruleNames.length > 0 ? 
ruleNames.join(',') : '-' + }, [sourceData, getRuleName]) + + const fieldValueGetters: Record string | number> = { + mode: getModeValue, + segmentLength: getSegmentLengthValue, + textCleaning: getTextCleaningValue, + } + + const isEconomical = indexingType === IndexingType.ECONOMICAL + const indexMethodIconSrc = isEconomical ? indexMethodIcon.economical : indexMethodIcon.high_quality + const indexModeLabel = t(`stepTwo.${isEconomical ? 'economical' : 'qualified'}`, { ns: 'datasetCreation' }) + + const effectiveRetrievalMethod = isEconomical ? 'keyword_search' : (retrievalMethod ?? 'semantic_search') + const retrievalLabel = t(`retrieval.${effectiveRetrievalMethod}.title`, { ns: 'dataset' }) + const retrievalIconSrc = RETRIEVAL_ICON_MAP[retrievalMethod as keyof typeof RETRIEVAL_ICON_MAP] ?? retrievalIcon.vector + + return ( +
+ {Object.keys(segmentationRuleLabels).map(field => ( + + ))} + } + /> + } + /> +
+ ) +} + +export default RuleDetail diff --git a/web/app/components/datasets/create/embedding-process/upgrade-banner.tsx b/web/app/components/datasets/create/embedding-process/upgrade-banner.tsx new file mode 100644 index 0000000000..49e5fe99a1 --- /dev/null +++ b/web/app/components/datasets/create/embedding-process/upgrade-banner.tsx @@ -0,0 +1,22 @@ +import type { FC } from 'react' +import { useTranslation } from 'react-i18next' +import { ZapFast } from '@/app/components/base/icons/src/vender/solid/general' +import UpgradeBtn from '@/app/components/billing/upgrade-btn' + +const UpgradeBanner: FC = () => { + const { t } = useTranslation() + + return ( +
+
+ +
+
+ {t('plansCommon.documentProcessingPriorityUpgrade', { ns: 'billing' })} +
+ +
+ ) +} + +export default UpgradeBanner diff --git a/web/app/components/datasets/create/embedding-process/use-indexing-status-polling.ts b/web/app/components/datasets/create/embedding-process/use-indexing-status-polling.ts new file mode 100644 index 0000000000..f8e69e47af --- /dev/null +++ b/web/app/components/datasets/create/embedding-process/use-indexing-status-polling.ts @@ -0,0 +1,90 @@ +import type { IndexingStatusResponse } from '@/models/datasets' +import { useEffect, useRef, useState } from 'react' +import { fetchIndexingStatusBatch } from '@/service/datasets' + +const POLLING_INTERVAL = 2500 +const COMPLETED_STATUSES = ['completed', 'error', 'paused'] as const +const EMBEDDING_STATUSES = ['indexing', 'splitting', 'parsing', 'cleaning', 'waiting'] as const + +type IndexingStatusPollingParams = { + datasetId: string + batchId: string +} + +type IndexingStatusPollingResult = { + statusList: IndexingStatusResponse[] + isEmbedding: boolean + isEmbeddingCompleted: boolean +} + +const isStatusCompleted = (status: string): boolean => + COMPLETED_STATUSES.includes(status as typeof COMPLETED_STATUSES[number]) + +const isAllCompleted = (statusList: IndexingStatusResponse[]): boolean => + statusList.every(item => isStatusCompleted(item.indexing_status)) + +/** + * Custom hook for polling indexing status with automatic stop on completion. + * Handles the polling lifecycle and provides derived states for UI rendering. 
+ */ +export const useIndexingStatusPolling = ({ + datasetId, + batchId, +}: IndexingStatusPollingParams): IndexingStatusPollingResult => { + const [statusList, setStatusList] = useState([]) + const isStopPollingRef = useRef(false) + + useEffect(() => { + // Reset polling state on mount + isStopPollingRef.current = false + let timeoutId: ReturnType | null = null + + const fetchStatus = async (): Promise => { + const response = await fetchIndexingStatusBatch({ datasetId, batchId }) + setStatusList(response.data) + return response.data + } + + const poll = async (): Promise => { + if (isStopPollingRef.current) + return + + try { + const data = await fetchStatus() + if (isAllCompleted(data)) { + isStopPollingRef.current = true + return + } + } + catch { + // Continue polling on error + } + + if (!isStopPollingRef.current) { + timeoutId = setTimeout(() => { + poll() + }, POLLING_INTERVAL) + } + } + + poll() + + return () => { + isStopPollingRef.current = true + if (timeoutId) + clearTimeout(timeoutId) + } + }, [datasetId, batchId]) + + const isEmbedding = statusList.some(item => + EMBEDDING_STATUSES.includes(item?.indexing_status as typeof EMBEDDING_STATUSES[number]), + ) + + const isEmbeddingCompleted = statusList.length > 0 && isAllCompleted(statusList) + + return { + statusList, + isEmbedding, + isEmbeddingCompleted, + } +} diff --git a/web/app/components/datasets/create/embedding-process/utils.ts b/web/app/components/datasets/create/embedding-process/utils.ts new file mode 100644 index 0000000000..6fbefb0230 --- /dev/null +++ b/web/app/components/datasets/create/embedding-process/utils.ts @@ -0,0 +1,64 @@ +import type { + DataSourceInfo, + DataSourceType, + FullDocumentDetail, + IndexingStatusResponse, + LegacyDataSourceInfo, +} from '@/models/datasets' + +const EMBEDDING_STATUSES = ['indexing', 'splitting', 'parsing', 'cleaning', 'waiting'] as const + +/** + * Type guard for legacy data source info with upload_file property + */ +export const 
isLegacyDataSourceInfo = (info: DataSourceInfo): info is LegacyDataSourceInfo => { + return info != null && typeof (info as LegacyDataSourceInfo).upload_file === 'object' +} + +/** + * Check if a status indicates the source is being embedded + */ +export const isSourceEmbedding = (detail: IndexingStatusResponse): boolean => + EMBEDDING_STATUSES.includes(detail.indexing_status as typeof EMBEDDING_STATUSES[number]) + +/** + * Calculate the progress percentage for a document + */ +export const getSourcePercent = (detail: IndexingStatusResponse): number => { + const completedCount = detail.completed_segments || 0 + const totalCount = detail.total_segments || 0 + + if (totalCount === 0) + return 0 + + const percent = Math.round(completedCount * 100 / totalCount) + return Math.min(percent, 100) +} + +/** + * Get file extension from filename, defaults to 'txt' + */ +export const getFileType = (name?: string): string => + name?.split('.').pop() || 'txt' + +/** + * Document lookup utilities - provides document info by ID from a list + */ +export const createDocumentLookup = (documents: FullDocumentDetail[]) => { + const documentMap = new Map(documents.map(doc => [doc.id, doc])) + + return { + getDocument: (id: string) => documentMap.get(id), + + getName: (id: string) => documentMap.get(id)?.name, + + getSourceType: (id: string) => documentMap.get(id)?.data_source_type as DataSourceType | undefined, + + getNotionIcon: (id: string) => { + const info = documentMap.get(id)?.data_source_info + if (info && isLegacyDataSourceInfo(info)) + return info.notion_page_icon + return undefined + }, + } +} diff --git a/web/app/components/datasets/create/step-two/components/general-chunking-options.tsx b/web/app/components/datasets/create/step-two/components/general-chunking-options.tsx new file mode 100644 index 0000000000..5140c902f5 --- /dev/null +++ b/web/app/components/datasets/create/step-two/components/general-chunking-options.tsx @@ -0,0 +1,199 @@ +'use client' + +import type { FC } 
from 'react' +import type { PreProcessingRule } from '@/models/datasets' +import { + RiAlertFill, + RiSearchEyeLine, +} from '@remixicon/react' +import Image from 'next/image' +import { useTranslation } from 'react-i18next' +import Button from '@/app/components/base/button' +import Checkbox from '@/app/components/base/checkbox' +import Divider from '@/app/components/base/divider' +import Tooltip from '@/app/components/base/tooltip' +import { IS_CE_EDITION } from '@/config' +import { ChunkingMode } from '@/models/datasets' +import SettingCog from '../../assets/setting-gear-mod.svg' +import s from '../index.module.css' +import LanguageSelect from '../language-select' +import { DelimiterInput, MaxLengthInput, OverlapInput } from './inputs' +import { OptionCard } from './option-card' + +type TextLabelProps = { + children: React.ReactNode +} + +const TextLabel: FC = ({ children }) => { + return +} + +type GeneralChunkingOptionsProps = { + // State + segmentIdentifier: string + maxChunkLength: number + overlap: number + rules: PreProcessingRule[] + currentDocForm: ChunkingMode + docLanguage: string + // Flags + isActive: boolean + isInUpload: boolean + isNotUploadInEmptyDataset: boolean + hasCurrentDatasetDocForm: boolean + // Actions + onSegmentIdentifierChange: (value: string) => void + onMaxChunkLengthChange: (value: number) => void + onOverlapChange: (value: number) => void + onRuleToggle: (id: string) => void + onDocFormChange: (form: ChunkingMode) => void + onDocLanguageChange: (lang: string) => void + onPreview: () => void + onReset: () => void + // Locale + locale: string +} + +export const GeneralChunkingOptions: FC = ({ + segmentIdentifier, + maxChunkLength, + overlap, + rules, + currentDocForm, + docLanguage, + isActive, + isInUpload, + isNotUploadInEmptyDataset, + hasCurrentDatasetDocForm, + onSegmentIdentifierChange, + onMaxChunkLengthChange, + onOverlapChange, + onRuleToggle, + onDocFormChange, + onDocLanguageChange, + onPreview, + onReset, + locale, +}) => 
{ + const { t } = useTranslation() + + const getRuleName = (key: string): string => { + const ruleNameMap: Record = { + remove_extra_spaces: t('stepTwo.removeExtraSpaces', { ns: 'datasetCreation' }), + remove_urls_emails: t('stepTwo.removeUrlEmails', { ns: 'datasetCreation' }), + remove_stopwords: t('stepTwo.removeStopwords', { ns: 'datasetCreation' }), + } + return ruleNameMap[key] ?? key + } + + return ( + } + activeHeaderClassName="bg-dataset-option-card-blue-gradient" + description={t('stepTwo.generalTip', { ns: 'datasetCreation' })} + isActive={isActive} + onSwitched={() => onDocFormChange(ChunkingMode.text)} + actions={( + <> + + + + )} + noHighlight={isInUpload && isNotUploadInEmptyDataset} + > +
+
+ onSegmentIdentifierChange(e.target.value)} + /> + + +
+
+
+
+ {t('stepTwo.rules', { ns: 'datasetCreation' })} +
+ +
+
+ {rules.map(rule => ( +
onRuleToggle(rule.id)} + > + + +
+ ))} + {IS_CE_EDITION && ( + <> + +
+
{ + if (hasCurrentDatasetDocForm) + return + if (currentDocForm === ChunkingMode.qa) + onDocFormChange(ChunkingMode.text) + else + onDocFormChange(ChunkingMode.qa) + }} + > + + +
+ + +
+ {currentDocForm === ChunkingMode.qa && ( +
+ + + {t('stepTwo.QATip', { ns: 'datasetCreation' })} + +
+ )} + + )} +
+
+
+
+ ) +} diff --git a/web/app/components/datasets/create/step-two/components/index.ts b/web/app/components/datasets/create/step-two/components/index.ts new file mode 100644 index 0000000000..d5382e0c4b --- /dev/null +++ b/web/app/components/datasets/create/step-two/components/index.ts @@ -0,0 +1,5 @@ +export { GeneralChunkingOptions } from './general-chunking-options' +export { IndexingModeSection } from './indexing-mode-section' +export { ParentChildOptions } from './parent-child-options' +export { PreviewPanel } from './preview-panel' +export { StepTwoFooter } from './step-two-footer' diff --git a/web/app/components/datasets/create/step-two/components/indexing-mode-section.tsx b/web/app/components/datasets/create/step-two/components/indexing-mode-section.tsx new file mode 100644 index 0000000000..ee49f42903 --- /dev/null +++ b/web/app/components/datasets/create/step-two/components/indexing-mode-section.tsx @@ -0,0 +1,253 @@ +'use client' + +import type { FC } from 'react' +import type { DefaultModel, Model } from '@/app/components/header/account-setting/model-provider-page/declarations' +import type { RetrievalConfig } from '@/types/app' +import Image from 'next/image' +import Link from 'next/link' +import { useTranslation } from 'react-i18next' +import Badge from '@/app/components/base/badge' +import Button from '@/app/components/base/button' +import CustomDialog from '@/app/components/base/dialog' +import Divider from '@/app/components/base/divider' +import { AlertTriangle } from '@/app/components/base/icons/src/vender/solid/alertsAndFeedback' +import Tooltip from '@/app/components/base/tooltip' +import EconomicalRetrievalMethodConfig from '@/app/components/datasets/common/economical-retrieval-method-config' +import RetrievalMethodConfig from '@/app/components/datasets/common/retrieval-method-config' +import ModelSelector from '@/app/components/header/account-setting/model-provider-page/model-selector' +import { useDocLink } from '@/context/i18n' +import { 
ChunkingMode } from '@/models/datasets' +import { cn } from '@/utils/classnames' +import { indexMethodIcon } from '../../icons' +import { IndexingType } from '../hooks' +import s from '../index.module.css' +import { OptionCard } from './option-card' + +type IndexingModeSectionProps = { + // State + indexType: IndexingType + hasSetIndexType: boolean + docForm: ChunkingMode + embeddingModel: DefaultModel + embeddingModelList?: Model[] + retrievalConfig: RetrievalConfig + showMultiModalTip: boolean + // Flags + isModelAndRetrievalConfigDisabled: boolean + datasetId?: string + // Modal state + isQAConfirmDialogOpen: boolean + // Actions + onIndexTypeChange: (type: IndexingType) => void + onEmbeddingModelChange: (model: DefaultModel) => void + onRetrievalConfigChange: (config: RetrievalConfig) => void + onQAConfirmDialogClose: () => void + onQAConfirmDialogConfirm: () => void +} + +export const IndexingModeSection: FC = ({ + indexType, + hasSetIndexType, + docForm, + embeddingModel, + embeddingModelList, + retrievalConfig, + showMultiModalTip, + isModelAndRetrievalConfigDisabled, + datasetId, + isQAConfirmDialogOpen, + onIndexTypeChange, + onEmbeddingModelChange, + onRetrievalConfigChange, + onQAConfirmDialogClose, + onQAConfirmDialogConfirm, +}) => { + const { t } = useTranslation() + const docLink = useDocLink() + + const getIndexingTechnique = () => indexType + + return ( + <> + {/* Index Mode */} +
+ {t('stepTwo.indexMode', { ns: 'datasetCreation' })} +
+
+ {/* Qualified option */} + {(!hasSetIndexType || (hasSetIndexType && indexType === IndexingType.QUALIFIED)) && ( + + {t('stepTwo.qualified', { ns: 'datasetCreation' })} + + {t('stepTwo.recommend', { ns: 'datasetCreation' })} + + + {!hasSetIndexType && } + +
+ )} + description={t('stepTwo.qualifiedTip', { ns: 'datasetCreation' })} + icon={} + isActive={!hasSetIndexType && indexType === IndexingType.QUALIFIED} + disabled={hasSetIndexType} + onSwitched={() => onIndexTypeChange(IndexingType.QUALIFIED)} + /> + )} + + {/* Economical option */} + {(!hasSetIndexType || (hasSetIndexType && indexType === IndexingType.ECONOMICAL)) && ( + <> + +
+

+ {t('stepTwo.qaSwitchHighQualityTipTitle', { ns: 'datasetCreation' })} +

+

+ {t('stepTwo.qaSwitchHighQualityTipContent', { ns: 'datasetCreation' })} +

+
+
+ + +
+
+ + {docForm === ChunkingMode.qa + ? t('stepTwo.notAvailableForQA', { ns: 'datasetCreation' }) + : t('stepTwo.notAvailableForParentChild', { ns: 'datasetCreation' })} +
+ )} + noDecoration + position="top" + asChild={false} + triggerClassName="flex-1 self-stretch" + > + } + isActive={!hasSetIndexType && indexType === IndexingType.ECONOMICAL} + disabled={hasSetIndexType || docForm !== ChunkingMode.text} + onSwitched={() => onIndexTypeChange(IndexingType.ECONOMICAL)} + /> + + + )} +
+ + {/* High quality tip */} + {!hasSetIndexType && indexType === IndexingType.QUALIFIED && ( +
+
+
+ +
+ + {t('stepTwo.highQualityTip', { ns: 'datasetCreation' })} + +
+ )} + + {/* Economical index setting tip */} + {hasSetIndexType && indexType === IndexingType.ECONOMICAL && ( +
+ {t('stepTwo.indexSettingTip', { ns: 'datasetCreation' })} + + {t('stepTwo.datasetSettingLink', { ns: 'datasetCreation' })} + +
+ )} + + {/* Embedding model */} + {indexType === IndexingType.QUALIFIED && ( +
+
+ {t('form.embeddingModel', { ns: 'datasetSettings' })} +
+ + {isModelAndRetrievalConfigDisabled && ( +
+ {t('stepTwo.indexSettingTip', { ns: 'datasetCreation' })} + + {t('stepTwo.datasetSettingLink', { ns: 'datasetCreation' })} + +
+ )} +
+ )} + + + + {/* Retrieval Method Config */} +
+ {!isModelAndRetrievalConfigDisabled + ? ( +
+
+ {t('form.retrievalSetting.title', { ns: 'datasetSettings' })} +
+
+ + {t('form.retrievalSetting.learnMore', { ns: 'datasetSettings' })} + + {t('form.retrievalSetting.longDescription', { ns: 'datasetSettings' })} +
+
+ ) + : ( +
+
{t('form.retrievalSetting.title', { ns: 'datasetSettings' })}
+
+ )} + +
+ {getIndexingTechnique() === IndexingType.QUALIFIED + ? ( + + ) + : ( + + )} +
+
+ + ) +} diff --git a/web/app/components/datasets/create/step-two/inputs.tsx b/web/app/components/datasets/create/step-two/components/inputs.tsx similarity index 100% rename from web/app/components/datasets/create/step-two/inputs.tsx rename to web/app/components/datasets/create/step-two/components/inputs.tsx diff --git a/web/app/components/datasets/create/step-two/option-card.tsx b/web/app/components/datasets/create/step-two/components/option-card.tsx similarity index 100% rename from web/app/components/datasets/create/step-two/option-card.tsx rename to web/app/components/datasets/create/step-two/components/option-card.tsx diff --git a/web/app/components/datasets/create/step-two/components/parent-child-options.tsx b/web/app/components/datasets/create/step-two/components/parent-child-options.tsx new file mode 100644 index 0000000000..e46aa5817b --- /dev/null +++ b/web/app/components/datasets/create/step-two/components/parent-child-options.tsx @@ -0,0 +1,191 @@ +'use client' + +import type { FC } from 'react' +import type { ParentChildConfig } from '../hooks' +import type { ParentMode, PreProcessingRule } from '@/models/datasets' +import { RiSearchEyeLine } from '@remixicon/react' +import Image from 'next/image' +import { useTranslation } from 'react-i18next' +import Button from '@/app/components/base/button' +import Checkbox from '@/app/components/base/checkbox' +import Divider from '@/app/components/base/divider' +import { ParentChildChunk } from '@/app/components/base/icons/src/vender/knowledge' +import RadioCard from '@/app/components/base/radio-card' +import { ChunkingMode } from '@/models/datasets' +import FileList from '../../assets/file-list-3-fill.svg' +import Note from '../../assets/note-mod.svg' +import BlueEffect from '../../assets/option-card-effect-blue.svg' +import s from '../index.module.css' +import { DelimiterInput, MaxLengthInput } from './inputs' +import { OptionCard } from './option-card' + +type TextLabelProps = { + children: React.ReactNode +} 
+ +const TextLabel: FC = ({ children }) => { + return +} + +type ParentChildOptionsProps = { + // State + parentChildConfig: ParentChildConfig + rules: PreProcessingRule[] + currentDocForm: ChunkingMode + // Flags + isActive: boolean + isInUpload: boolean + isNotUploadInEmptyDataset: boolean + // Actions + onDocFormChange: (form: ChunkingMode) => void + onChunkForContextChange: (mode: ParentMode) => void + onParentDelimiterChange: (value: string) => void + onParentMaxLengthChange: (value: number) => void + onChildDelimiterChange: (value: string) => void + onChildMaxLengthChange: (value: number) => void + onRuleToggle: (id: string) => void + onPreview: () => void + onReset: () => void +} + +export const ParentChildOptions: FC = ({ + parentChildConfig, + rules, + currentDocForm: _currentDocForm, + isActive, + isInUpload, + isNotUploadInEmptyDataset, + onDocFormChange, + onChunkForContextChange, + onParentDelimiterChange, + onParentMaxLengthChange, + onChildDelimiterChange, + onChildMaxLengthChange, + onRuleToggle, + onPreview, + onReset, +}) => { + const { t } = useTranslation() + + const getRuleName = (key: string): string => { + const ruleNameMap: Record = { + remove_extra_spaces: t('stepTwo.removeExtraSpaces', { ns: 'datasetCreation' }), + remove_urls_emails: t('stepTwo.removeUrlEmails', { ns: 'datasetCreation' }), + remove_stopwords: t('stepTwo.removeStopwords', { ns: 'datasetCreation' }), + } + return ruleNameMap[key] ?? key + } + + return ( + } + effectImg={BlueEffect.src} + className="text-util-colors-blue-light-blue-light-500" + activeHeaderClassName="bg-dataset-option-card-blue-gradient" + description={t('stepTwo.parentChildTip', { ns: 'datasetCreation' })} + isActive={isActive} + onSwitched={() => onDocFormChange(ChunkingMode.parentChild)} + actions={( + <> + + + + )} + noHighlight={isInUpload && isNotUploadInEmptyDataset} + > +
+ {/* Parent chunk for context */} +
+
+
+ {t('stepTwo.parentChunkForContext', { ns: 'datasetCreation' })} +
+ +
+ } + title={t('stepTwo.paragraph', { ns: 'datasetCreation' })} + description={t('stepTwo.paragraphTip', { ns: 'datasetCreation' })} + isChosen={parentChildConfig.chunkForContext === 'paragraph'} + onChosen={() => onChunkForContextChange('paragraph')} + chosenConfig={( +
+ onParentDelimiterChange(e.target.value)} + /> + +
+ )} + /> + } + title={t('stepTwo.fullDoc', { ns: 'datasetCreation' })} + description={t('stepTwo.fullDocTip', { ns: 'datasetCreation' })} + onChosen={() => onChunkForContextChange('full-doc')} + isChosen={parentChildConfig.chunkForContext === 'full-doc'} + /> +
+ + {/* Child chunk for retrieval */} +
+
+
+ {t('stepTwo.childChunkForRetrieval', { ns: 'datasetCreation' })} +
+ +
+
+ onChildDelimiterChange(e.target.value)} + /> + +
+
+ + {/* Rules */} +
+
+
+ {t('stepTwo.rules', { ns: 'datasetCreation' })} +
+ +
+
+ {rules.map(rule => ( +
onRuleToggle(rule.id)} + > + + +
+ ))} +
+
+
+
+ ) +} diff --git a/web/app/components/datasets/create/step-two/components/preview-panel.tsx b/web/app/components/datasets/create/step-two/components/preview-panel.tsx new file mode 100644 index 0000000000..4f25cee5bd --- /dev/null +++ b/web/app/components/datasets/create/step-two/components/preview-panel.tsx @@ -0,0 +1,171 @@ +'use client' + +import type { FC } from 'react' +import type { ParentChildConfig } from '../hooks' +import type { DataSourceType, FileIndexingEstimateResponse } from '@/models/datasets' +import { RiSearchEyeLine } from '@remixicon/react' +import { noop } from 'es-toolkit/function' +import { useTranslation } from 'react-i18next' +import Badge from '@/app/components/base/badge' +import FloatRightContainer from '@/app/components/base/float-right-container' +import { SkeletonContainer, SkeletonPoint, SkeletonRectangle, SkeletonRow } from '@/app/components/base/skeleton' +import { FULL_DOC_PREVIEW_LENGTH } from '@/config' +import { ChunkingMode } from '@/models/datasets' +import { cn } from '@/utils/classnames' +import { ChunkContainer, QAPreview } from '../../../chunk' +import PreviewDocumentPicker from '../../../common/document-picker/preview-document-picker' +import { PreviewSlice } from '../../../formatted-text/flavours/preview-slice' +import { FormattedText } from '../../../formatted-text/formatted' +import PreviewContainer from '../../../preview/container' +import { PreviewHeader } from '../../../preview/header' + +type PreviewPanelProps = { + // State + isMobile: boolean + dataSourceType: DataSourceType + currentDocForm: ChunkingMode + estimate?: FileIndexingEstimateResponse + parentChildConfig: ParentChildConfig + isSetting?: boolean + // Picker + pickerFiles: Array<{ id: string, name: string, extension: string }> + pickerValue: { id: string, name: string, extension: string } + // Mutation state + isIdle: boolean + isPending: boolean + // Actions + onPickerChange: (selected: { id: string, name: string }) => void +} + +export const 
PreviewPanel: FC = ({ + isMobile, + dataSourceType: _dataSourceType, + currentDocForm, + estimate, + parentChildConfig, + isSetting, + pickerFiles, + pickerValue, + isIdle, + isPending, + onPickerChange, +}) => { + const { t } = useTranslation() + + return ( + + +
+ >} + onChange={onPickerChange} + value={isSetting ? pickerFiles[0] : pickerValue} + /> + {currentDocForm !== ChunkingMode.qa && ( + + )} +
+ + )} + className={cn('relative flex h-full w-1/2 shrink-0 p-4 pr-0', isMobile && 'w-full max-w-[524px]')} + mainClassName="space-y-6" + > + {/* QA Preview */} + {currentDocForm === ChunkingMode.qa && estimate?.qa_preview && ( + estimate.qa_preview.map((item, index) => ( + + + + )) + )} + + {/* Text Preview */} + {currentDocForm === ChunkingMode.text && estimate?.preview && ( + estimate.preview.map((item, index) => ( + + {item.content} + + )) + )} + + {/* Parent-Child Preview */} + {currentDocForm === ChunkingMode.parentChild && estimate?.preview && ( + estimate.preview.map((item, index) => { + const indexForLabel = index + 1 + const childChunks = parentChildConfig.chunkForContext === 'full-doc' + ? item.child_chunks.slice(0, FULL_DOC_PREVIEW_LENGTH) + : item.child_chunks + return ( + + + {childChunks.map((child, childIndex) => { + const childIndexForLabel = childIndex + 1 + return ( + + ) + })} + + + ) + }) + )} + + {/* Idle State */} + {isIdle && ( +
+
+ +

+ {t('stepTwo.previewChunkTip', { ns: 'datasetCreation' })} +

+
+
+ )} + + {/* Loading State */} + {isPending && ( +
+ {Array.from({ length: 10 }, (_, i) => ( + + + + + + + + + + + ))} +
+ )} +
+
+ ) +} diff --git a/web/app/components/datasets/create/step-two/components/step-two-footer.tsx b/web/app/components/datasets/create/step-two/components/step-two-footer.tsx new file mode 100644 index 0000000000..a22be64a75 --- /dev/null +++ b/web/app/components/datasets/create/step-two/components/step-two-footer.tsx @@ -0,0 +1,58 @@ +'use client' + +import type { FC } from 'react' +import { RiArrowLeftLine } from '@remixicon/react' +import { useTranslation } from 'react-i18next' +import Button from '@/app/components/base/button' + +type StepTwoFooterProps = { + isSetting?: boolean + isCreating: boolean + onPrevious: () => void + onCreate: () => void + onCancel?: () => void +} + +export const StepTwoFooter: FC = ({ + isSetting, + isCreating, + onPrevious, + onCreate, + onCancel, +}) => { + const { t } = useTranslation() + + if (!isSetting) { + return ( +
+ + +
+ ) + } + + return ( +
+ + +
+ ) +} diff --git a/web/app/components/datasets/create/step-two/escape.ts b/web/app/components/datasets/create/step-two/hooks/escape.ts similarity index 100% rename from web/app/components/datasets/create/step-two/escape.ts rename to web/app/components/datasets/create/step-two/hooks/escape.ts diff --git a/web/app/components/datasets/create/step-two/hooks/index.ts b/web/app/components/datasets/create/step-two/hooks/index.ts new file mode 100644 index 0000000000..f16daaaea5 --- /dev/null +++ b/web/app/components/datasets/create/step-two/hooks/index.ts @@ -0,0 +1,14 @@ +export { useDocumentCreation } from './use-document-creation' +export type { DocumentCreation, ValidationParams } from './use-document-creation' + +export { IndexingType, useIndexingConfig } from './use-indexing-config' +export type { IndexingConfig } from './use-indexing-config' + +export { useIndexingEstimate } from './use-indexing-estimate' +export type { IndexingEstimate } from './use-indexing-estimate' + +export { usePreviewState } from './use-preview-state' +export type { PreviewState } from './use-preview-state' + +export { DEFAULT_MAXIMUM_CHUNK_LENGTH, DEFAULT_OVERLAP, DEFAULT_SEGMENT_IDENTIFIER, defaultParentChildConfig, MAXIMUM_CHUNK_TOKEN_LENGTH, useSegmentationState } from './use-segmentation-state' +export type { ParentChildConfig, SegmentationState } from './use-segmentation-state' diff --git a/web/app/components/datasets/create/step-two/unescape.ts b/web/app/components/datasets/create/step-two/hooks/unescape.ts similarity index 100% rename from web/app/components/datasets/create/step-two/unescape.ts rename to web/app/components/datasets/create/step-two/hooks/unescape.ts diff --git a/web/app/components/datasets/create/step-two/hooks/use-document-creation.ts b/web/app/components/datasets/create/step-two/hooks/use-document-creation.ts new file mode 100644 index 0000000000..fd132b38ef --- /dev/null +++ b/web/app/components/datasets/create/step-two/hooks/use-document-creation.ts @@ -0,0 
+1,279 @@ +import type { DefaultModel, Model } from '@/app/components/header/account-setting/model-provider-page/declarations' +import type { NotionPage } from '@/models/common' +import type { + ChunkingMode, + CrawlOptions, + CrawlResultItem, + CreateDocumentReq, + createDocumentResponse, + CustomFile, + FullDocumentDetail, + ProcessRule, +} from '@/models/datasets' +import type { RetrievalConfig, RETRIEVE_METHOD } from '@/types/app' +import { useCallback } from 'react' +import { useTranslation } from 'react-i18next' +import { trackEvent } from '@/app/components/base/amplitude' +import Toast from '@/app/components/base/toast' +import { isReRankModelSelected } from '@/app/components/datasets/common/check-rerank-model' +import { DataSourceProvider } from '@/models/common' +import { + DataSourceType, +} from '@/models/datasets' +import { getNotionInfo, getWebsiteInfo, useCreateDocument, useCreateFirstDocument } from '@/service/knowledge/use-create-dataset' +import { useInvalidDatasetList } from '@/service/knowledge/use-dataset' +import { IndexingType } from './use-indexing-config' +import { MAXIMUM_CHUNK_TOKEN_LENGTH } from './use-segmentation-state' + +export type UseDocumentCreationOptions = { + datasetId?: string + isSetting?: boolean + documentDetail?: FullDocumentDetail + dataSourceType: DataSourceType + files: CustomFile[] + notionPages: NotionPage[] + notionCredentialId: string + websitePages: CrawlResultItem[] + crawlOptions?: CrawlOptions + websiteCrawlProvider?: DataSourceProvider + websiteCrawlJobId?: string + // Callbacks + onStepChange?: (delta: number) => void + updateIndexingTypeCache?: (type: string) => void + updateResultCache?: (res: createDocumentResponse) => void + updateRetrievalMethodCache?: (method: RETRIEVE_METHOD | '') => void + onSave?: () => void + mutateDatasetRes?: () => void +} + +export type ValidationParams = { + segmentationType: string + maxChunkLength: number + limitMaxChunkLength: number + overlap: number + indexType: IndexingType 
+ embeddingModel: DefaultModel + rerankModelList: Model[] + retrievalConfig: RetrievalConfig +} + +export const useDocumentCreation = (options: UseDocumentCreationOptions) => { + const { t } = useTranslation() + const { + datasetId, + isSetting, + documentDetail, + dataSourceType, + files, + notionPages, + notionCredentialId, + websitePages, + crawlOptions, + websiteCrawlProvider = DataSourceProvider.jinaReader, + websiteCrawlJobId = '', + onStepChange, + updateIndexingTypeCache, + updateResultCache, + updateRetrievalMethodCache, + onSave, + mutateDatasetRes, + } = options + + const createFirstDocumentMutation = useCreateFirstDocument() + const createDocumentMutation = useCreateDocument(datasetId!) + const invalidDatasetList = useInvalidDatasetList() + + const isCreating = createFirstDocumentMutation.isPending || createDocumentMutation.isPending + + // Validate creation params + const validateParams = useCallback((params: ValidationParams): boolean => { + const { + segmentationType, + maxChunkLength, + limitMaxChunkLength, + overlap, + indexType, + embeddingModel, + rerankModelList, + retrievalConfig, + } = params + + if (segmentationType === 'general' && overlap > maxChunkLength) { + Toast.notify({ type: 'error', message: t('stepTwo.overlapCheck', { ns: 'datasetCreation' }) }) + return false + } + + if (segmentationType === 'general' && maxChunkLength > limitMaxChunkLength) { + Toast.notify({ + type: 'error', + message: t('stepTwo.maxLengthCheck', { ns: 'datasetCreation', limit: limitMaxChunkLength }), + }) + return false + } + + if (!isSetting) { + if (indexType === IndexingType.QUALIFIED && (!embeddingModel.model || !embeddingModel.provider)) { + Toast.notify({ + type: 'error', + message: t('datasetConfig.embeddingModelRequired', { ns: 'appDebug' }), + }) + return false + } + + if (!isReRankModelSelected({ + rerankModelList, + retrievalConfig, + indexMethod: indexType, + })) { + Toast.notify({ type: 'error', message: t('datasetConfig.rerankModelRequired', { ns: 
'appDebug' }) }) + return false + } + } + + return true + }, [t, isSetting]) + + // Build creation params + const buildCreationParams = useCallback(( + currentDocForm: ChunkingMode, + docLanguage: string, + processRule: ProcessRule, + retrievalConfig: RetrievalConfig, + embeddingModel: DefaultModel, + indexingTechnique: string, + ): CreateDocumentReq | null => { + if (isSetting) { + return { + original_document_id: documentDetail?.id, + doc_form: currentDocForm, + doc_language: docLanguage, + process_rule: processRule, + retrieval_model: retrievalConfig, + embedding_model: embeddingModel.model, + embedding_model_provider: embeddingModel.provider, + indexing_technique: indexingTechnique, + } as CreateDocumentReq + } + + const params: CreateDocumentReq = { + data_source: { + type: dataSourceType, + info_list: { + data_source_type: dataSourceType, + }, + }, + indexing_technique: indexingTechnique, + process_rule: processRule, + doc_form: currentDocForm, + doc_language: docLanguage, + retrieval_model: retrievalConfig, + embedding_model: embeddingModel.model, + embedding_model_provider: embeddingModel.provider, + } as CreateDocumentReq + + // Add data source specific info + if (dataSourceType === DataSourceType.FILE) { + params.data_source!.info_list.file_info_list = { + file_ids: files.map(file => file.id || '').filter(Boolean), + } + } + if (dataSourceType === DataSourceType.NOTION) + params.data_source!.info_list.notion_info_list = getNotionInfo(notionPages, notionCredentialId) + + if (dataSourceType === DataSourceType.WEB) { + params.data_source!.info_list.website_info_list = getWebsiteInfo({ + websiteCrawlProvider, + websiteCrawlJobId, + websitePages, + crawlOptions, + }) + } + + return params + }, [ + isSetting, + documentDetail, + dataSourceType, + files, + notionPages, + notionCredentialId, + websitePages, + websiteCrawlProvider, + websiteCrawlJobId, + crawlOptions, + ]) + + // Execute creation + const executeCreation = useCallback(async ( + params: 
CreateDocumentReq, + indexType: IndexingType, + retrievalConfig: RetrievalConfig, + ) => { + if (!datasetId) { + await createFirstDocumentMutation.mutateAsync(params, { + onSuccess(data) { + updateIndexingTypeCache?.(indexType) + updateResultCache?.(data) + updateRetrievalMethodCache?.(retrievalConfig.search_method as RETRIEVE_METHOD) + }, + }) + } + else { + await createDocumentMutation.mutateAsync(params, { + onSuccess(data) { + updateIndexingTypeCache?.(indexType) + updateResultCache?.(data) + updateRetrievalMethodCache?.(retrievalConfig.search_method as RETRIEVE_METHOD) + }, + }) + } + + mutateDatasetRes?.() + invalidDatasetList() + + trackEvent('create_datasets', { + data_source_type: dataSourceType, + indexing_technique: indexType, + }) + + onStepChange?.(+1) + + if (isSetting) + onSave?.() + }, [ + datasetId, + createFirstDocumentMutation, + createDocumentMutation, + updateIndexingTypeCache, + updateResultCache, + updateRetrievalMethodCache, + mutateDatasetRes, + invalidDatasetList, + dataSourceType, + onStepChange, + isSetting, + onSave, + ]) + + // Validate preview params + const validatePreviewParams = useCallback((maxChunkLength: number): boolean => { + if (maxChunkLength > MAXIMUM_CHUNK_TOKEN_LENGTH) { + Toast.notify({ + type: 'error', + message: t('stepTwo.maxLengthCheck', { ns: 'datasetCreation', limit: MAXIMUM_CHUNK_TOKEN_LENGTH }), + }) + return false + } + return true + }, [t]) + + return { + isCreating, + validateParams, + buildCreationParams, + executeCreation, + validatePreviewParams, + } +} + +export type DocumentCreation = ReturnType diff --git a/web/app/components/datasets/create/step-two/hooks/use-indexing-config.ts b/web/app/components/datasets/create/step-two/hooks/use-indexing-config.ts new file mode 100644 index 0000000000..97fc9c260f --- /dev/null +++ b/web/app/components/datasets/create/step-two/hooks/use-indexing-config.ts @@ -0,0 +1,143 @@ +import type { DefaultModel } from 
'@/app/components/header/account-setting/model-provider-page/declarations' +import type { RetrievalConfig } from '@/types/app' +import { useEffect, useMemo, useState } from 'react' +import { checkShowMultiModalTip } from '@/app/components/datasets/settings/utils' +import { ModelTypeEnum } from '@/app/components/header/account-setting/model-provider-page/declarations' +import { useDefaultModel, useModelList, useModelListAndDefaultModelAndCurrentProviderAndModel } from '@/app/components/header/account-setting/model-provider-page/hooks' +import { RETRIEVE_METHOD } from '@/types/app' + +export enum IndexingType { + QUALIFIED = 'high_quality', + ECONOMICAL = 'economy', +} + +const DEFAULT_RETRIEVAL_CONFIG: RetrievalConfig = { + search_method: RETRIEVE_METHOD.semantic, + reranking_enable: false, + reranking_model: { + reranking_provider_name: '', + reranking_model_name: '', + }, + top_k: 3, + score_threshold_enabled: false, + score_threshold: 0.5, +} + +export type UseIndexingConfigOptions = { + initialIndexType?: IndexingType + initialEmbeddingModel?: DefaultModel + initialRetrievalConfig?: RetrievalConfig + isAPIKeySet: boolean + hasSetIndexType: boolean +} + +export const useIndexingConfig = (options: UseIndexingConfigOptions) => { + const { + initialIndexType, + initialEmbeddingModel, + initialRetrievalConfig, + isAPIKeySet, + hasSetIndexType, + } = options + + // Rerank model + const { + modelList: rerankModelList, + defaultModel: rerankDefaultModel, + currentModel: isRerankDefaultModelValid, + } = useModelListAndDefaultModelAndCurrentProviderAndModel(ModelTypeEnum.rerank) + + // Embedding model list + const { data: embeddingModelList } = useModelList(ModelTypeEnum.textEmbedding) + const { data: defaultEmbeddingModel } = useDefaultModel(ModelTypeEnum.textEmbedding) + + // Index type state + const [indexType, setIndexType] = useState(() => { + if (initialIndexType) + return initialIndexType + return isAPIKeySet ? 
IndexingType.QUALIFIED : IndexingType.ECONOMICAL + }) + + // Embedding model state + const [embeddingModel, setEmbeddingModel] = useState( + initialEmbeddingModel ?? { + provider: defaultEmbeddingModel?.provider.provider || '', + model: defaultEmbeddingModel?.model || '', + }, + ) + + // Retrieval config state + const [retrievalConfig, setRetrievalConfig] = useState( + initialRetrievalConfig ?? DEFAULT_RETRIEVAL_CONFIG, + ) + + // Sync retrieval config with rerank model when available + useEffect(() => { + if (initialRetrievalConfig) + return + + setRetrievalConfig({ + search_method: RETRIEVE_METHOD.semantic, + reranking_enable: !!isRerankDefaultModelValid, + reranking_model: { + reranking_provider_name: isRerankDefaultModelValid ? rerankDefaultModel?.provider.provider ?? '' : '', + reranking_model_name: isRerankDefaultModelValid ? rerankDefaultModel?.model ?? '' : '', + }, + top_k: 3, + score_threshold_enabled: false, + score_threshold: 0.5, + }) + }, [rerankDefaultModel, isRerankDefaultModelValid, initialRetrievalConfig]) + + // Sync index type with props + useEffect(() => { + if (initialIndexType) + setIndexType(initialIndexType) + else + setIndexType(isAPIKeySet ? 
IndexingType.QUALIFIED : IndexingType.ECONOMICAL) + }, [isAPIKeySet, initialIndexType]) + + // Show multimodal tip + const showMultiModalTip = useMemo(() => { + return checkShowMultiModalTip({ + embeddingModel, + rerankingEnable: retrievalConfig.reranking_enable, + rerankModel: { + rerankingProviderName: retrievalConfig.reranking_model.reranking_provider_name, + rerankingModelName: retrievalConfig.reranking_model.reranking_model_name, + }, + indexMethod: indexType, + embeddingModelList, + rerankModelList, + }) + }, [embeddingModel, retrievalConfig, indexType, embeddingModelList, rerankModelList]) + + // Get effective indexing technique + const getIndexingTechnique = () => initialIndexType || indexType + + return { + // Index type + indexType, + setIndexType, + hasSetIndexType, + getIndexingTechnique, + + // Embedding model + embeddingModel, + setEmbeddingModel, + embeddingModelList, + defaultEmbeddingModel, + + // Retrieval config + retrievalConfig, + setRetrievalConfig, + rerankModelList, + rerankDefaultModel, + isRerankDefaultModelValid, + + // Computed + showMultiModalTip, + } +} + +export type IndexingConfig = ReturnType diff --git a/web/app/components/datasets/create/step-two/hooks/use-indexing-estimate.ts b/web/app/components/datasets/create/step-two/hooks/use-indexing-estimate.ts new file mode 100644 index 0000000000..cc5a2bcf33 --- /dev/null +++ b/web/app/components/datasets/create/step-two/hooks/use-indexing-estimate.ts @@ -0,0 +1,123 @@ +import type { IndexingType } from './use-indexing-config' +import type { NotionPage } from '@/models/common' +import type { ChunkingMode, CrawlOptions, CrawlResultItem, CustomFile, ProcessRule } from '@/models/datasets' +import { useCallback } from 'react' +import { DataSourceProvider } from '@/models/common' +import { DataSourceType } from '@/models/datasets' +import { + useFetchFileIndexingEstimateForFile, + useFetchFileIndexingEstimateForNotion, + useFetchFileIndexingEstimateForWeb, +} from 
'@/service/knowledge/use-create-dataset' + +export type UseIndexingEstimateOptions = { + dataSourceType: DataSourceType + datasetId?: string + // Document settings + currentDocForm: ChunkingMode + docLanguage: string + // File data source + files: CustomFile[] + previewFileName?: string + // Notion data source + previewNotionPage: NotionPage + notionCredentialId: string + // Website data source + previewWebsitePage: CrawlResultItem + crawlOptions?: CrawlOptions + websiteCrawlProvider?: DataSourceProvider + websiteCrawlJobId?: string + // Processing + indexingTechnique: IndexingType + processRule: ProcessRule +} + +export const useIndexingEstimate = (options: UseIndexingEstimateOptions) => { + const { + dataSourceType, + datasetId, + currentDocForm, + docLanguage, + files, + previewFileName, + previewNotionPage, + notionCredentialId, + previewWebsitePage, + crawlOptions, + websiteCrawlProvider, + websiteCrawlJobId, + indexingTechnique, + processRule, + } = options + + // File indexing estimate + const fileQuery = useFetchFileIndexingEstimateForFile({ + docForm: currentDocForm, + docLanguage, + dataSourceType: DataSourceType.FILE, + files: previewFileName + ? [files.find(file => file.name === previewFileName)!] + : files, + indexingTechnique, + processRule, + dataset_id: datasetId!, + }) + + // Notion indexing estimate + const notionQuery = useFetchFileIndexingEstimateForNotion({ + docForm: currentDocForm, + docLanguage, + dataSourceType: DataSourceType.NOTION, + notionPages: [previewNotionPage], + indexingTechnique, + processRule, + dataset_id: datasetId || '', + credential_id: notionCredentialId, + }) + + // Website indexing estimate + const websiteQuery = useFetchFileIndexingEstimateForWeb({ + docForm: currentDocForm, + docLanguage, + dataSourceType: DataSourceType.WEB, + websitePages: [previewWebsitePage], + crawlOptions, + websiteCrawlProvider: websiteCrawlProvider ?? DataSourceProvider.jinaReader, + websiteCrawlJobId: websiteCrawlJobId ?? 
'', + indexingTechnique, + processRule, + dataset_id: datasetId || '', + }) + + // Get current mutation based on data source type + const getCurrentMutation = useCallback(() => { + if (dataSourceType === DataSourceType.FILE) + return fileQuery + if (dataSourceType === DataSourceType.NOTION) + return notionQuery + return websiteQuery + }, [dataSourceType, fileQuery, notionQuery, websiteQuery]) + + const currentMutation = getCurrentMutation() + + // Trigger estimate fetch + const fetchEstimate = useCallback(() => { + if (dataSourceType === DataSourceType.FILE) + fileQuery.mutate() + else if (dataSourceType === DataSourceType.NOTION) + notionQuery.mutate() + else + websiteQuery.mutate() + }, [dataSourceType, fileQuery, notionQuery, websiteQuery]) + + return { + currentMutation, + estimate: currentMutation.data, + isIdle: currentMutation.isIdle, + isPending: currentMutation.isPending, + fetchEstimate, + reset: currentMutation.reset, + } +} + +export type IndexingEstimate = ReturnType diff --git a/web/app/components/datasets/create/step-two/hooks/use-preview-state.ts b/web/app/components/datasets/create/step-two/hooks/use-preview-state.ts new file mode 100644 index 0000000000..94171c5947 --- /dev/null +++ b/web/app/components/datasets/create/step-two/hooks/use-preview-state.ts @@ -0,0 +1,127 @@ +import type { NotionPage } from '@/models/common' +import type { CrawlResultItem, CustomFile, DocumentItem, FullDocumentDetail } from '@/models/datasets' +import { useCallback, useState } from 'react' +import { DataSourceType } from '@/models/datasets' + +export type UsePreviewStateOptions = { + dataSourceType: DataSourceType + files: CustomFile[] + notionPages: NotionPage[] + websitePages: CrawlResultItem[] + documentDetail?: FullDocumentDetail + datasetId?: string +} + +export const usePreviewState = (options: UsePreviewStateOptions) => { + const { + dataSourceType, + files, + notionPages, + websitePages, + documentDetail, + datasetId, + } = options + + // File preview state + 
const [previewFile, setPreviewFile] = useState( + (datasetId && documentDetail) + ? documentDetail.file + : files[0], + ) + + // Notion page preview state + const [previewNotionPage, setPreviewNotionPage] = useState( + (datasetId && documentDetail) + ? documentDetail.notion_page + : notionPages[0], + ) + + // Website page preview state + const [previewWebsitePage, setPreviewWebsitePage] = useState( + (datasetId && documentDetail) + ? documentDetail.website_page + : websitePages[0], + ) + + // Get preview items for document picker based on data source type + const getPreviewPickerItems = useCallback(() => { + if (dataSourceType === DataSourceType.FILE) { + return files as Array> + } + if (dataSourceType === DataSourceType.NOTION) { + return notionPages.map(page => ({ + id: page.page_id, + name: page.page_name, + extension: 'md', + })) + } + if (dataSourceType === DataSourceType.WEB) { + return websitePages.map(page => ({ + id: page.source_url, + name: page.title, + extension: 'md', + })) + } + return [] + }, [dataSourceType, files, notionPages, websitePages]) + + // Get current preview value for picker + const getPreviewPickerValue = useCallback(() => { + if (dataSourceType === DataSourceType.FILE) { + return previewFile as Required + } + if (dataSourceType === DataSourceType.NOTION) { + return { + id: previewNotionPage?.page_id || '', + name: previewNotionPage?.page_name || '', + extension: 'md', + } + } + if (dataSourceType === DataSourceType.WEB) { + return { + id: previewWebsitePage?.source_url || '', + name: previewWebsitePage?.title || '', + extension: 'md', + } + } + return { id: '', name: '', extension: '' } + }, [dataSourceType, previewFile, previewNotionPage, previewWebsitePage]) + + // Handle preview change + const handlePreviewChange = useCallback((selected: { id: string, name: string }) => { + if (dataSourceType === DataSourceType.FILE) { + setPreviewFile(selected as DocumentItem) + } + else if (dataSourceType === DataSourceType.NOTION) { + const 
selectedPage = notionPages.find(page => page.page_id === selected.id) + if (selectedPage) + setPreviewNotionPage(selectedPage) + } + else if (dataSourceType === DataSourceType.WEB) { + const selectedPage = websitePages.find(page => page.source_url === selected.id) + if (selectedPage) + setPreviewWebsitePage(selectedPage) + } + }, [dataSourceType, notionPages, websitePages]) + + return { + // File preview + previewFile, + setPreviewFile, + + // Notion preview + previewNotionPage, + setPreviewNotionPage, + + // Website preview + previewWebsitePage, + setPreviewWebsitePage, + + // Picker helpers + getPreviewPickerItems, + getPreviewPickerValue, + handlePreviewChange, + } +} + +export type PreviewState = ReturnType diff --git a/web/app/components/datasets/create/step-two/hooks/use-segmentation-state.ts b/web/app/components/datasets/create/step-two/hooks/use-segmentation-state.ts new file mode 100644 index 0000000000..69cc089b4f --- /dev/null +++ b/web/app/components/datasets/create/step-two/hooks/use-segmentation-state.ts @@ -0,0 +1,222 @@ +import type { ParentMode, PreProcessingRule, ProcessRule, Rules } from '@/models/datasets' +import { useCallback, useState } from 'react' +import { ChunkingMode, ProcessMode } from '@/models/datasets' +import escape from './escape' +import unescape from './unescape' + +// Constants +export const DEFAULT_SEGMENT_IDENTIFIER = '\\n\\n' +export const DEFAULT_MAXIMUM_CHUNK_LENGTH = 1024 +export const DEFAULT_OVERLAP = 50 +export const MAXIMUM_CHUNK_TOKEN_LENGTH = Number.parseInt( + globalThis.document?.body?.getAttribute('data-public-indexing-max-segmentation-tokens-length') || '4000', + 10, +) + +export type ParentChildConfig = { + chunkForContext: ParentMode + parent: { + delimiter: string + maxLength: number + } + child: { + delimiter: string + maxLength: number + } +} + +export const defaultParentChildConfig: ParentChildConfig = { + chunkForContext: 'paragraph', + parent: { + delimiter: '\\n\\n', + maxLength: 1024, + }, + child: { + 
delimiter: '\\n', + maxLength: 512, + }, +} + +export type UseSegmentationStateOptions = { + initialSegmentationType?: ProcessMode +} + +export const useSegmentationState = (options: UseSegmentationStateOptions = {}) => { + const { initialSegmentationType } = options + + // Segmentation type (general or parent-child) + const [segmentationType, setSegmentationType] = useState( + initialSegmentationType ?? ProcessMode.general, + ) + + // General chunking settings + const [segmentIdentifier, doSetSegmentIdentifier] = useState(DEFAULT_SEGMENT_IDENTIFIER) + const [maxChunkLength, setMaxChunkLength] = useState(DEFAULT_MAXIMUM_CHUNK_LENGTH) + const [limitMaxChunkLength, setLimitMaxChunkLength] = useState(MAXIMUM_CHUNK_TOKEN_LENGTH) + const [overlap, setOverlap] = useState(DEFAULT_OVERLAP) + + // Pre-processing rules + const [rules, setRules] = useState([]) + const [defaultConfig, setDefaultConfig] = useState() + + // Parent-child config + const [parentChildConfig, setParentChildConfig] = useState(defaultParentChildConfig) + + // Escaped segment identifier setter + const setSegmentIdentifier = useCallback((value: string, canEmpty?: boolean) => { + if (value) { + doSetSegmentIdentifier(escape(value)) + } + else { + doSetSegmentIdentifier(canEmpty ? '' : DEFAULT_SEGMENT_IDENTIFIER) + } + }, []) + + // Rule toggle handler + const toggleRule = useCallback((id: string) => { + setRules(prev => prev.map(rule => + rule.id === id ? { ...rule, enabled: !rule.enabled } : rule, + )) + }, []) + + // Reset to defaults + const resetToDefaults = useCallback(() => { + if (defaultConfig) { + setSegmentIdentifier(defaultConfig.segmentation.separator) + setMaxChunkLength(defaultConfig.segmentation.max_tokens) + setOverlap(defaultConfig.segmentation.chunk_overlap!) 
+ setRules(defaultConfig.pre_processing_rules) + } + setParentChildConfig(defaultParentChildConfig) + }, [defaultConfig, setSegmentIdentifier]) + + // Apply config from document detail + const applyConfigFromRules = useCallback((rulesConfig: Rules, isHierarchical: boolean) => { + const separator = rulesConfig.segmentation.separator + const max = rulesConfig.segmentation.max_tokens + const chunkOverlap = rulesConfig.segmentation.chunk_overlap + + setSegmentIdentifier(separator) + setMaxChunkLength(max) + setOverlap(chunkOverlap!) + setRules(rulesConfig.pre_processing_rules) + setDefaultConfig(rulesConfig) + + if (isHierarchical) { + setParentChildConfig({ + chunkForContext: rulesConfig.parent_mode || 'paragraph', + parent: { + delimiter: escape(rulesConfig.segmentation.separator), + maxLength: rulesConfig.segmentation.max_tokens, + }, + child: { + delimiter: escape(rulesConfig.subchunk_segmentation!.separator), + maxLength: rulesConfig.subchunk_segmentation!.max_tokens, + }, + }) + } + }, [setSegmentIdentifier]) + + // Get process rule for API + const getProcessRule = useCallback((docForm: ChunkingMode): ProcessRule => { + if (docForm === ChunkingMode.parentChild) { + return { + rules: { + pre_processing_rules: rules, + segmentation: { + separator: unescape(parentChildConfig.parent.delimiter), + max_tokens: parentChildConfig.parent.maxLength, + }, + parent_mode: parentChildConfig.chunkForContext, + subchunk_segmentation: { + separator: unescape(parentChildConfig.child.delimiter), + max_tokens: parentChildConfig.child.maxLength, + }, + }, + mode: 'hierarchical', + } as ProcessRule + } + + return { + rules: { + pre_processing_rules: rules, + segmentation: { + separator: unescape(segmentIdentifier), + max_tokens: maxChunkLength, + chunk_overlap: overlap, + }, + }, + mode: segmentationType, + } as ProcessRule + }, [rules, parentChildConfig, segmentIdentifier, maxChunkLength, overlap, segmentationType]) + + // Update parent config field + const updateParentConfig = 
useCallback((field: 'delimiter' | 'maxLength', value: string | number) => { + setParentChildConfig((prev) => { + let newValue: string | number + if (field === 'delimiter') + newValue = value ? escape(value as string) : '' + else + newValue = value + return { + ...prev, + parent: { ...prev.parent, [field]: newValue }, + } + }) + }, []) + + // Update child config field + const updateChildConfig = useCallback((field: 'delimiter' | 'maxLength', value: string | number) => { + setParentChildConfig((prev) => { + let newValue: string | number + if (field === 'delimiter') + newValue = value ? escape(value as string) : '' + else + newValue = value + return { + ...prev, + child: { ...prev.child, [field]: newValue }, + } + }) + }, []) + + // Set chunk for context mode + const setChunkForContext = useCallback((mode: ParentMode) => { + setParentChildConfig(prev => ({ ...prev, chunkForContext: mode })) + }, []) + + return { + // General chunking state + segmentationType, + setSegmentationType, + segmentIdentifier, + setSegmentIdentifier, + maxChunkLength, + setMaxChunkLength, + limitMaxChunkLength, + setLimitMaxChunkLength, + overlap, + setOverlap, + + // Rules + rules, + setRules, + defaultConfig, + setDefaultConfig, + toggleRule, + + // Parent-child config + parentChildConfig, + setParentChildConfig, + updateParentConfig, + updateChildConfig, + setChunkForContext, + + // Actions + resetToDefaults, + applyConfigFromRules, + getProcessRule, + } +} + +export type SegmentationState = ReturnType diff --git a/web/app/components/datasets/create/step-two/index.spec.tsx b/web/app/components/datasets/create/step-two/index.spec.tsx new file mode 100644 index 0000000000..7145920f60 --- /dev/null +++ b/web/app/components/datasets/create/step-two/index.spec.tsx @@ -0,0 +1,2197 @@ +import type { Model } from '@/app/components/header/account-setting/model-provider-page/declarations' +import type { DataSourceProvider, NotionPage } from '@/models/common' +import type { + CrawlOptions, + 
CrawlResultItem, + CustomFile, + FileIndexingEstimateResponse, + FullDocumentDetail, + PreProcessingRule, + Rules, +} from '@/models/datasets' +import type { RetrievalConfig } from '@/types/app' +import { act, fireEvent, render, renderHook, screen } from '@testing-library/react' +import { ConfigurationMethodEnum, ModelStatusEnum, ModelTypeEnum } from '@/app/components/header/account-setting/model-provider-page/declarations' +import { ChunkingMode, DataSourceType, ProcessMode } from '@/models/datasets' +import { RETRIEVE_METHOD } from '@/types/app' +import { PreviewPanel } from './components/preview-panel' +import { StepTwoFooter } from './components/step-two-footer' +import { + DEFAULT_MAXIMUM_CHUNK_LENGTH, + DEFAULT_OVERLAP, + DEFAULT_SEGMENT_IDENTIFIER, + defaultParentChildConfig, + IndexingType, + useDocumentCreation, + useIndexingConfig, + useIndexingEstimate, + usePreviewState, + useSegmentationState, +} from './hooks' +import escape from './hooks/escape' +import unescape from './hooks/unescape' + +// ============================================ +// Mock external dependencies +// ============================================ + +// Mock dataset detail context +const mockDataset = { + id: 'test-dataset-id', + doc_form: ChunkingMode.text, + data_source_type: DataSourceType.FILE, + embedding_model: 'text-embedding-ada-002', + embedding_model_provider: 'openai', + retrieval_model_dict: { + search_method: RETRIEVE_METHOD.semantic, + reranking_enable: false, + reranking_model: { reranking_provider_name: '', reranking_model_name: '' }, + top_k: 3, + score_threshold_enabled: false, + score_threshold: 0.5, + } as RetrievalConfig, +} + +let mockCurrentDataset: typeof mockDataset | null = null +const mockMutateDatasetRes = vi.fn() + +vi.mock('@/context/dataset-detail', () => ({ + useDatasetDetailContextWithSelector: (selector: (state: { dataset: typeof mockDataset | null, mutateDatasetRes: () => void }) => unknown) => + selector({ dataset: mockCurrentDataset, 
mutateDatasetRes: mockMutateDatasetRes }), +})) + +// Note: @/context/i18n is globally mocked in vitest.setup.ts, no need to mock here +// Note: @/hooks/use-breakpoints uses real import + +// Mock model hooks +const mockEmbeddingModelList = [ + { provider: 'openai', model: 'text-embedding-ada-002' }, + { provider: 'cohere', model: 'embed-english-v3.0' }, +] +const mockDefaultEmbeddingModel = { provider: { provider: 'openai' }, model: 'text-embedding-ada-002' } +// Model[] type structure for rerank model list (simplified mock) +const mockRerankModelList: Model[] = [{ + provider: 'cohere', + icon_small: { en_US: 'cohere-icon', zh_Hans: 'cohere-icon' }, + label: { en_US: 'Cohere', zh_Hans: 'Cohere' }, + models: [{ + model: 'rerank-english-v3.0', + label: { en_US: 'Rerank English v3.0', zh_Hans: 'Rerank English v3.0' }, + model_type: ModelTypeEnum.rerank, + features: [], + fetch_from: ConfigurationMethodEnum.predefinedModel, + status: ModelStatusEnum.active, + model_properties: {}, + load_balancing_enabled: false, + }], + status: ModelStatusEnum.active, +}] +const mockRerankDefaultModel = { provider: { provider: 'cohere' }, model: 'rerank-english-v3.0' } +let mockIsRerankDefaultModelValid = true + +vi.mock('@/app/components/header/account-setting/model-provider-page/hooks', () => ({ + useModelListAndDefaultModelAndCurrentProviderAndModel: () => ({ + modelList: mockRerankModelList, + defaultModel: mockRerankDefaultModel, + currentModel: mockIsRerankDefaultModelValid, + }), + useModelList: () => ({ data: mockEmbeddingModelList }), + useDefaultModel: () => ({ data: mockDefaultEmbeddingModel }), +})) + +// Mock service hooks +const mockFetchDefaultProcessRuleMutate = vi.fn() +vi.mock('@/service/knowledge/use-create-dataset', () => ({ + useFetchDefaultProcessRule: ({ onSuccess }: { onSuccess: (data: { rules: Rules, limits: { indexing_max_segmentation_tokens_length: number } }) => void }) => ({ + mutate: (url: string) => { + mockFetchDefaultProcessRuleMutate(url) + 
onSuccess({ + rules: { + segmentation: { separator: '\\n', max_tokens: 500, chunk_overlap: 50 }, + pre_processing_rules: [ + { id: 'remove_extra_spaces', enabled: true }, + { id: 'remove_urls_emails', enabled: false }, + ], + parent_mode: 'paragraph', + subchunk_segmentation: { separator: '\\n', max_tokens: 256 }, + }, + limits: { indexing_max_segmentation_tokens_length: 4000 }, + }) + }, + isPending: false, + }), + useFetchFileIndexingEstimateForFile: () => ({ + mutate: vi.fn(), + data: undefined, + isIdle: true, + isPending: false, + reset: vi.fn(), + }), + useFetchFileIndexingEstimateForNotion: () => ({ + mutate: vi.fn(), + data: undefined, + isIdle: true, + isPending: false, + reset: vi.fn(), + }), + useFetchFileIndexingEstimateForWeb: () => ({ + mutate: vi.fn(), + data: undefined, + isIdle: true, + isPending: false, + reset: vi.fn(), + }), + useCreateFirstDocument: () => ({ + mutateAsync: vi.fn().mockImplementation(async (params: unknown, options?: { onSuccess?: (data: unknown) => void }) => { + const data = { dataset: { id: 'new-dataset-id' } } + options?.onSuccess?.(data) + return data + }), + isPending: false, + }), + useCreateDocument: () => ({ + mutateAsync: vi.fn().mockImplementation(async (params: unknown, options?: { onSuccess?: (data: unknown) => void }) => { + const data = { document: { id: 'new-doc-id' } } + options?.onSuccess?.(data) + return data + }), + isPending: false, + }), + getNotionInfo: vi.fn().mockReturnValue([{ workspace_id: 'ws-1', pages: [{ page_id: 'page-1' }] }]), + getWebsiteInfo: vi.fn().mockReturnValue({ provider: 'jinaReader', job_id: 'job-123', urls: ['https://test.com'] }), +})) + +vi.mock('@/service/knowledge/use-dataset', () => ({ + useInvalidDatasetList: () => vi.fn(), +})) + +// Mock amplitude tracking (external service) +vi.mock('@/app/components/base/amplitude', () => ({ + trackEvent: vi.fn(), +})) + +// Note: @/app/components/base/toast - uses real import (base component) +// Note: 
@/app/components/datasets/common/check-rerank-model - uses real import +// Note: @/app/components/base/float-right-container - uses real import (base component) + +// Mock checkShowMultiModalTip - requires complex model list structure +vi.mock('@/app/components/datasets/settings/utils', () => ({ + checkShowMultiModalTip: () => false, +})) + +// ============================================ +// Test data factories +// ============================================ + +const createMockFile = (overrides?: Partial): CustomFile => ({ + id: 'file-1', + name: 'test-file.pdf', + extension: 'pdf', + size: 1024, + type: 'application/pdf', + lastModified: Date.now(), + ...overrides, +} as CustomFile) + +const createMockNotionPage = (overrides?: Partial): NotionPage => ({ + page_id: 'notion-page-1', + page_name: 'Test Notion Page', + page_icon: null, + type: 'page', + ...overrides, +} as NotionPage) + +const createMockWebsitePage = (overrides?: Partial): CrawlResultItem => ({ + source_url: 'https://example.com/page1', + title: 'Test Website Page', + description: 'Test description', + markdown: '# Test Content', + ...overrides, +} as CrawlResultItem) + +const createMockDocumentDetail = (overrides?: Partial): FullDocumentDetail => ({ + id: 'doc-1', + doc_form: ChunkingMode.text, + doc_language: 'English', + file: { id: 'file-1', name: 'test.pdf', extension: 'pdf' }, + notion_page: createMockNotionPage(), + website_page: createMockWebsitePage(), + dataset_process_rule: { + mode: ProcessMode.general, + rules: { + segmentation: { separator: '\\n\\n', max_tokens: 1024, chunk_overlap: 50 }, + pre_processing_rules: [{ id: 'remove_extra_spaces', enabled: true }], + }, + }, + ...overrides, +} as FullDocumentDetail) + +const createMockRules = (overrides?: Partial): Rules => ({ + segmentation: { separator: '\\n\\n', max_tokens: 1024, chunk_overlap: 50 }, + pre_processing_rules: [ + { id: 'remove_extra_spaces', enabled: true }, + { id: 'remove_urls_emails', enabled: false }, + ], + 
parent_mode: 'paragraph', + subchunk_segmentation: { separator: '\\n', max_tokens: 512 }, + ...overrides, +}) + +const createMockEstimate = (overrides?: Partial): FileIndexingEstimateResponse => ({ + total_segments: 10, + total_nodes: 10, + tokens: 5000, + total_price: 0.01, + currency: 'USD', + qa_preview: [{ question: 'Q1', answer: 'A1' }], + preview: [{ content: 'Chunk 1 content', child_chunks: ['Child 1', 'Child 2'] }], + ...overrides, +}) + +// ============================================ +// Utility Functions Tests (escape/unescape) +// ============================================ + +describe('escape utility', () => { + beforeEach(() => { + vi.clearAllMocks() + }) + + // Tests for escape function + describe('escape function', () => { + it('should return empty string for null/undefined input', () => { + expect(escape(null as unknown as string)).toBe('') + expect(escape(undefined as unknown as string)).toBe('') + expect(escape('')).toBe('') + }) + + it('should escape newline characters', () => { + expect(escape('\n')).toBe('\\n') + expect(escape('\r')).toBe('\\r') + expect(escape('\n\r')).toBe('\\n\\r') + }) + + it('should escape tab characters', () => { + expect(escape('\t')).toBe('\\t') + }) + + it('should escape other special characters', () => { + expect(escape('\0')).toBe('\\0') + expect(escape('\b')).toBe('\\b') + expect(escape('\f')).toBe('\\f') + expect(escape('\v')).toBe('\\v') + }) + + it('should escape single quotes', () => { + expect(escape('\'')).toBe('\\\'') + }) + + it('should handle mixed content', () => { + expect(escape('Hello\nWorld\t!')).toBe('Hello\\nWorld\\t!') + }) + + it('should not escape regular characters', () => { + expect(escape('Hello World')).toBe('Hello World') + expect(escape('abc123')).toBe('abc123') + }) + + it('should return empty string for non-string input', () => { + expect(escape(123 as unknown as string)).toBe('') + expect(escape({} as unknown as string)).toBe('') + }) + }) +}) + +describe('unescape utility', () => { + 
beforeEach(() => { + vi.clearAllMocks() + }) + + // Tests for unescape function + describe('unescape function', () => { + it('should unescape newline characters', () => { + expect(unescape('\\n')).toBe('\n') + expect(unescape('\\r')).toBe('\r') + }) + + it('should unescape tab characters', () => { + expect(unescape('\\t')).toBe('\t') + }) + + it('should unescape other special characters', () => { + expect(unescape('\\0')).toBe('\0') + expect(unescape('\\b')).toBe('\b') + expect(unescape('\\f')).toBe('\f') + expect(unescape('\\v')).toBe('\v') + }) + + it('should unescape single and double quotes', () => { + expect(unescape('\\\'')).toBe('\'') + expect(unescape('\\"')).toBe('"') + }) + + it('should unescape backslash', () => { + expect(unescape('\\\\')).toBe('\\') + }) + + it('should unescape hex sequences', () => { + expect(unescape('\\x41')).toBe('A') // 0x41 = 65 = 'A' + expect(unescape('\\x5A')).toBe('Z') // 0x5A = 90 = 'Z' + }) + + it('should unescape short hex (2-digit) sequences', () => { + // Short hex format: \xNN (2 hexadecimal digits) + expect(unescape('\\xA5')).toBe('¥') // Yen sign + expect(unescape('\\x7F')).toBe('\x7F') // Delete character + expect(unescape('\\x00')).toBe('\x00') // Null character via hex + }) + + it('should unescape octal sequences', () => { + expect(unescape('\\101')).toBe('A') // Octal 101 = 65 = 'A' + expect(unescape('\\132')).toBe('Z') // Octal 132 = 90 = 'Z' + expect(unescape('\\7')).toBe('\x07') // Single digit octal + }) + + it('should unescape unicode sequences', () => { + expect(unescape('\\u0041')).toBe('A') + expect(unescape('\\u{41}')).toBe('A') + }) + + it('should unescape Python-style unicode', () => { + expect(unescape('\\U00000041')).toBe('A') + }) + + it('should handle mixed content', () => { + expect(unescape('Hello\\nWorld\\t!')).toBe('Hello\nWorld\t!') + }) + + it('should not modify regular text', () => { + expect(unescape('Hello World')).toBe('Hello World') + }) + }) +}) + +// 
============================================
+// useSegmentationState Hook Tests
+// ============================================
+
+describe('useSegmentationState', () => {
+  beforeEach(() => {
+    vi.clearAllMocks()
+  })
+
+  // Tests for initial state
+  describe('Initial State', () => {
+    it('should initialize with default values', () => {
+      const { result } = renderHook(() => useSegmentationState())
+
+      expect(result.current.segmentationType).toBe(ProcessMode.general)
+      expect(result.current.segmentIdentifier).toBe(DEFAULT_SEGMENT_IDENTIFIER)
+      expect(result.current.maxChunkLength).toBe(DEFAULT_MAXIMUM_CHUNK_LENGTH)
+      expect(result.current.overlap).toBe(DEFAULT_OVERLAP)
+      expect(result.current.rules).toEqual([])
+      expect(result.current.parentChildConfig).toEqual(defaultParentChildConfig)
+    })
+
+    it('should initialize with custom segmentation type', () => {
+      const { result } = renderHook(() =>
+        useSegmentationState({ initialSegmentationType: ProcessMode.parentChild }),
+      )
+
+      expect(result.current.segmentationType).toBe(ProcessMode.parentChild)
+    })
+  })
+
+  // Tests for state setters
+  describe('State Management', () => {
+    it('should update segmentation type', () => {
+      const { result } = renderHook(() => useSegmentationState())
+
+      act(() => {
+        result.current.setSegmentationType(ProcessMode.parentChild)
+      })
+
+      expect(result.current.segmentationType).toBe(ProcessMode.parentChild)
+    })
+
+    it('should update max chunk length', () => {
+      const { result } = renderHook(() => useSegmentationState())
+
+      act(() => {
+        result.current.setMaxChunkLength(2048)
+      })
+
+      expect(result.current.maxChunkLength).toBe(2048)
+    })
+
+    it('should update overlap', () => {
+      const { result } = renderHook(() => useSegmentationState())
+
+      act(() => {
+        result.current.setOverlap(100)
+      })
+
+      expect(result.current.overlap).toBe(100)
+    })
+
+    it('should update rules', () => {
+      const { result } = renderHook(() => useSegmentationState())
+      const newRules: PreProcessingRule[] = [{ id: 'test', enabled: true }]
+
+      act(() => {
+        result.current.setRules(newRules)
+      })
+
+      expect(result.current.rules).toEqual(newRules)
+    })
+  })
+
+  // Tests for setSegmentIdentifier with escape
+  // The setter stores the escaped form ('\n\n' -> '\\n\\n') and substitutes
+  // DEFAULT_SEGMENT_IDENTIFIER for '' unless canEmpty is passed.
+  describe('setSegmentIdentifier', () => {
+    it('should escape special characters', () => {
+      const { result } = renderHook(() => useSegmentationState())
+
+      act(() => {
+        result.current.setSegmentIdentifier('\n\n')
+      })
+
+      expect(result.current.segmentIdentifier).toBe('\\n\\n')
+    })
+
+    it('should use default when empty and canEmpty is false', () => {
+      const { result } = renderHook(() => useSegmentationState())
+
+      act(() => {
+        result.current.setSegmentIdentifier('')
+      })
+
+      expect(result.current.segmentIdentifier).toBe(DEFAULT_SEGMENT_IDENTIFIER)
+    })
+
+    it('should allow empty when canEmpty is true', () => {
+      const { result } = renderHook(() => useSegmentationState())
+
+      act(() => {
+        result.current.setSegmentIdentifier('', true)
+      })
+
+      expect(result.current.segmentIdentifier).toBe('')
+    })
+  })
+
+  // Tests for toggleRule
+  describe('toggleRule', () => {
+    it('should toggle rule enabled state', () => {
+      const { result } = renderHook(() => useSegmentationState())
+
+      act(() => {
+        result.current.setRules([
+          { id: 'rule1', enabled: true },
+          { id: 'rule2', enabled: false },
+        ])
+      })
+
+      act(() => {
+        result.current.toggleRule('rule1')
+      })
+
+      expect(result.current.rules.find(r => r.id === 'rule1')?.enabled).toBe(false)
+      expect(result.current.rules.find(r => r.id === 'rule2')?.enabled).toBe(false)
+    })
+
+    it('should not affect other rules', () => {
+      const { result } = renderHook(() => useSegmentationState())
+
+      act(() => {
+        result.current.setRules([
+          { id: 'rule1', enabled: true },
+          { id: 'rule2', enabled: false },
+        ])
+      })
+
+      act(() => {
+        result.current.toggleRule('rule2')
+      })
+
+      expect(result.current.rules.find(r => r.id === 'rule1')?.enabled).toBe(true)
+      expect(result.current.rules.find(r => r.id === 'rule2')?.enabled).toBe(true)
+    })
+  })
+
+  // Tests for parent-child config
+  // Delimiters go through the same escape-on-set path as segmentIdentifier.
+  describe('Parent-Child Configuration', () => {
+    it('should update parent config delimiter with truthy value', () => {
+      const { result } = renderHook(() => useSegmentationState())
+
+      act(() => {
+        result.current.updateParentConfig('delimiter', '\n\n\n')
+      })
+
+      expect(result.current.parentChildConfig.parent.delimiter).toBe('\\n\\n\\n')
+    })
+
+    it('should update parent config delimiter with empty value', () => {
+      const { result } = renderHook(() => useSegmentationState())
+
+      act(() => {
+        result.current.updateParentConfig('delimiter', '')
+      })
+
+      expect(result.current.parentChildConfig.parent.delimiter).toBe('')
+    })
+
+    it('should update parent config maxLength', () => {
+      const { result } = renderHook(() => useSegmentationState())
+
+      act(() => {
+        result.current.updateParentConfig('maxLength', 2048)
+      })
+
+      expect(result.current.parentChildConfig.parent.maxLength).toBe(2048)
+    })
+
+    it('should update child config delimiter with truthy value', () => {
+      const { result } = renderHook(() => useSegmentationState())
+
+      act(() => {
+        result.current.updateChildConfig('delimiter', '\n')
+      })
+
+      expect(result.current.parentChildConfig.child.delimiter).toBe('\\n')
+    })
+
+    it('should update child config delimiter with empty value', () => {
+      const { result } = renderHook(() => useSegmentationState())
+
+      act(() => {
+        result.current.updateChildConfig('delimiter', '')
+      })
+
+      expect(result.current.parentChildConfig.child.delimiter).toBe('')
+    })
+
+    it('should update child config maxLength', () => {
+      const { result } = renderHook(() => useSegmentationState())
+
+      act(() => {
+        result.current.updateChildConfig('maxLength', 256)
+      })
+
+      expect(result.current.parentChildConfig.child.maxLength).toBe(256)
+    })
+
+    it('should set chunk for context mode', () => {
+      const { result } = renderHook(() => useSegmentationState())
+
+      act(() => {
+        result.current.setChunkForContext('full-doc')
+      })
+
+      expect(result.current.parentChildConfig.chunkForContext).toBe('full-doc')
+    })
+  })
+
+  // Tests for resetToDefaults
+  describe('resetToDefaults', () => {
+    it('should reset to default config when available', () => {
+      const { result } = renderHook(() => useSegmentationState())
+
+      // Set non-default values and default config
+      act(() => {
+        result.current.setMaxChunkLength(2048)
+        result.current.setOverlap(100)
+        result.current.setDefaultConfig(createMockRules())
+      })
+
+      // Reset - should use default config values
+      act(() => {
+        result.current.resetToDefaults()
+      })
+
+      expect(result.current.maxChunkLength).toBe(1024)
+      expect(result.current.overlap).toBe(50)
+      expect(result.current.parentChildConfig).toEqual(defaultParentChildConfig)
+    })
+
+    it('should only reset parentChildConfig when no default config', () => {
+      const { result } = renderHook(() => useSegmentationState())
+
+      // Set non-default values without setting defaultConfig
+      act(() => {
+        result.current.setMaxChunkLength(2048)
+        result.current.setOverlap(100)
+        result.current.setChunkForContext('full-doc')
+      })
+
+      // Reset - should only reset parentChildConfig since no default config
+      act(() => {
+        result.current.resetToDefaults()
+      })
+
+      // Values stay the same since no defaultConfig
+      expect(result.current.maxChunkLength).toBe(2048)
+      expect(result.current.overlap).toBe(100)
+      // But parentChildConfig is always reset
+      expect(result.current.parentChildConfig).toEqual(defaultParentChildConfig)
+    })
+  })
+
+  // Tests for applyConfigFromRules
+  describe('applyConfigFromRules', () => {
+    it('should apply general config from rules', () => {
+      const { result } = renderHook(() => useSegmentationState())
+      const rules = createMockRules({
+        segmentation: { separator: '---', max_tokens: 512, chunk_overlap: 25 },
+      })
+
+      act(() => {
+        result.current.applyConfigFromRules(rules, false)
+      })
+
+      expect(result.current.maxChunkLength).toBe(512)
+      expect(result.current.overlap).toBe(25)
+    })
+
+    it('should apply hierarchical config from rules', () => {
+      const { result } = renderHook(() => useSegmentationState())
+      const rules = createMockRules({
+        parent_mode: 'paragraph',
+        subchunk_segmentation: { separator: '\n', max_tokens: 256 },
+      })
+
+      act(() => {
+        result.current.applyConfigFromRules(rules, true)
+      })
+
+      expect(result.current.parentChildConfig.chunkForContext).toBe('paragraph')
+      expect(result.current.parentChildConfig.child.maxLength).toBe(256)
+    })
+
+    it('should apply full hierarchical parent-child config from rules', () => {
+      const { result } = renderHook(() => useSegmentationState())
+      const rules = createMockRules({
+        segmentation: { separator: '\n\n', max_tokens: 1024, chunk_overlap: 50 },
+        parent_mode: 'full-doc',
+        subchunk_segmentation: { separator: '\n', max_tokens: 128 },
+      })
+
+      act(() => {
+        result.current.applyConfigFromRules(rules, true)
+      })
+
+      // Should set parent config from segmentation
+      expect(result.current.parentChildConfig.parent.delimiter).toBe('\\n\\n')
+      expect(result.current.parentChildConfig.parent.maxLength).toBe(1024)
+      // Should set child config from subchunk_segmentation
+      expect(result.current.parentChildConfig.child.delimiter).toBe('\\n')
+      expect(result.current.parentChildConfig.child.maxLength).toBe(128)
+      // Should set chunkForContext
+      expect(result.current.parentChildConfig.chunkForContext).toBe('full-doc')
+    })
+  })
+
+  // Tests for getProcessRule
+  describe('getProcessRule', () => {
+    it('should return general process rule', () => {
+      const { result } = renderHook(() => useSegmentationState())
+
+      const processRule = result.current.getProcessRule(ChunkingMode.text)
+
+      expect(processRule.mode).toBe(ProcessMode.general)
+      expect(processRule.rules.segmentation.max_tokens).toBe(DEFAULT_MAXIMUM_CHUNK_LENGTH)
+    })
+
+    it('should return hierarchical process rule for parent-child', () => {
+      const { result } = renderHook(() => useSegmentationState())
+
+      const processRule = result.current.getProcessRule(ChunkingMode.parentChild)
+
+      expect(processRule.mode).toBe('hierarchical')
+      expect(processRule.rules.parent_mode).toBe('paragraph')
+      expect(processRule.rules.subchunk_segmentation).toBeDefined()
+    })
+  })
+})
+
+// ============================================
+// useIndexingConfig Hook Tests
+// ============================================
+
+describe('useIndexingConfig', () => {
+  beforeEach(() => {
+    vi.clearAllMocks()
+    mockIsRerankDefaultModelValid = true
+  })
+
+  // Tests for initial state
+  // Note: Hook has useEffect that syncs state, so we test the state after effects settle
+  describe('Initial State', () => {
+    it('should initialize with QUALIFIED when API key is set', async () => {
+      const { result } = renderHook(() =>
+        useIndexingConfig({ isAPIKeySet: true, hasSetIndexType: false }),
+      )
+
+      // After effects settle, indexType should be QUALIFIED
+      await vi.waitFor(() => {
+        expect(result.current.indexType).toBe(IndexingType.QUALIFIED)
+      })
+    })
+
+    it('should initialize with ECONOMICAL when API key is not set', async () => {
+      const { result } = renderHook(() =>
+        useIndexingConfig({ isAPIKeySet: false, hasSetIndexType: false }),
+      )
+
+      await vi.waitFor(() => {
+        expect(result.current.indexType).toBe(IndexingType.ECONOMICAL)
+      })
+    })
+
+    it('should use initial index type when provided', async () => {
+      const { result } = renderHook(() =>
+        useIndexingConfig({
+          isAPIKeySet: false,
+          hasSetIndexType: true,
+          initialIndexType: IndexingType.QUALIFIED,
+        }),
+      )
+
+      await vi.waitFor(() => {
+        expect(result.current.indexType).toBe(IndexingType.QUALIFIED)
+      })
+    })
+  })
+
+  // Tests for state setters
+  describe('State Management', () => {
+    it('should update index type', async () => {
+      const { result } = renderHook(() =>
+        useIndexingConfig({ isAPIKeySet: true, hasSetIndexType: false }),
+      )
+
+      // Wait for initial effects to settle
+      await vi.waitFor(() => {
+        expect(result.current.indexType).toBeDefined()
+      })
+
+      act(() 
=> {
+        result.current.setIndexType(IndexingType.ECONOMICAL)
+      })
+
+      expect(result.current.indexType).toBe(IndexingType.ECONOMICAL)
+    })
+
+    it('should update embedding model', async () => {
+      const { result } = renderHook(() =>
+        useIndexingConfig({ isAPIKeySet: true, hasSetIndexType: false }),
+      )
+
+      await vi.waitFor(() => {
+        expect(result.current.embeddingModel).toBeDefined()
+      })
+
+      act(() => {
+        result.current.setEmbeddingModel({ provider: 'cohere', model: 'embed-v3' })
+      })
+
+      expect(result.current.embeddingModel).toEqual({ provider: 'cohere', model: 'embed-v3' })
+    })
+
+    it('should update retrieval config', async () => {
+      const { result } = renderHook(() =>
+        useIndexingConfig({ isAPIKeySet: true, hasSetIndexType: false }),
+      )
+
+      await vi.waitFor(() => {
+        expect(result.current.retrievalConfig).toBeDefined()
+      })
+
+      const newConfig: RetrievalConfig = {
+        search_method: RETRIEVE_METHOD.hybrid,
+        reranking_enable: true,
+        reranking_model: { reranking_provider_name: 'cohere', reranking_model_name: 'rerank-v3' },
+        top_k: 5,
+        score_threshold_enabled: true,
+        score_threshold: 0.7,
+      }
+
+      act(() => {
+        result.current.setRetrievalConfig(newConfig)
+      })
+
+      expect(result.current.retrievalConfig).toEqual(newConfig)
+    })
+  })
+
+  // Tests for getIndexingTechnique
+  describe('getIndexingTechnique', () => {
+    it('should return initial type when set', async () => {
+      const { result } = renderHook(() =>
+        useIndexingConfig({
+          isAPIKeySet: true,
+          hasSetIndexType: true,
+          initialIndexType: IndexingType.ECONOMICAL,
+        }),
+      )
+
+      await vi.waitFor(() => {
+        expect(result.current.getIndexingTechnique()).toBe(IndexingType.ECONOMICAL)
+      })
+    })
+
+    it('should return current type when no initial type', async () => {
+      const { result } = renderHook(() =>
+        useIndexingConfig({ isAPIKeySet: true, hasSetIndexType: false }),
+      )
+
+      await vi.waitFor(() => {
+        expect(result.current.indexType).toBeDefined()
+      })
+
+      act(() => {
+        result.current.setIndexType(IndexingType.ECONOMICAL)
+      })
+
+      expect(result.current.getIndexingTechnique()).toBe(IndexingType.ECONOMICAL)
+    })
+  })
+
+  // Tests for initialRetrievalConfig handling
+  describe('initialRetrievalConfig', () => {
+    it('should skip retrieval config sync when initialRetrievalConfig is provided', async () => {
+      const customRetrievalConfig: RetrievalConfig = {
+        search_method: RETRIEVE_METHOD.hybrid,
+        reranking_enable: true,
+        reranking_model: { reranking_provider_name: 'custom', reranking_model_name: 'custom-model' },
+        top_k: 10,
+        score_threshold_enabled: true,
+        score_threshold: 0.8,
+      }
+
+      const { result } = renderHook(() =>
+        useIndexingConfig({
+          isAPIKeySet: true,
+          hasSetIndexType: false,
+          initialRetrievalConfig: customRetrievalConfig,
+        }),
+      )
+
+      await vi.waitFor(() => {
+        expect(result.current.retrievalConfig).toBeDefined()
+      })
+
+      // Should use the provided initial config, not the default synced one
+      expect(result.current.retrievalConfig.search_method).toBe(RETRIEVE_METHOD.hybrid)
+      expect(result.current.retrievalConfig.top_k).toBe(10)
+    })
+  })
+})
+
+// ============================================
+// usePreviewState Hook Tests
+// ============================================
+
+describe('usePreviewState', () => {
+  beforeEach(() => {
+    vi.clearAllMocks()
+  })
+
+  // Shared options: one fixture per data source so each suite can switch
+  // dataSourceType without rebuilding the lists.
+  const defaultOptions = {
+    dataSourceType: DataSourceType.FILE,
+    files: [createMockFile()],
+    notionPages: [createMockNotionPage()],
+    websitePages: [createMockWebsitePage()],
+  }
+
+  // Tests for initial state
+  describe('Initial State', () => {
+    it('should initialize with first file for FILE data source', () => {
+      const { result } = renderHook(() => usePreviewState(defaultOptions))
+
+      expect(result.current.previewFile).toEqual(defaultOptions.files[0])
+    })
+
+    it('should initialize with first notion page for NOTION data source', () => {
+      const { result } = renderHook(() =>
+        usePreviewState({ ...defaultOptions, dataSourceType: DataSourceType.NOTION }),
+      )
+
+      expect(result.current.previewNotionPage).toEqual(defaultOptions.notionPages[0])
+    })
+
+    it('should initialize with document detail when provided', () => {
+      const documentDetail = createMockDocumentDetail()
+      const { result } = renderHook(() =>
+        usePreviewState({
+          ...defaultOptions,
+          documentDetail,
+          datasetId: 'test-id',
+        }),
+      )
+
+      expect(result.current.previewFile).toEqual(documentDetail.file)
+    })
+  })
+
+  // Tests for getPreviewPickerItems
+  describe('getPreviewPickerItems', () => {
+    it('should return files for FILE data source', () => {
+      const { result } = renderHook(() => usePreviewState(defaultOptions))
+
+      const items = result.current.getPreviewPickerItems()
+      expect(items).toEqual(defaultOptions.files)
+    })
+
+    it('should return mapped notion pages for NOTION data source', () => {
+      const { result } = renderHook(() =>
+        usePreviewState({ ...defaultOptions, dataSourceType: DataSourceType.NOTION }),
+      )
+
+      const items = result.current.getPreviewPickerItems()
+      expect(items[0]).toEqual({
+        id: 'notion-page-1',
+        name: 'Test Notion Page',
+        extension: 'md',
+      })
+    })
+
+    it('should return mapped website pages for WEB data source', () => {
+      const { result } = renderHook(() =>
+        usePreviewState({ ...defaultOptions, dataSourceType: DataSourceType.WEB }),
+      )
+
+      const items = result.current.getPreviewPickerItems()
+      expect(items[0]).toEqual({
+        id: 'https://example.com/page1',
+        name: 'Test Website Page',
+        extension: 'md',
+      })
+    })
+
+    it('should return empty array for unknown data source', () => {
+      const { result } = renderHook(() =>
+        usePreviewState({ ...defaultOptions, dataSourceType: 'unknown' as DataSourceType }),
+      )
+
+      const items = result.current.getPreviewPickerItems()
+      expect(items).toEqual([])
+    })
+  })
+
+  // Tests for getPreviewPickerValue
+  describe('getPreviewPickerValue', () => {
+    it('should return file value for FILE data source', () => {
+      const { result } = renderHook(() => usePreviewState(defaultOptions))
+
+      const value = result.current.getPreviewPickerValue()
+      expect(value).toEqual(defaultOptions.files[0])
+    })
+
+    it('should return mapped notion page value for NOTION data source', () => {
+      const notionPage = createMockNotionPage({ page_id: 'page-123', page_name: 'My Page' })
+      const { result } = renderHook(() =>
+        usePreviewState({
+          ...defaultOptions,
+          dataSourceType: DataSourceType.NOTION,
+          notionPages: [notionPage],
+        }),
+      )
+
+      const value = result.current.getPreviewPickerValue()
+      expect(value).toEqual({
+        id: 'page-123',
+        name: 'My Page',
+        extension: 'md',
+      })
+    })
+
+    it('should return mapped website page value for WEB data source', () => {
+      const websitePage = createMockWebsitePage({ source_url: 'https://test.com', title: 'Test Title' })
+      const { result } = renderHook(() =>
+        usePreviewState({
+          ...defaultOptions,
+          dataSourceType: DataSourceType.WEB,
+          websitePages: [websitePage],
+        }),
+      )
+
+      const value = result.current.getPreviewPickerValue()
+      expect(value).toEqual({
+        id: 'https://test.com',
+        name: 'Test Title',
+        extension: 'md',
+      })
+    })
+
+    it('should return empty value for unknown data source', () => {
+      const { result } = renderHook(() =>
+        usePreviewState({ ...defaultOptions, dataSourceType: 'unknown' as DataSourceType }),
+      )
+
+      const value = result.current.getPreviewPickerValue()
+      expect(value).toEqual({ id: '', name: '', extension: '' })
+    })
+
+    it('should handle undefined notion page gracefully', () => {
+      const { result } = renderHook(() =>
+        usePreviewState({
+          ...defaultOptions,
+          dataSourceType: DataSourceType.NOTION,
+          notionPages: [],
+        }),
+      )
+
+      const value = result.current.getPreviewPickerValue()
+      expect(value).toEqual({
+        id: '',
+        name: '',
+        extension: 'md',
+      })
+    })
+
+    it('should handle undefined website page gracefully', () => {
+      const { result } = renderHook(() =>
+        usePreviewState({
+          ...defaultOptions,
+          dataSourceType: DataSourceType.WEB,
+          websitePages: [],
+        }),
+      )
+
+      const value = result.current.getPreviewPickerValue()
+      expect(value).toEqual({
+        id: '',
+        name: '',
+        extension: 'md',
+      })
+    })
+  })
+
+  // Tests for handlePreviewChange
+  describe('handlePreviewChange', () => {
+    it('should update preview file for FILE data source', () => {
+      const files = [createMockFile(), createMockFile({ id: 'file-2', name: 'second.pdf' })]
+      const { result } = renderHook(() =>
+        usePreviewState({ ...defaultOptions, files }),
+      )
+
+      act(() => {
+        result.current.handlePreviewChange({ id: 'file-2', name: 'second.pdf' })
+      })
+
+      expect(result.current.previewFile).toEqual({ id: 'file-2', name: 'second.pdf' })
+    })
+
+    it('should update preview notion page for NOTION data source', () => {
+      const notionPages = [
+        createMockNotionPage(),
+        createMockNotionPage({ page_id: 'notion-page-2', page_name: 'Second Page' }),
+      ]
+      const { result } = renderHook(() =>
+        usePreviewState({ ...defaultOptions, dataSourceType: DataSourceType.NOTION, notionPages }),
+      )
+
+      act(() => {
+        result.current.handlePreviewChange({ id: 'notion-page-2', name: 'Second Page' })
+      })
+
+      expect(result.current.previewNotionPage?.page_id).toBe('notion-page-2')
+    })
+
+    it('should update preview website page for WEB data source', () => {
+      const websitePages = [
+        createMockWebsitePage(),
+        createMockWebsitePage({ source_url: 'https://example.com/page2', title: 'Second Page' }),
+      ]
+      const { result } = renderHook(() =>
+        usePreviewState({ ...defaultOptions, dataSourceType: DataSourceType.WEB, websitePages }),
+      )
+
+      act(() => {
+        result.current.handlePreviewChange({ id: 'https://example.com/page2', name: 'Second Page' })
+      })
+
+      expect(result.current.previewWebsitePage?.source_url).toBe('https://example.com/page2')
+    })
+  })
+})
+
+// ============================================
+// useDocumentCreation Hook Tests
+// ============================================
+
+describe('useDocumentCreation', () => {
+  beforeEach(() => {
+    vi.clearAllMocks()
+  })
+
+  const defaultOptions = {
+    dataSourceType: DataSourceType.FILE,
+    files: [createMockFile()],
+    notionPages: [] as NotionPage[],
+    notionCredentialId: '',
+    websitePages: [] as CrawlResultItem[],
+  }
+
+  // Tests for validateParams
+  describe('validateParams', () => {
+    it('should return false when overlap exceeds max chunk length', () => {
+      const { result } = renderHook(() => useDocumentCreation(defaultOptions))
+
+      const isValid = result.current.validateParams({
+        segmentationType: 'general',
+        maxChunkLength: 100,
+        limitMaxChunkLength: 4000,
+        overlap: 200,
+        indexType: IndexingType.QUALIFIED,
+        embeddingModel: { provider: 'openai', model: 'text-embedding-ada-002' },
+        rerankModelList: [],
+        retrievalConfig: {
+          search_method: RETRIEVE_METHOD.semantic,
+          reranking_enable: false,
+          reranking_model: { reranking_provider_name: '', reranking_model_name: '' },
+          top_k: 3,
+          score_threshold_enabled: false,
+          score_threshold: 0.5,
+        },
+      })
+
+      expect(isValid).toBe(false)
+    })
+
+    it('should return false when max chunk length exceeds limit', () => {
+      const { result } = renderHook(() => useDocumentCreation(defaultOptions))
+
+      const isValid = result.current.validateParams({
+        segmentationType: 'general',
+        maxChunkLength: 5000,
+        limitMaxChunkLength: 4000,
+        overlap: 50,
+        indexType: IndexingType.QUALIFIED,
+        embeddingModel: { provider: 'openai', model: 'text-embedding-ada-002' },
+        rerankModelList: [],
+        retrievalConfig: {
+          search_method: RETRIEVE_METHOD.semantic,
+          reranking_enable: false,
+          reranking_model: { reranking_provider_name: '', reranking_model_name: '' },
+          top_k: 3,
+          score_threshold_enabled: false,
+          score_threshold: 0.5,
+        },
+      })
+
+      expect(isValid).toBe(false)
+    })
+
+    it('should return true for valid params', () => {
+      const { result } = renderHook(() => useDocumentCreation(defaultOptions))
+
+      const isValid = result.current.validateParams({
+        segmentationType: 'general',
+        maxChunkLength: 1000,
+        limitMaxChunkLength: 4000,
+        overlap: 50,
+        indexType: IndexingType.QUALIFIED,
+        embeddingModel: { provider: 'openai', model: 'text-embedding-ada-002' },
+        rerankModelList: [],
+        retrievalConfig: {
+          search_method: RETRIEVE_METHOD.semantic,
+          reranking_enable: false,
+          reranking_model: { reranking_provider_name: '', reranking_model_name: '' },
+          top_k: 3,
+          score_threshold_enabled: false,
+          score_threshold: 0.5,
+        },
+      })
+
+      expect(isValid).toBe(true)
+    })
+  })
+
+  // Tests for buildCreationParams
+  describe('buildCreationParams', () => {
+    it('should build params for file upload', () => {
+      const { result } = renderHook(() => useDocumentCreation(defaultOptions))
+
+      const params = result.current.buildCreationParams(
+        ChunkingMode.text,
+        'English',
+        { mode: ProcessMode.general, rules: createMockRules() },
+        {
+          search_method: RETRIEVE_METHOD.semantic,
+          reranking_enable: false,
+          reranking_model: { reranking_provider_name: '', reranking_model_name: '' },
+          top_k: 3,
+          score_threshold_enabled: false,
+          score_threshold: 0.5,
+        },
+        { provider: 'openai', model: 'text-embedding-ada-002' },
+        IndexingType.QUALIFIED,
+      )
+
+      expect(params).toBeDefined()
+      expect(params?.doc_form).toBe(ChunkingMode.text)
+      expect(params?.doc_language).toBe('English')
+      expect(params?.data_source?.type).toBe(DataSourceType.FILE)
+    })
+
+    it('should build params for setting mode', () => {
+      const documentDetail = createMockDocumentDetail()
+      const { result } = renderHook(() =>
+        useDocumentCreation({
+          ...defaultOptions,
+          isSetting: true,
+          documentDetail,
+        }),
+      )
+
+      const params = result.current.buildCreationParams(
+        ChunkingMode.text,
+        'English',
+        { mode: ProcessMode.general, rules: createMockRules() },
+        {
+          search_method: RETRIEVE_METHOD.semantic,
+          reranking_enable: false,
+          reranking_model: { reranking_provider_name: '', reranking_model_name: '' },
+          top_k: 3,
+          score_threshold_enabled: false,
+          score_threshold: 0.5,
+        },
+        { provider: 'openai', model: 'text-embedding-ada-002' },
+        IndexingType.QUALIFIED,
+      )
+
+      expect(params?.original_document_id).toBe(documentDetail.id)
+    })
+
+    it('should build params for notion_import data source', () => {
+      const { result } = renderHook(() =>
+        useDocumentCreation({
+          ...defaultOptions,
+          dataSourceType: DataSourceType.NOTION,
+          notionPages: [createMockNotionPage()],
+          notionCredentialId: 'notion-cred-123',
+        }),
+      )
+
+      const params = result.current.buildCreationParams(
+        ChunkingMode.text,
+        'English',
+        { mode: ProcessMode.general, rules: createMockRules() },
+        {
+          search_method: RETRIEVE_METHOD.semantic,
+          reranking_enable: false,
+          reranking_model: { reranking_provider_name: '', reranking_model_name: '' },
+          top_k: 3,
+          score_threshold_enabled: false,
+          score_threshold: 0.5,
+        },
+        { provider: 'openai', model: 'text-embedding-ada-002' },
+        IndexingType.QUALIFIED,
+      )
+
+      expect(params).toBeDefined()
+      expect(params?.data_source?.type).toBe(DataSourceType.NOTION)
+      expect(params?.data_source?.info_list.notion_info_list).toBeDefined()
+    })
+
+    it('should build params for website_crawl data source', () => {
+      const { result } = renderHook(() =>
+        useDocumentCreation({
+          ...defaultOptions,
+          dataSourceType: DataSourceType.WEB,
+          websitePages: [createMockWebsitePage()],
+          websiteCrawlProvider: 'jinaReader' as DataSourceProvider,
+          websiteCrawlJobId: 'job-123',
+          crawlOptions: { max_depth: 2 } as CrawlOptions,
+        }),
+      )
+
+      const params = result.current.buildCreationParams(
+        ChunkingMode.text,
+        'English',
+        { mode: ProcessMode.general, rules: createMockRules() },
+        {
+          search_method: RETRIEVE_METHOD.semantic,
+          reranking_enable: false,
+          reranking_model: { reranking_provider_name: '', reranking_model_name: '' },
+          top_k: 3,
+          score_threshold_enabled: false,
+          score_threshold: 0.5,
+        },
+        { provider: 'openai', model: 'text-embedding-ada-002' },
+        IndexingType.QUALIFIED,
+      )
+
+      expect(params).toBeDefined()
+      expect(params?.data_source?.type).toBe(DataSourceType.WEB)
+      expect(params?.data_source?.info_list.website_info_list).toBeDefined()
+    })
+  })
+
+  // Tests for validateParams edge cases
+  describe('validateParams - additional cases', () => {
+    it('should return false when embedding model is missing for QUALIFIED index type', () => {
+      const { result } = renderHook(() => useDocumentCreation(defaultOptions))
+
+      const isValid = result.current.validateParams({
+        segmentationType: 'general',
+        maxChunkLength: 500,
+        limitMaxChunkLength: 4000,
+        overlap: 50,
+        indexType: IndexingType.QUALIFIED,
+        embeddingModel: { provider: '', model: '' },
+        rerankModelList: mockRerankModelList,
+        retrievalConfig: {
+          search_method: RETRIEVE_METHOD.semantic,
+          reranking_enable: false,
+          reranking_model: { reranking_provider_name: '', reranking_model_name: '' },
+          top_k: 3,
+          score_threshold_enabled: false,
+          score_threshold: 0.5,
+        },
+      })
+
+      expect(isValid).toBe(false)
+    })
+
+    it('should return false when rerank model is required but not selected', () => {
+      const { result } = renderHook(() => useDocumentCreation(defaultOptions))
+
+      // isReRankModelSelected returns false when:
+      // - indexMethod === 'high_quality' (IndexingType.QUALIFIED)
+      // - reranking_enable === true
+      // - rerankModelSelected === false (model not found in list)
+      const isValid = result.current.validateParams({
+        segmentationType: 'general',
+        maxChunkLength: 500,
+        limitMaxChunkLength: 4000,
+        overlap: 50,
+        indexType: IndexingType.QUALIFIED,
+        embeddingModel: { provider: 'openai', model: 'text-embedding-ada-002' },
+        rerankModelList: [], // Empty list means model won't be found
+        retrievalConfig: {
+          search_method: RETRIEVE_METHOD.semantic,
+          reranking_enable: true, // Reranking enabled
+          reranking_model: {
+            reranking_provider_name: 'nonexistent',
+            reranking_model_name: 'nonexistent-model',
+          },
+          top_k: 3,
+          score_threshold_enabled: false,
+          score_threshold: 0.5,
+        },
+      })
+
+      expect(isValid).toBe(false)
+    })
+  })
+
+  // Tests for executeCreation
+  describe('executeCreation', () => {
+    it('should call createFirstDocumentMutation when datasetId is not provided', async () => {
+      const mockOnStepChange = vi.fn()
+      const mockUpdateIndexingTypeCache = vi.fn()
+      const mockUpdateResultCache = vi.fn()
+      const mockUpdateRetrievalMethodCache = vi.fn()
+      const mockOnSave = vi.fn()
+
+      const { result } = renderHook(() =>
+        useDocumentCreation({
+          ...defaultOptions,
+          datasetId: undefined,
+          onStepChange: mockOnStepChange,
+          updateIndexingTypeCache: mockUpdateIndexingTypeCache,
+          updateResultCache: mockUpdateResultCache,
+          updateRetrievalMethodCache: mockUpdateRetrievalMethodCache,
+          onSave: mockOnSave,
+        }),
+      )
+
+      const params = result.current.buildCreationParams(
+        ChunkingMode.text,
+        'English',
+        { mode: ProcessMode.general, rules: createMockRules() },
+        {
+          search_method: RETRIEVE_METHOD.semantic,
+          reranking_enable: false,
+          reranking_model: { reranking_provider_name: '', reranking_model_name: '' },
+          top_k: 3,
+          score_threshold_enabled: false,
+          score_threshold: 0.5,
+        },
+        { provider: 'openai', model: 'text-embedding-ada-002' },
+        IndexingType.QUALIFIED,
+      )
+
+      await act(async () => {
+        await result.current.executeCreation(params!, IndexingType.QUALIFIED, {
+          search_method: RETRIEVE_METHOD.semantic,
+          reranking_enable: false,
+          reranking_model: { reranking_provider_name: '', reranking_model_name: '' },
+          top_k: 3,
+          score_threshold_enabled: false,
+          score_threshold: 0.5,
+        })
+      })
+
+      expect(mockOnStepChange).toHaveBeenCalledWith(1)
+    })
+
+    it('should call createDocumentMutation when datasetId is provided', async () => {
+      const mockOnStepChange = vi.fn()
+      const { result } = renderHook(() =>
+        useDocumentCreation({
+          ...defaultOptions,
+          datasetId: 'existing-dataset-id',
+          onStepChange: mockOnStepChange,
+        }),
+      )
+
+      const params = result.current.buildCreationParams(
+        ChunkingMode.text,
+        'English',
+        { mode: ProcessMode.general, rules: createMockRules() },
+        {
+          search_method: RETRIEVE_METHOD.semantic,
+ 
reranking_enable: false, + reranking_model: { reranking_provider_name: '', reranking_model_name: '' }, + top_k: 3, + score_threshold_enabled: false, + score_threshold: 0.5, + }, + { provider: 'openai', model: 'text-embedding-ada-002' }, + IndexingType.QUALIFIED, + ) + + await act(async () => { + await result.current.executeCreation(params!, IndexingType.QUALIFIED, { + search_method: RETRIEVE_METHOD.semantic, + reranking_enable: false, + reranking_model: { reranking_provider_name: '', reranking_model_name: '' }, + top_k: 3, + score_threshold_enabled: false, + score_threshold: 0.5, + }) + }) + + expect(mockOnStepChange).toHaveBeenCalledWith(1) + }) + + it('should call onSave when in setting mode', async () => { + const mockOnSave = vi.fn() + const documentDetail = createMockDocumentDetail() + const { result } = renderHook(() => + useDocumentCreation({ + ...defaultOptions, + datasetId: 'existing-dataset-id', + isSetting: true, + documentDetail, + onSave: mockOnSave, + }), + ) + + const params = result.current.buildCreationParams( + ChunkingMode.text, + 'English', + { mode: ProcessMode.general, rules: createMockRules() }, + { + search_method: RETRIEVE_METHOD.semantic, + reranking_enable: false, + reranking_model: { reranking_provider_name: '', reranking_model_name: '' }, + top_k: 3, + score_threshold_enabled: false, + score_threshold: 0.5, + }, + { provider: 'openai', model: 'text-embedding-ada-002' }, + IndexingType.QUALIFIED, + ) + + await act(async () => { + await result.current.executeCreation(params!, IndexingType.QUALIFIED, { + search_method: RETRIEVE_METHOD.semantic, + reranking_enable: false, + reranking_model: { reranking_provider_name: '', reranking_model_name: '' }, + top_k: 3, + score_threshold_enabled: false, + score_threshold: 0.5, + }) + }) + + expect(mockOnSave).toHaveBeenCalled() + }) + }) + + // Tests for validatePreviewParams + describe('validatePreviewParams', () => { + it('should return true for valid max chunk length', () => { + const { result } = 
renderHook(() => useDocumentCreation(defaultOptions)) + + const isValid = result.current.validatePreviewParams(1000) + expect(isValid).toBe(true) + }) + + it('should return false when max chunk length exceeds maximum', () => { + const { result } = renderHook(() => useDocumentCreation(defaultOptions)) + + const isValid = result.current.validatePreviewParams(10000) + expect(isValid).toBe(false) + }) + }) +}) + +// ============================================ +// useIndexingEstimate Hook Tests +// ============================================ + +describe('useIndexingEstimate', () => { + beforeEach(() => { + vi.clearAllMocks() + }) + + const defaultOptions = { + dataSourceType: DataSourceType.FILE, + currentDocForm: ChunkingMode.text, + docLanguage: 'English', + files: [createMockFile()], + previewNotionPage: createMockNotionPage(), + notionCredentialId: '', + previewWebsitePage: createMockWebsitePage(), + indexingTechnique: IndexingType.QUALIFIED, + processRule: { mode: ProcessMode.general, rules: createMockRules() }, + } + + // Tests for initial state + describe('Initial State', () => { + it('should initialize with idle state', () => { + const { result } = renderHook(() => useIndexingEstimate(defaultOptions)) + + expect(result.current.isIdle).toBe(true) + expect(result.current.isPending).toBe(false) + expect(result.current.estimate).toBeUndefined() + }) + }) + + // Tests for fetchEstimate + describe('fetchEstimate', () => { + it('should have fetchEstimate function', () => { + const { result } = renderHook(() => useIndexingEstimate(defaultOptions)) + + expect(typeof result.current.fetchEstimate).toBe('function') + }) + + it('should have reset function', () => { + const { result } = renderHook(() => useIndexingEstimate(defaultOptions)) + + expect(typeof result.current.reset).toBe('function') + }) + + it('should call fetchEstimate for FILE data source', () => { + const { result } = renderHook(() => + useIndexingEstimate({ + ...defaultOptions, + dataSourceType: 
DataSourceType.FILE, + previewFileName: 'test-file.pdf', + }), + ) + + act(() => { + result.current.fetchEstimate() + }) + + // fetchEstimate should be callable without error + expect(result.current.fetchEstimate).toBeDefined() + }) + + it('should call fetchEstimate for NOTION data source', () => { + const { result } = renderHook(() => + useIndexingEstimate({ + ...defaultOptions, + dataSourceType: DataSourceType.NOTION, + previewNotionPage: createMockNotionPage(), + notionCredentialId: 'cred-123', + }), + ) + + act(() => { + result.current.fetchEstimate() + }) + + expect(result.current.fetchEstimate).toBeDefined() + }) + + it('should call fetchEstimate for WEB data source', () => { + const { result } = renderHook(() => + useIndexingEstimate({ + ...defaultOptions, + dataSourceType: DataSourceType.WEB, + previewWebsitePage: createMockWebsitePage(), + websiteCrawlProvider: 'jinaReader' as DataSourceProvider, + websiteCrawlJobId: 'job-123', + crawlOptions: { max_depth: 2 } as CrawlOptions, + }), + ) + + act(() => { + result.current.fetchEstimate() + }) + + expect(result.current.fetchEstimate).toBeDefined() + }) + }) + + // Tests for getCurrentMutation based on data source type + describe('Data Source Selection', () => { + it('should use file query for FILE data source', () => { + const { result } = renderHook(() => + useIndexingEstimate({ + ...defaultOptions, + dataSourceType: DataSourceType.FILE, + }), + ) + + expect(result.current.currentMutation).toBeDefined() + expect(result.current.isIdle).toBe(true) + }) + + it('should use notion query for NOTION data source', () => { + const { result } = renderHook(() => + useIndexingEstimate({ + ...defaultOptions, + dataSourceType: DataSourceType.NOTION, + }), + ) + + expect(result.current.currentMutation).toBeDefined() + expect(result.current.isIdle).toBe(true) + }) + + it('should use website query for WEB data source', () => { + const { result } = renderHook(() => + useIndexingEstimate({ + ...defaultOptions, + dataSourceType: 
DataSourceType.WEB, + websiteCrawlProvider: 'jinaReader' as DataSourceProvider, + websiteCrawlJobId: 'job-123', + }), + ) + + expect(result.current.currentMutation).toBeDefined() + expect(result.current.isIdle).toBe(true) + }) + }) +}) + +// ============================================ +// StepTwoFooter Component Tests +// ============================================ + +describe('StepTwoFooter', () => { + beforeEach(() => { + vi.clearAllMocks() + }) + + const defaultProps = { + isSetting: false, + isCreating: false, + onPrevious: vi.fn(), + onCreate: vi.fn(), + onCancel: vi.fn(), + } + + // Tests for rendering + describe('Rendering', () => { + it('should render without crashing', () => { + render() + + // Should render Previous and Next buttons with correct text + expect(screen.getByText(/previousStep/i)).toBeInTheDocument() + expect(screen.getByText(/nextStep/i)).toBeInTheDocument() + }) + + it('should render Previous and Next buttons when not in setting mode', () => { + render() + + expect(screen.getByText(/previousStep/i)).toBeInTheDocument() + expect(screen.getByText(/nextStep/i)).toBeInTheDocument() + }) + + it('should render Save and Cancel buttons when in setting mode', () => { + render() + + expect(screen.getByText(/save/i)).toBeInTheDocument() + expect(screen.getByText(/cancel/i)).toBeInTheDocument() + }) + }) + + // Tests for user interactions + describe('User Interactions', () => { + it('should call onPrevious when Previous button is clicked', () => { + const onPrevious = vi.fn() + render() + + fireEvent.click(screen.getByText(/previousStep/i)) + + expect(onPrevious).toHaveBeenCalledTimes(1) + }) + + it('should call onCreate when Next/Save button is clicked', () => { + const onCreate = vi.fn() + render() + + fireEvent.click(screen.getByText(/nextStep/i)) + + expect(onCreate).toHaveBeenCalledTimes(1) + }) + + it('should call onCancel when Cancel button is clicked in setting mode', () => { + const onCancel = vi.fn() + render() + + 
fireEvent.click(screen.getByText(/cancel/i)) + + expect(onCancel).toHaveBeenCalledTimes(1) + }) + }) + + // Tests for loading state + describe('Loading State', () => { + it('should show loading state on Next button when creating', () => { + render() + + const nextButton = screen.getByText(/nextStep/i).closest('button') + // Button has disabled:btn-disabled class which handles the loading state + expect(nextButton).toHaveClass('disabled:btn-disabled') + }) + + it('should show loading state on Save button when creating in setting mode', () => { + render() + + const saveButton = screen.getByText(/save/i).closest('button') + // Button has disabled:btn-disabled class which handles the loading state + expect(saveButton).toHaveClass('disabled:btn-disabled') + }) + }) +}) + +// ============================================ +// PreviewPanel Component Tests +// ============================================ + +describe('PreviewPanel', () => { + beforeEach(() => { + vi.clearAllMocks() + }) + + const defaultProps = { + isMobile: false, + dataSourceType: DataSourceType.FILE, + currentDocForm: ChunkingMode.text, + estimate: undefined as FileIndexingEstimateResponse | undefined, + parentChildConfig: defaultParentChildConfig, + isSetting: false, + pickerFiles: [{ id: 'file-1', name: 'test.pdf', extension: 'pdf' }], + pickerValue: { id: 'file-1', name: 'test.pdf', extension: 'pdf' }, + isIdle: true, + isPending: false, + onPickerChange: vi.fn(), + } + + // Tests for rendering + describe('Rendering', () => { + it('should render without crashing', () => { + render() + + // Check for the preview header title text + expect(screen.getByText('datasetCreation.stepTwo.preview')).toBeInTheDocument() + }) + + it('should render idle state when isIdle is true', () => { + render() + + expect(screen.getByText(/previewChunkTip/i)).toBeInTheDocument() + }) + + it('should render loading skeleton when isPending is true', () => { + render() + + // Should show skeleton containers + 
expect(screen.queryByText(/previewChunkTip/i)).not.toBeInTheDocument() + }) + }) + + // Tests for different doc forms + describe('Preview Content', () => { + it('should render text preview when docForm is text', () => { + const estimate = createMockEstimate() + render( + , + ) + + expect(screen.getByText('Chunk 1 content')).toBeInTheDocument() + }) + + it('should render QA preview when docForm is qa', () => { + const estimate = createMockEstimate() + render( + , + ) + + expect(screen.getByText('Q1')).toBeInTheDocument() + expect(screen.getByText('A1')).toBeInTheDocument() + }) + + it('should show chunk count badge for non-QA doc form', () => { + const estimate = createMockEstimate({ total_segments: 25 }) + render( + , + ) + + expect(screen.getByText(/25/)).toBeInTheDocument() + }) + + it('should render parent-child preview when docForm is parentChild', () => { + const estimate = createMockEstimate({ + preview: [ + { content: 'Parent chunk content', child_chunks: ['Child 1', 'Child 2', 'Child 3'] }, + ], + }) + render( + , + ) + + // Should render parent chunk label + expect(screen.getByText('Chunk-1')).toBeInTheDocument() + // Should render child chunks + expect(screen.getByText('Child 1')).toBeInTheDocument() + expect(screen.getByText('Child 2')).toBeInTheDocument() + expect(screen.getByText('Child 3')).toBeInTheDocument() + }) + + it('should limit child chunks when chunkForContext is full-doc', () => { + // FULL_DOC_PREVIEW_LENGTH is 50, so we need more than 50 chunks to test the limit + const manyChildChunks = Array.from({ length: 60 }, (_, i) => `ChildChunk${i + 1}`) + const estimate = createMockEstimate({ + preview: [{ content: 'Parent content', child_chunks: manyChildChunks }], + }) + render( + , + ) + + // Should render parent chunk + expect(screen.getByText('Chunk-1')).toBeInTheDocument() + // full-doc mode limits to FULL_DOC_PREVIEW_LENGTH (50) + expect(screen.getByText('ChildChunk1')).toBeInTheDocument() + 
expect(screen.getByText('ChildChunk50')).toBeInTheDocument() + // Should not render beyond the limit + expect(screen.queryByText('ChildChunk51')).not.toBeInTheDocument() + }) + + it('should render multiple parent chunks in parent-child mode', () => { + const estimate = createMockEstimate({ + preview: [ + { content: 'Parent 1', child_chunks: ['P1-C1'] }, + { content: 'Parent 2', child_chunks: ['P2-C1'] }, + ], + }) + render( + , + ) + + expect(screen.getByText('Chunk-1')).toBeInTheDocument() + expect(screen.getByText('Chunk-2')).toBeInTheDocument() + expect(screen.getByText('P1-C1')).toBeInTheDocument() + expect(screen.getByText('P2-C1')).toBeInTheDocument() + }) + }) + + // Tests for picker + describe('Document Picker', () => { + it('should call onPickerChange when document is selected', () => { + const onPickerChange = vi.fn() + render() + + // The picker interaction would be tested through the actual component + expect(onPickerChange).not.toHaveBeenCalled() + }) + }) +}) + +// ============================================ +// Edge Cases Tests +// ============================================ + +describe('Edge Cases', () => { + beforeEach(() => { + vi.clearAllMocks() + }) + + describe('Empty/Null Values', () => { + it('should handle empty files array in usePreviewState', () => { + const { result } = renderHook(() => + usePreviewState({ + dataSourceType: DataSourceType.FILE, + files: [], + notionPages: [], + websitePages: [], + }), + ) + + expect(result.current.previewFile).toBeUndefined() + }) + + it('should handle empty notion pages array', () => { + const { result } = renderHook(() => + usePreviewState({ + dataSourceType: DataSourceType.NOTION, + files: [], + notionPages: [], + websitePages: [], + }), + ) + + expect(result.current.previewNotionPage).toBeUndefined() + }) + + it('should handle empty website pages array', () => { + const { result } = renderHook(() => + usePreviewState({ + dataSourceType: DataSourceType.WEB, + files: [], + notionPages: [], + 
websitePages: [], + }), + ) + + expect(result.current.previewWebsitePage).toBeUndefined() + }) + }) + + describe('Boundary Conditions', () => { + it('should handle very large chunk length', () => { + const { result } = renderHook(() => useSegmentationState()) + + act(() => { + result.current.setMaxChunkLength(999999) + }) + + expect(result.current.maxChunkLength).toBe(999999) + }) + + it('should handle zero overlap', () => { + const { result } = renderHook(() => useSegmentationState()) + + act(() => { + result.current.setOverlap(0) + }) + + expect(result.current.overlap).toBe(0) + }) + + it('should handle special characters in segment identifier', () => { + const { result } = renderHook(() => useSegmentationState()) + + act(() => { + result.current.setSegmentIdentifier('<<>>') + }) + + expect(result.current.segmentIdentifier).toBe('<<>>') + }) + }) + + describe('Callback Stability', () => { + it('should maintain stable setSegmentIdentifier reference', () => { + const { result, rerender } = renderHook(() => useSegmentationState()) + const initialSetter = result.current.setSegmentIdentifier + + rerender() + + expect(result.current.setSegmentIdentifier).toBe(initialSetter) + }) + + it('should maintain stable toggleRule reference', () => { + const { result, rerender } = renderHook(() => useSegmentationState()) + const initialToggle = result.current.toggleRule + + rerender() + + expect(result.current.toggleRule).toBe(initialToggle) + }) + + it('should maintain stable getProcessRule reference', () => { + const { result, rerender } = renderHook(() => useSegmentationState()) + + // Update some state to trigger re-render + act(() => { + result.current.setMaxChunkLength(2048) + }) + + rerender() + + // getProcessRule depends on state, so it may change but should remain a function + expect(typeof result.current.getProcessRule).toBe('function') + }) + }) +}) + +// ============================================ +// Integration Scenarios +// 
============================================ + +describe('Integration Scenarios', () => { + beforeEach(() => { + vi.clearAllMocks() + mockCurrentDataset = null + }) + + describe('Document Creation Flow', () => { + it('should build and validate params for file upload workflow', () => { + const files = [createMockFile()] + + const { result: segResult } = renderHook(() => useSegmentationState()) + const { result: creationResult } = renderHook(() => + useDocumentCreation({ + dataSourceType: DataSourceType.FILE, + files, + notionPages: [], + notionCredentialId: '', + websitePages: [], + }), + ) + + // Build params + const params = creationResult.current.buildCreationParams( + ChunkingMode.text, + 'English', + segResult.current.getProcessRule(ChunkingMode.text), + { + search_method: RETRIEVE_METHOD.semantic, + reranking_enable: false, + reranking_model: { reranking_provider_name: '', reranking_model_name: '' }, + top_k: 3, + score_threshold_enabled: false, + score_threshold: 0.5, + }, + { provider: 'openai', model: 'text-embedding-ada-002' }, + IndexingType.QUALIFIED, + ) + + expect(params).toBeDefined() + expect(params?.data_source?.info_list.file_info_list?.file_ids).toContain('file-1') + }) + + it('should handle parent-child document form', () => { + const { result } = renderHook(() => useSegmentationState()) + + act(() => { + result.current.setSegmentationType(ProcessMode.parentChild) + result.current.setChunkForContext('full-doc') + result.current.updateParentConfig('maxLength', 2048) + result.current.updateChildConfig('maxLength', 512) + }) + + const processRule = result.current.getProcessRule(ChunkingMode.parentChild) + + expect(processRule.mode).toBe('hierarchical') + expect(processRule.rules.parent_mode).toBe('full-doc') + expect(processRule.rules.segmentation.max_tokens).toBe(2048) + expect(processRule.rules.subchunk_segmentation?.max_tokens).toBe(512) + }) + }) + + describe('Preview Flow', () => { + it('should handle preview file change flow', () => { + const 
files = [ + createMockFile({ id: 'file-1', name: 'first.pdf' }), + createMockFile({ id: 'file-2', name: 'second.pdf' }), + ] + + const { result } = renderHook(() => + usePreviewState({ + dataSourceType: DataSourceType.FILE, + files, + notionPages: [], + websitePages: [], + }), + ) + + // Initial state + expect(result.current.getPreviewPickerValue().name).toBe('first.pdf') + + // Change preview + act(() => { + result.current.handlePreviewChange({ id: 'file-2', name: 'second.pdf' }) + }) + + expect(result.current.previewFile).toEqual({ id: 'file-2', name: 'second.pdf' }) + }) + }) + + describe('Escape/Unescape Round Trip', () => { + it('should preserve original string through escape/unescape', () => { + const original = '\n\n' + const escaped = escape(original) + const unescaped = unescape(escaped) + + expect(unescaped).toBe(original) + }) + + it('should handle complex strings without backslashes', () => { + // This string contains control characters but no literal backslashes. + const original = 'Hello\nWorld\t!\r\n' + const escaped = escape(original) + const unescaped = unescape(escaped) + expect(unescaped).toBe(original) + }) + + it('should document behavior for strings with existing backslashes', () => { + // When the original string already contains backslash sequences, + // escape/unescape are not perfectly symmetric because escape() + // does not escape backslashes. + const original = 'Hello\\nWorld' + const escaped = escape(original) + const unescaped = unescape(escaped) + // The unescaped value interprets "\n" as a newline, so it differs from the original. 
+ expect(unescaped).toBe('Hello\nWorld') + expect(unescaped).not.toBe(original) + }) + }) +}) diff --git a/web/app/components/datasets/create/step-two/index.tsx b/web/app/components/datasets/create/step-two/index.tsx index 51b5c15178..b4d2c5f6e9 100644 --- a/web/app/components/datasets/create/step-two/index.tsx +++ b/web/app/components/datasets/create/step-two/index.tsx @@ -1,137 +1,30 @@ 'use client' -import type { FC, PropsWithChildren } from 'react' -import type { DefaultModel } from '@/app/components/header/account-setting/model-provider-page/declarations' -import type { NotionPage } from '@/models/common' -import type { CrawlOptions, CrawlResultItem, CreateDocumentReq, createDocumentResponse, CustomFile, DocumentItem, FullDocumentDetail, ParentMode, PreProcessingRule, ProcessRule, Rules } from '@/models/datasets' -import type { RetrievalConfig } from '@/types/app' -import { - RiAlertFill, - RiArrowLeftLine, - RiSearchEyeLine, -} from '@remixicon/react' -import { noop } from 'es-toolkit/function' -import Image from 'next/image' -import Link from 'next/link' -import { useCallback, useEffect, useMemo, useState } from 'react' -import { useTranslation } from 'react-i18next' -import { trackEvent } from '@/app/components/base/amplitude' -import Badge from '@/app/components/base/badge' -import Button from '@/app/components/base/button' -import Checkbox from '@/app/components/base/checkbox' -import CustomDialog from '@/app/components/base/dialog' -import Divider from '@/app/components/base/divider' -import FloatRightContainer from '@/app/components/base/float-right-container' -import { ParentChildChunk } from '@/app/components/base/icons/src/vender/knowledge' -import { AlertTriangle } from '@/app/components/base/icons/src/vender/solid/alertsAndFeedback' -import RadioCard from '@/app/components/base/radio-card' -import { SkeletonContainer, SkeletonPoint, SkeletonRectangle, SkeletonRow } from '@/app/components/base/skeleton' -import Toast from 
'@/app/components/base/toast' -import Tooltip from '@/app/components/base/tooltip' -import { isReRankModelSelected } from '@/app/components/datasets/common/check-rerank-model' -import EconomicalRetrievalMethodConfig from '@/app/components/datasets/common/economical-retrieval-method-config' -import RetrievalMethodConfig from '@/app/components/datasets/common/retrieval-method-config' -import { ModelTypeEnum } from '@/app/components/header/account-setting/model-provider-page/declarations' -import { useDefaultModel, useModelList, useModelListAndDefaultModelAndCurrentProviderAndModel } from '@/app/components/header/account-setting/model-provider-page/hooks' -import ModelSelector from '@/app/components/header/account-setting/model-provider-page/model-selector' -import { FULL_DOC_PREVIEW_LENGTH, IS_CE_EDITION } from '@/config' +import type { FC } from 'react' +import type { StepTwoProps } from './types' +import { useCallback, useEffect, useState } from 'react' +import { useTranslation } from 'react-i18next' +import Divider from '@/app/components/base/divider' +import Toast from '@/app/components/base/toast' import { useDatasetDetailContextWithSelector } from '@/context/dataset-detail' -import { useDocLink, useLocale } from '@/context/i18n' +import { useLocale } from '@/context/i18n' import useBreakpoints, { MediaType } from '@/hooks/use-breakpoints' import { LanguagesSupported } from '@/i18n-config/language' import { DataSourceProvider } from '@/models/common' -import { ChunkingMode, DataSourceType, ProcessMode } from '@/models/datasets' -import { getNotionInfo, getWebsiteInfo, useCreateDocument, useCreateFirstDocument, useFetchDefaultProcessRule, useFetchFileIndexingEstimateForFile, useFetchFileIndexingEstimateForNotion, useFetchFileIndexingEstimateForWeb } from '@/service/knowledge/use-create-dataset' -import { useInvalidDatasetList } from '@/service/knowledge/use-dataset' -import { RETRIEVE_METHOD } from '@/types/app' +import { ChunkingMode, ProcessMode } from 
'@/models/datasets' +import { useFetchDefaultProcessRule } from '@/service/knowledge/use-create-dataset' import { cn } from '@/utils/classnames' -import { ChunkContainer, QAPreview } from '../../chunk' -import PreviewDocumentPicker from '../../common/document-picker/preview-document-picker' -import { PreviewSlice } from '../../formatted-text/flavours/preview-slice' -import { FormattedText } from '../../formatted-text/formatted' -import PreviewContainer from '../../preview/container' -import { PreviewHeader } from '../../preview/header' -import { checkShowMultiModalTip } from '../../settings/utils' -import FileList from '../assets/file-list-3-fill.svg' -import Note from '../assets/note-mod.svg' -import BlueEffect from '../assets/option-card-effect-blue.svg' -import SettingCog from '../assets/setting-gear-mod.svg' -import { indexMethodIcon } from '../icons' -import escape from './escape' -import s from './index.module.css' -import { DelimiterInput, MaxLengthInput, OverlapInput } from './inputs' -import LanguageSelect from './language-select' -import { OptionCard } from './option-card' -import unescape from './unescape' +import { GeneralChunkingOptions, IndexingModeSection, ParentChildOptions, PreviewPanel, StepTwoFooter } from './components' +import { IndexingType, MAXIMUM_CHUNK_TOKEN_LENGTH, useDocumentCreation, useIndexingConfig, useIndexingEstimate, usePreviewState, useSegmentationState } from './hooks' -const TextLabel: FC = (props) => { - return -} +export { IndexingType } -type StepTwoProps = { - isSetting?: boolean - documentDetail?: FullDocumentDetail - isAPIKeySet: boolean - onSetting: () => void - datasetId?: string - indexingType?: IndexingType - retrievalMethod?: string - dataSourceType: DataSourceType - files: CustomFile[] - notionPages?: NotionPage[] - notionCredentialId: string - websitePages?: CrawlResultItem[] - crawlOptions?: CrawlOptions - websiteCrawlProvider?: DataSourceProvider - websiteCrawlJobId?: string - onStepChange?: (delta: number) => 
void - updateIndexingTypeCache?: (type: string) => void - updateRetrievalMethodCache?: (method: RETRIEVE_METHOD | '') => void - updateResultCache?: (res: createDocumentResponse) => void - onSave?: () => void - onCancel?: () => void -} - -export enum IndexingType { - QUALIFIED = 'high_quality', - ECONOMICAL = 'economy', -} - -const DEFAULT_SEGMENT_IDENTIFIER = '\\n\\n' -const DEFAULT_MAXIMUM_CHUNK_LENGTH = 1024 -const DEFAULT_OVERLAP = 50 -const MAXIMUM_CHUNK_TOKEN_LENGTH = Number.parseInt(globalThis.document?.body?.getAttribute('data-public-indexing-max-segmentation-tokens-length') || '4000', 10) - -type ParentChildConfig = { - chunkForContext: ParentMode - parent: { - delimiter: string - maxLength: number - } - child: { - delimiter: string - maxLength: number - } -} - -const defaultParentChildConfig: ParentChildConfig = { - chunkForContext: 'paragraph', - parent: { - delimiter: '\\n\\n', - maxLength: 1024, - }, - child: { - delimiter: '\\n', - maxLength: 512, - }, -} - -const StepTwo = ({ +const StepTwo: FC = ({ isSetting, documentDetail, isAPIKeySet, datasetId, - indexingType, + indexingType: propsIndexingType, dataSourceType: inCreatePageDataSourceType, files, notionPages = [], @@ -146,1099 +39,238 @@ const StepTwo = ({ onSave, onCancel, updateRetrievalMethodCache, -}: StepTwoProps) => { +}) => { const { t } = useTranslation() - const docLink = useDocLink() const locale = useLocale() - const media = useBreakpoints() - const isMobile = media === MediaType.mobile - - const currentDataset = useDatasetDetailContextWithSelector(state => state.dataset) - const mutateDatasetRes = useDatasetDetailContextWithSelector(state => state.mutateDatasetRes) + const isMobile = useBreakpoints() === MediaType.mobile + const currentDataset = useDatasetDetailContextWithSelector(s => s.dataset) + const mutateDatasetRes = useDatasetDetailContextWithSelector(s => s.mutateDatasetRes) + // Computed flags const isInUpload = Boolean(currentDataset) const isUploadInEmptyDataset = isInUpload 
&& !currentDataset?.doc_form const isNotUploadInEmptyDataset = !isUploadInEmptyDataset const isInInit = !isInUpload && !isSetting - const isInCreatePage = !datasetId || (datasetId && !currentDataset?.data_source_type) - const dataSourceType = isInCreatePage ? inCreatePageDataSourceType : currentDataset?.data_source_type - const [segmentationType, setSegmentationType] = useState( - currentDataset?.doc_form === ChunkingMode.parentChild ? ProcessMode.parentChild : ProcessMode.general, - ) - const [segmentIdentifier, doSetSegmentIdentifier] = useState(DEFAULT_SEGMENT_IDENTIFIER) - const setSegmentIdentifier = useCallback((value: string, canEmpty?: boolean) => { - doSetSegmentIdentifier(value ? escape(value) : (canEmpty ? '' : DEFAULT_SEGMENT_IDENTIFIER)) - }, []) - const [maxChunkLength, setMaxChunkLength] = useState(DEFAULT_MAXIMUM_CHUNK_LENGTH) // default chunk length - const [limitMaxChunkLength, setLimitMaxChunkLength] = useState(MAXIMUM_CHUNK_TOKEN_LENGTH) - const [overlap, setOverlap] = useState(DEFAULT_OVERLAP) - const [rules, setRules] = useState([]) - const [defaultConfig, setDefaultConfig] = useState() - const hasSetIndexType = !!indexingType - const [indexType, setIndexType] = useState(() => { - if (hasSetIndexType) - return indexingType - return isAPIKeySet ? IndexingType.QUALIFIED : IndexingType.ECONOMICAL - }) + const dataSourceType = isInCreatePage ? inCreatePageDataSourceType : (currentDataset?.data_source_type ?? inCreatePageDataSourceType) + const hasSetIndexType = !!propsIndexingType + const isModelAndRetrievalConfigDisabled = !!datasetId && !!currentDataset?.data_source_type - const [previewFile, setPreviewFile] = useState( - (datasetId && documentDetail) - ? documentDetail.file - : files[0], - ) - const [previewNotionPage, setPreviewNotionPage] = useState( - (datasetId && documentDetail) - ? documentDetail.notion_page - : notionPages[0], - ) - - const [previewWebsitePage, setPreviewWebsitePage] = useState( - (datasetId && documentDetail) - ? 
documentDetail.website_page - : websitePages[0], - ) - - // QA Related + // Document form state + const [docForm, setDocForm] = useState((datasetId && documentDetail) ? documentDetail.doc_form as ChunkingMode : ChunkingMode.text) + const [docLanguage, setDocLanguage] = useState(() => (datasetId && documentDetail) ? documentDetail.doc_language : (locale !== LanguagesSupported[1] ? 'English' : 'Chinese Simplified')) const [isQAConfirmDialogOpen, setIsQAConfirmDialogOpen] = useState(false) - const [docForm, setDocForm] = useState( - (datasetId && documentDetail) ? documentDetail.doc_form as ChunkingMode : ChunkingMode.text, - ) - const handleChangeDocform = (value: ChunkingMode) => { - if (value === ChunkingMode.qa && indexType === IndexingType.ECONOMICAL) { - setIsQAConfirmDialogOpen(true) - return - } - if (value === ChunkingMode.parentChild && indexType === IndexingType.ECONOMICAL) - setIndexType(IndexingType.QUALIFIED) - - setDocForm(value) - - if (value === ChunkingMode.parentChild) - setSegmentationType(ProcessMode.parentChild) - else - setSegmentationType(ProcessMode.general) - - // eslint-disable-next-line ts/no-use-before-define - currentEstimateMutation.reset() - } - - const [docLanguage, setDocLanguage] = useState( - (datasetId && documentDetail) ? documentDetail.doc_language : (locale !== LanguagesSupported[1] ? 
'English' : 'Chinese Simplified'), - ) - - const [parentChildConfig, setParentChildConfig] = useState(defaultParentChildConfig) - - const getIndexing_technique = () => indexingType || indexType const currentDocForm = currentDataset?.doc_form || docForm - const getProcessRule = (): ProcessRule => { - if (currentDocForm === ChunkingMode.parentChild) { - return { - rules: { - pre_processing_rules: rules, - segmentation: { - separator: unescape( - parentChildConfig.parent.delimiter, - ), - max_tokens: parentChildConfig.parent.maxLength, - }, - parent_mode: parentChildConfig.chunkForContext, - subchunk_segmentation: { - separator: unescape(parentChildConfig.child.delimiter), - max_tokens: parentChildConfig.child.maxLength, - }, - }, - mode: 'hierarchical', - } as ProcessRule - } - return { - rules: { - pre_processing_rules: rules, - segmentation: { - separator: unescape(segmentIdentifier), - max_tokens: maxChunkLength, - chunk_overlap: overlap, - }, - }, // api will check this. It will be removed after api refactored. - mode: segmentationType, - } as ProcessRule - } - - const fileIndexingEstimateQuery = useFetchFileIndexingEstimateForFile({ - docForm: currentDocForm, - docLanguage, - dataSourceType: DataSourceType.FILE, - files: previewFile - ? [files.find(file => file.name === previewFile.name)!] - : files, - indexingTechnique: getIndexing_technique() as any, - processRule: getProcessRule(), - dataset_id: datasetId!, + // Custom hooks + const segmentation = useSegmentationState({ + initialSegmentationType: currentDataset?.doc_form === ChunkingMode.parentChild ? 
ProcessMode.parentChild : ProcessMode.general, }) - const notionIndexingEstimateQuery = useFetchFileIndexingEstimateForNotion({ - docForm: currentDocForm, - docLanguage, - dataSourceType: DataSourceType.NOTION, - notionPages: [previewNotionPage], - indexingTechnique: getIndexing_technique() as any, - processRule: getProcessRule(), - dataset_id: datasetId || '', - credential_id: notionCredentialId, + const indexing = useIndexingConfig({ + initialIndexType: propsIndexingType, + initialEmbeddingModel: currentDataset?.embedding_model ? { provider: currentDataset.embedding_model_provider, model: currentDataset.embedding_model } : undefined, + initialRetrievalConfig: currentDataset?.retrieval_model_dict, + isAPIKeySet, + hasSetIndexType, }) - - const websiteIndexingEstimateQuery = useFetchFileIndexingEstimateForWeb({ - docForm: currentDocForm, - docLanguage, - dataSourceType: DataSourceType.WEB, - websitePages: [previewWebsitePage], + const preview = usePreviewState({ dataSourceType, files, notionPages, websitePages, documentDetail, datasetId }) + const creation = useDocumentCreation({ + datasetId, + isSetting, + documentDetail, + dataSourceType, + files, + notionPages, + notionCredentialId, + websitePages, crawlOptions, websiteCrawlProvider, websiteCrawlJobId, - indexingTechnique: getIndexing_technique() as any, - processRule: getProcessRule(), - dataset_id: datasetId || '', + onStepChange, + updateIndexingTypeCache, + updateResultCache, + updateRetrievalMethodCache, + onSave, + mutateDatasetRes, + }) + const estimateHook = useIndexingEstimate({ + dataSourceType, + datasetId, + currentDocForm, + docLanguage, + files, + previewFileName: preview.previewFile?.name, + previewNotionPage: preview.previewNotionPage, + notionCredentialId, + previewWebsitePage: preview.previewWebsitePage, + crawlOptions, + websiteCrawlProvider, + websiteCrawlJobId, + indexingTechnique: indexing.getIndexingTechnique() as IndexingType, + processRule: segmentation.getProcessRule(currentDocForm), }) 
- const currentEstimateMutation = dataSourceType === DataSourceType.FILE - ? fileIndexingEstimateQuery - : dataSourceType === DataSourceType.NOTION - ? notionIndexingEstimateQuery - : websiteIndexingEstimateQuery + // Fetch default process rule + const fetchDefaultProcessRuleMutation = useFetchDefaultProcessRule({ + onSuccess(data) { + segmentation.setSegmentIdentifier(data.rules.segmentation.separator) + segmentation.setMaxChunkLength(data.rules.segmentation.max_tokens) + segmentation.setOverlap(data.rules.segmentation.chunk_overlap!) + segmentation.setRules(data.rules.pre_processing_rules) + segmentation.setDefaultConfig(data.rules) + segmentation.setLimitMaxChunkLength(data.limits.indexing_max_segmentation_tokens_length) + }, + }) - const fetchEstimate = useCallback(() => { - if (dataSourceType === DataSourceType.FILE) - fileIndexingEstimateQuery.mutate() - - if (dataSourceType === DataSourceType.NOTION) - notionIndexingEstimateQuery.mutate() - - if (dataSourceType === DataSourceType.WEB) - websiteIndexingEstimateQuery.mutate() - }, [dataSourceType, fileIndexingEstimateQuery, notionIndexingEstimateQuery, websiteIndexingEstimateQuery]) - - const estimate - = dataSourceType === DataSourceType.FILE - ? fileIndexingEstimateQuery.data - : dataSourceType === DataSourceType.NOTION - ? 
notionIndexingEstimateQuery.data - : websiteIndexingEstimateQuery.data - - const getRuleName = (key: string) => { - if (key === 'remove_extra_spaces') - return t('stepTwo.removeExtraSpaces', { ns: 'datasetCreation' }) - - if (key === 'remove_urls_emails') - return t('stepTwo.removeUrlEmails', { ns: 'datasetCreation' }) - - if (key === 'remove_stopwords') - return t('stepTwo.removeStopwords', { ns: 'datasetCreation' }) - } - const ruleChangeHandle = (id: string) => { - const newRules = rules.map((rule) => { - if (rule.id === id) { - return { - id: rule.id, - enabled: !rule.enabled, - } - } - return rule - }) - setRules(newRules) - } - const resetRules = () => { - if (defaultConfig) { - setSegmentIdentifier(defaultConfig.segmentation.separator) - setMaxChunkLength(defaultConfig.segmentation.max_tokens) - setOverlap(defaultConfig.segmentation.chunk_overlap!) - setRules(defaultConfig.pre_processing_rules) + // Event handlers + const handleDocFormChange = useCallback((value: ChunkingMode) => { + if (value === ChunkingMode.qa && indexing.indexType === IndexingType.ECONOMICAL) { + setIsQAConfirmDialogOpen(true) + return } - setParentChildConfig(defaultParentChildConfig) - } + if (value === ChunkingMode.parentChild && indexing.indexType === IndexingType.ECONOMICAL) + indexing.setIndexType(IndexingType.QUALIFIED) + setDocForm(value) + segmentation.setSegmentationType(value === ChunkingMode.parentChild ? 
ProcessMode.parentChild : ProcessMode.general) + estimateHook.reset() + }, [indexing, segmentation, estimateHook]) - const updatePreview = () => { - if (segmentationType === ProcessMode.general && maxChunkLength > MAXIMUM_CHUNK_TOKEN_LENGTH) { + const updatePreview = useCallback(() => { + if (segmentation.segmentationType === ProcessMode.general && segmentation.maxChunkLength > MAXIMUM_CHUNK_TOKEN_LENGTH) { Toast.notify({ type: 'error', message: t('stepTwo.maxLengthCheck', { ns: 'datasetCreation', limit: MAXIMUM_CHUNK_TOKEN_LENGTH }) }) return } - fetchEstimate() - } + estimateHook.fetchEstimate() + }, [segmentation, t, estimateHook]) - const { - modelList: rerankModelList, - defaultModel: rerankDefaultModel, - currentModel: isRerankDefaultModelValid, - } = useModelListAndDefaultModelAndCurrentProviderAndModel(ModelTypeEnum.rerank) - const { data: embeddingModelList } = useModelList(ModelTypeEnum.textEmbedding) - const { data: defaultEmbeddingModel } = useDefaultModel(ModelTypeEnum.textEmbedding) - const [embeddingModel, setEmbeddingModel] = useState( - currentDataset?.embedding_model - ? { - provider: currentDataset.embedding_model_provider, - model: currentDataset.embedding_model, - } - : { - provider: defaultEmbeddingModel?.provider.provider || '', - model: defaultEmbeddingModel?.model || '', - }, - ) - const [retrievalConfig, setRetrievalConfig] = useState(currentDataset?.retrieval_model_dict || { - search_method: RETRIEVE_METHOD.semantic, - reranking_enable: false, - reranking_model: { - reranking_provider_name: '', - reranking_model_name: '', - }, - top_k: 3, - score_threshold_enabled: false, - score_threshold: 0.5, - } as RetrievalConfig) - - useEffect(() => { - if (currentDataset?.retrieval_model_dict) - return - setRetrievalConfig({ - search_method: RETRIEVE_METHOD.semantic, - reranking_enable: !!isRerankDefaultModelValid, - reranking_model: { - reranking_provider_name: isRerankDefaultModelValid ? rerankDefaultModel?.provider.provider ?? 
'' : '', - reranking_model_name: isRerankDefaultModelValid ? rerankDefaultModel?.model ?? '' : '', - }, - top_k: 3, - score_threshold_enabled: false, - score_threshold: 0.5, + const handleCreate = useCallback(async () => { + const isValid = creation.validateParams({ + segmentationType: segmentation.segmentationType, + maxChunkLength: segmentation.maxChunkLength, + limitMaxChunkLength: segmentation.limitMaxChunkLength, + overlap: segmentation.overlap, + indexType: indexing.indexType, + embeddingModel: indexing.embeddingModel, + rerankModelList: indexing.rerankModelList, + retrievalConfig: indexing.retrievalConfig, }) - }, [rerankDefaultModel, isRerankDefaultModelValid]) - - const getCreationParams = () => { - let params - if (segmentationType === ProcessMode.general && overlap > maxChunkLength) { - Toast.notify({ type: 'error', message: t('stepTwo.overlapCheck', { ns: 'datasetCreation' }) }) + if (!isValid) return - } - if (segmentationType === ProcessMode.general && maxChunkLength > limitMaxChunkLength) { - Toast.notify({ type: 'error', message: t('stepTwo.maxLengthCheck', { ns: 'datasetCreation', limit: limitMaxChunkLength }) }) - return - } - if (isSetting) { - params = { - original_document_id: documentDetail?.id, - doc_form: currentDocForm, - doc_language: docLanguage, - process_rule: getProcessRule(), - retrieval_model: retrievalConfig, // Readonly. If want to changed, just go to settings page. 
- embedding_model: embeddingModel.model, // Readonly - embedding_model_provider: embeddingModel.provider, // Readonly - indexing_technique: getIndexing_technique(), - } as CreateDocumentReq - } - else { // create - const indexMethod = getIndexing_technique() - if (indexMethod === IndexingType.QUALIFIED && (!embeddingModel.model || !embeddingModel.provider)) { - Toast.notify({ - type: 'error', - message: t('datasetConfig.embeddingModelRequired', { ns: 'appDebug' }), - }) - return - } - if ( - !isReRankModelSelected({ - rerankModelList, - retrievalConfig, - indexMethod: indexMethod as string, - }) - ) { - Toast.notify({ type: 'error', message: t('datasetConfig.rerankModelRequired', { ns: 'appDebug' }) }) - return - } - params = { - data_source: { - type: dataSourceType, - info_list: { - data_source_type: dataSourceType, - }, - }, - indexing_technique: getIndexing_technique(), - process_rule: getProcessRule(), - doc_form: currentDocForm, - doc_language: docLanguage, - retrieval_model: retrievalConfig, - embedding_model: embeddingModel.model, - embedding_model_provider: embeddingModel.provider, - } as CreateDocumentReq - if (dataSourceType === DataSourceType.FILE) { - params.data_source.info_list.file_info_list = { - file_ids: files.map(file => file.id || '').filter(Boolean), - } - } - if (dataSourceType === DataSourceType.NOTION) - params.data_source.info_list.notion_info_list = getNotionInfo(notionPages, notionCredentialId) - - if (dataSourceType === DataSourceType.WEB) { - params.data_source.info_list.website_info_list = getWebsiteInfo({ - websiteCrawlProvider, - websiteCrawlJobId, - websitePages, - }) - } - } - return params - } - - const fetchDefaultProcessRuleMutation = useFetchDefaultProcessRule({ - onSuccess(data) { - const separator = data.rules.segmentation.separator - setSegmentIdentifier(separator) - setMaxChunkLength(data.rules.segmentation.max_tokens) - setOverlap(data.rules.segmentation.chunk_overlap!) 
- setRules(data.rules.pre_processing_rules) - setDefaultConfig(data.rules) - setLimitMaxChunkLength(data.limits.indexing_max_segmentation_tokens_length) - }, - }) - - const getRulesFromDetail = () => { - if (documentDetail) { - const rules = documentDetail.dataset_process_rule.rules - const separator = rules.segmentation.separator - const max = rules.segmentation.max_tokens - const overlap = rules.segmentation.chunk_overlap - const isHierarchicalDocument = documentDetail.doc_form === ChunkingMode.parentChild - || (rules.parent_mode && rules.subchunk_segmentation) - setSegmentIdentifier(separator) - setMaxChunkLength(max) - setOverlap(overlap!) - setRules(rules.pre_processing_rules) - setDefaultConfig(rules) - - if (isHierarchicalDocument) { - setParentChildConfig({ - chunkForContext: rules.parent_mode || 'paragraph', - parent: { - delimiter: escape(rules.segmentation.separator), - maxLength: rules.segmentation.max_tokens, - }, - child: { - delimiter: escape(rules.subchunk_segmentation.separator), - maxLength: rules.subchunk_segmentation.max_tokens, - }, - }) - } - } - } - - const getDefaultMode = () => { - if (documentDetail) - setSegmentationType(documentDetail.dataset_process_rule.mode) - } - - const createFirstDocumentMutation = useCreateFirstDocument() - const createDocumentMutation = useCreateDocument(datasetId!) 
- - const isCreating = createFirstDocumentMutation.isPending || createDocumentMutation.isPending - const invalidDatasetList = useInvalidDatasetList() - - const createHandle = async () => { - const params = getCreationParams() + const params = creation.buildCreationParams(currentDocForm, docLanguage, segmentation.getProcessRule(currentDocForm), indexing.retrievalConfig, indexing.embeddingModel, indexing.getIndexingTechnique()) if (!params) - return false + return + await creation.executeCreation(params, indexing.indexType, indexing.retrievalConfig) + }, [creation, segmentation, indexing, currentDocForm, docLanguage]) - if (!datasetId) { - await createFirstDocumentMutation.mutateAsync( - params, - { - onSuccess(data) { - updateIndexingTypeCache?.(indexType as string) - updateResultCache?.(data) - updateRetrievalMethodCache?.(retrievalConfig.search_method as RETRIEVE_METHOD) - }, - }, - ) - } - else { - await createDocumentMutation.mutateAsync(params, { - onSuccess(data) { - updateIndexingTypeCache?.(indexType as string) - updateResultCache?.(data) - updateRetrievalMethodCache?.(retrievalConfig.search_method as RETRIEVE_METHOD) - }, - }) - } - if (mutateDatasetRes) - mutateDatasetRes() - invalidDatasetList() - trackEvent('create_datasets', { - data_source_type: dataSourceType, - indexing_technique: getIndexing_technique(), - }) - onStepChange?.(+1) - if (isSetting) - onSave?.() - } + const handlePickerChange = useCallback((selected: { id: string, name: string }) => { + estimateHook.reset() + preview.handlePreviewChange(selected) + estimateHook.fetchEstimate() + }, [estimateHook, preview]) + const handleQAConfirm = useCallback(() => { + setIsQAConfirmDialogOpen(false) + indexing.setIndexType(IndexingType.QUALIFIED) + setDocForm(ChunkingMode.qa) + }, [indexing]) + + // Initialize rules useEffect(() => { - // fetch rules if (!isSetting) { fetchDefaultProcessRuleMutation.mutate('/datasets/process-rule') } - else { - getRulesFromDetail() - getDefaultMode() + else if 
(documentDetail) { + const rules = documentDetail.dataset_process_rule.rules + const isHierarchical = documentDetail.doc_form === ChunkingMode.parentChild || Boolean(rules.parent_mode && rules.subchunk_segmentation) + segmentation.applyConfigFromRules(rules, isHierarchical) + segmentation.setSegmentationType(documentDetail.dataset_process_rule.mode) } + // eslint-disable-next-line react-hooks/exhaustive-deps }, []) - useEffect(() => { - // get indexing type by props - if (indexingType) - setIndexType(indexingType as IndexingType) - else - setIndexType(isAPIKeySet ? IndexingType.QUALIFIED : IndexingType.ECONOMICAL) - }, [isAPIKeySet, indexingType, datasetId]) - - const isModelAndRetrievalConfigDisabled = !!datasetId && !!currentDataset?.data_source_type - - const showMultiModalTip = useMemo(() => { - return checkShowMultiModalTip({ - embeddingModel, - rerankingEnable: retrievalConfig.reranking_enable, - rerankModel: { - rerankingProviderName: retrievalConfig.reranking_model.reranking_provider_name, - rerankingModelName: retrievalConfig.reranking_model.reranking_model_name, - }, - indexMethod: indexType, - embeddingModelList, - rerankModelList, - }) - }, [embeddingModel, retrievalConfig.reranking_enable, retrievalConfig.reranking_model, indexType, embeddingModelList, rerankModelList]) + // Show options conditions + const showGeneralOption = (isInUpload && [ChunkingMode.text, ChunkingMode.qa].includes(currentDataset!.doc_form)) || isUploadInEmptyDataset || isInInit + const showParentChildOption = (isInUpload && currentDataset!.doc_form === ChunkingMode.parentChild) || isUploadInEmptyDataset || isInInit return (
{t('stepTwo.segmentation', { ns: 'datasetCreation' })}
- {((isInUpload && [ChunkingMode.text, ChunkingMode.qa].includes(currentDataset!.doc_form)) - || isUploadInEmptyDataset - || isInInit) - && ( - } - activeHeaderClassName="bg-dataset-option-card-blue-gradient" - description={t('stepTwo.generalTip', { ns: 'datasetCreation' })} - isActive={ - [ChunkingMode.text, ChunkingMode.qa].includes(currentDocForm) - } - onSwitched={() => - handleChangeDocform(ChunkingMode.text)} - actions={( - <> - - - - )} - noHighlight={isInUpload && isNotUploadInEmptyDataset} - > -
-
- setSegmentIdentifier(e.target.value, true)} - /> - - -
-
-
-
- {t('stepTwo.rules', { ns: 'datasetCreation' })} -
- -
-
- {rules.map(rule => ( -
{ - ruleChangeHandle(rule.id) - }} - > - - -
- ))} - {IS_CE_EDITION && ( - <> - -
-
{ - if (currentDataset?.doc_form) - return - if (docForm === ChunkingMode.qa) - handleChangeDocform(ChunkingMode.text) - else - handleChangeDocform(ChunkingMode.qa) - }} - > - - -
- - -
- {currentDocForm === ChunkingMode.qa && ( -
- - - {t('stepTwo.QATip', { ns: 'datasetCreation' })} - -
- )} - - )} -
-
-
-
+ {showGeneralOption && ( + segmentation.setSegmentIdentifier(value, true)} + onMaxChunkLengthChange={segmentation.setMaxChunkLength} + onOverlapChange={segmentation.setOverlap} + onRuleToggle={segmentation.toggleRule} + onDocFormChange={handleDocFormChange} + onDocLanguageChange={setDocLanguage} + onPreview={updatePreview} + onReset={segmentation.resetToDefaults} + locale={locale} + /> )} - { - ( - (isInUpload && currentDataset!.doc_form === ChunkingMode.parentChild) - || isUploadInEmptyDataset - || isInInit - ) - && ( - } - effectImg={BlueEffect.src} - className="text-util-colors-blue-light-blue-light-500" - activeHeaderClassName="bg-dataset-option-card-blue-gradient" - description={t('stepTwo.parentChildTip', { ns: 'datasetCreation' })} - isActive={currentDocForm === ChunkingMode.parentChild} - onSwitched={() => handleChangeDocform(ChunkingMode.parentChild)} - actions={( - <> - - - - )} - noHighlight={isInUpload && isNotUploadInEmptyDataset} - > -
-
-
-
- {t('stepTwo.parentChunkForContext', { ns: 'datasetCreation' })} -
- -
- } - title={t('stepTwo.paragraph', { ns: 'datasetCreation' })} - description={t('stepTwo.paragraphTip', { ns: 'datasetCreation' })} - isChosen={parentChildConfig.chunkForContext === 'paragraph'} - onChosen={() => setParentChildConfig( - { - ...parentChildConfig, - chunkForContext: 'paragraph', - }, - )} - chosenConfig={( -
- setParentChildConfig({ - ...parentChildConfig, - parent: { - ...parentChildConfig.parent, - delimiter: e.target.value ? escape(e.target.value) : '', - }, - })} - /> - setParentChildConfig({ - ...parentChildConfig, - parent: { - ...parentChildConfig.parent, - maxLength: value, - }, - })} - /> -
- )} - /> - } - title={t('stepTwo.fullDoc', { ns: 'datasetCreation' })} - description={t('stepTwo.fullDocTip', { ns: 'datasetCreation' })} - onChosen={() => setParentChildConfig( - { - ...parentChildConfig, - chunkForContext: 'full-doc', - }, - )} - isChosen={parentChildConfig.chunkForContext === 'full-doc'} - /> -
- -
-
-
- {t('stepTwo.childChunkForRetrieval', { ns: 'datasetCreation' })} -
- -
-
- setParentChildConfig({ - ...parentChildConfig, - child: { - ...parentChildConfig.child, - delimiter: e.target.value ? escape(e.target.value) : '', - }, - })} - /> - setParentChildConfig({ - ...parentChildConfig, - child: { - ...parentChildConfig.child, - maxLength: value, - }, - })} - /> -
-
-
-
-
- {t('stepTwo.rules', { ns: 'datasetCreation' })} -
- -
-
- {rules.map(rule => ( -
{ - ruleChangeHandle(rule.id) - }} - > - - -
- ))} -
-
-
-
- ) - } - -
{t('stepTwo.indexMode', { ns: 'datasetCreation' })}
-
- {(!hasSetIndexType || (hasSetIndexType && indexingType === IndexingType.QUALIFIED)) && ( - - {t('stepTwo.qualified', { ns: 'datasetCreation' })} - - {t('stepTwo.recommend', { ns: 'datasetCreation' })} - - - {!hasSetIndexType && } - -
- )} - description={t('stepTwo.qualifiedTip', { ns: 'datasetCreation' })} - icon={} - isActive={!hasSetIndexType && indexType === IndexingType.QUALIFIED} - disabled={hasSetIndexType} - onSwitched={() => { - setIndexType(IndexingType.QUALIFIED) - }} - /> - )} - - {(!hasSetIndexType || (hasSetIndexType && indexingType === IndexingType.ECONOMICAL)) && ( - <> - setIsQAConfirmDialogOpen(false)} className="w-[432px]"> -
-

- {t('stepTwo.qaSwitchHighQualityTipTitle', { ns: 'datasetCreation' })} -

-

- {t('stepTwo.qaSwitchHighQualityTipContent', { ns: 'datasetCreation' })} -

-
-
- - -
-
- - { - docForm === ChunkingMode.qa - ? t('stepTwo.notAvailableForQA', { ns: 'datasetCreation' }) - : t('stepTwo.notAvailableForParentChild', { ns: 'datasetCreation' }) - } -
- )} - noDecoration - position="top" - asChild={false} - triggerClassName="flex-1 self-stretch" - > - } - isActive={!hasSetIndexType && indexType === IndexingType.ECONOMICAL} - disabled={hasSetIndexType || docForm !== ChunkingMode.text} - onSwitched={() => { - setIndexType(IndexingType.ECONOMICAL) - }} - /> - - - )} -
- {!hasSetIndexType && indexType === IndexingType.QUALIFIED && ( -
-
-
- -
- {t('stepTwo.highQualityTip', { ns: 'datasetCreation' })} -
- )} - {hasSetIndexType && indexType === IndexingType.ECONOMICAL && ( -
- {t('stepTwo.indexSettingTip', { ns: 'datasetCreation' })} - {t('stepTwo.datasetSettingLink', { ns: 'datasetCreation' })} -
- )} - {/* Embedding model */} - {indexType === IndexingType.QUALIFIED && ( -
-
{t('form.embeddingModel', { ns: 'datasetSettings' })}
- { - setEmbeddingModel(model) - }} - /> - {isModelAndRetrievalConfigDisabled && ( -
- {t('stepTwo.indexSettingTip', { ns: 'datasetCreation' })} - {t('stepTwo.datasetSettingLink', { ns: 'datasetCreation' })} -
- )} -
+ {showParentChildOption && ( + segmentation.updateParentConfig('delimiter', v)} + onParentMaxLengthChange={v => segmentation.updateParentConfig('maxLength', v)} + onChildDelimiterChange={v => segmentation.updateChildConfig('delimiter', v)} + onChildMaxLengthChange={v => segmentation.updateChildConfig('maxLength', v)} + onRuleToggle={segmentation.toggleRule} + onPreview={updatePreview} + onReset={segmentation.resetToDefaults} + /> )} - {/* Retrieval Method Config */} -
- {!isModelAndRetrievalConfigDisabled - ? ( -
-
{t('form.retrievalSetting.title', { ns: 'datasetSettings' })}
-
- - {t('form.retrievalSetting.learnMore', { ns: 'datasetSettings' })} - - {t('form.retrievalSetting.longDescription', { ns: 'datasetSettings' })} -
-
- ) - : ( -
-
{t('form.retrievalSetting.title', { ns: 'datasetSettings' })}
-
- )} - -
- { - getIndexing_technique() === IndexingType.QUALIFIED - ? ( - - ) - : ( - - ) - } -
-
- - {!isSetting - ? ( -
- - -
- ) - : ( -
- - -
- )} + setIsQAConfirmDialogOpen(false)} + onQAConfirmDialogConfirm={handleQAConfirm} + /> + onStepChange?.(-1)} onCreate={handleCreate} onCancel={onCancel} />
- - -
- {dataSourceType === DataSourceType.FILE - && ( - >} - onChange={(selected) => { - currentEstimateMutation.reset() - setPreviewFile(selected) - currentEstimateMutation.mutate() - }} - // when it is from setting, it just has one file - value={isSetting ? (files[0]! as Required) : previewFile} - /> - )} - {dataSourceType === DataSourceType.NOTION - && ( - ({ - id: page.page_id, - name: page.page_name, - extension: 'md', - })) - } - onChange={(selected) => { - currentEstimateMutation.reset() - const selectedPage = notionPages.find(page => page.page_id === selected.id) - setPreviewNotionPage(selectedPage!) - currentEstimateMutation.mutate() - }} - value={{ - id: previewNotionPage?.page_id || '', - name: previewNotionPage?.page_name || '', - extension: 'md', - }} - /> - )} - {dataSourceType === DataSourceType.WEB - && ( - ({ - id: page.source_url, - name: page.title, - extension: 'md', - })) - } - onChange={(selected) => { - currentEstimateMutation.reset() - const selectedPage = websitePages.find(page => page.source_url === selected.id) - setPreviewWebsitePage(selectedPage!) - currentEstimateMutation.mutate() - }} - value={ - { - id: previewWebsitePage?.source_url || '', - name: previewWebsitePage?.title || '', - extension: 'md', - } - } - /> - )} - { - currentDocForm !== ChunkingMode.qa - && ( - - ) - } -
- - )} - className={cn('relative flex h-full w-1/2 shrink-0 p-4 pr-0', isMobile && 'w-full max-w-[524px]')} - mainClassName="space-y-6" - > - {currentDocForm === ChunkingMode.qa && estimate?.qa_preview && ( - estimate?.qa_preview.map((item, index) => ( - - - - )) - )} - {currentDocForm === ChunkingMode.text && estimate?.preview && ( - estimate?.preview.map((item, index) => ( - - {item.content} - - )) - )} - {currentDocForm === ChunkingMode.parentChild && currentEstimateMutation.data?.preview && ( - estimate?.preview?.map((item, index) => { - const indexForLabel = index + 1 - const childChunks = parentChildConfig.chunkForContext === 'full-doc' - ? item.child_chunks.slice(0, FULL_DOC_PREVIEW_LENGTH) - : item.child_chunks - return ( - - - {childChunks.map((child, index) => { - const indexForLabel = index + 1 - return ( - - ) - })} - - - ) - }) - )} - {currentEstimateMutation.isIdle && ( -
-
- -

- {t('stepTwo.previewChunkTip', { ns: 'datasetCreation' })} -

-
-
- )} - {currentEstimateMutation.isPending && ( -
- {Array.from({ length: 10 }, (_, i) => ( - - - - - - - - - - - ))} -
- )} -
-
+ } + pickerValue={preview.getPreviewPickerValue()} + isIdle={estimateHook.isIdle} + isPending={estimateHook.isPending} + onPickerChange={handlePickerChange} + />
) } diff --git a/web/app/components/datasets/create/step-two/types.ts b/web/app/components/datasets/create/step-two/types.ts new file mode 100644 index 0000000000..7f5291fb13 --- /dev/null +++ b/web/app/components/datasets/create/step-two/types.ts @@ -0,0 +1,28 @@ +import type { IndexingType } from './hooks' +import type { DataSourceProvider, NotionPage } from '@/models/common' +import type { CrawlOptions, CrawlResultItem, createDocumentResponse, CustomFile, DataSourceType, FullDocumentDetail } from '@/models/datasets' +import type { RETRIEVE_METHOD } from '@/types/app' + +export type StepTwoProps = { + isSetting?: boolean + documentDetail?: FullDocumentDetail + isAPIKeySet: boolean + onSetting: () => void + datasetId?: string + indexingType?: IndexingType + retrievalMethod?: string + dataSourceType: DataSourceType + files: CustomFile[] + notionPages?: NotionPage[] + notionCredentialId: string + websitePages?: CrawlResultItem[] + crawlOptions?: CrawlOptions + websiteCrawlProvider?: DataSourceProvider + websiteCrawlJobId?: string + onStepChange?: (delta: number) => void + updateIndexingTypeCache?: (type: string) => void + updateRetrievalMethodCache?: (method: RETRIEVE_METHOD | '') => void + updateResultCache?: (res: createDocumentResponse) => void + onSave?: () => void + onCancel?: () => void +} diff --git a/web/hooks/use-oauth.ts b/web/hooks/use-oauth.ts index 34ed8bafb0..8fb2707804 100644 --- a/web/hooks/use-oauth.ts +++ b/web/hooks/use-oauth.ts @@ -10,12 +10,15 @@ export const useOAuthCallback = () => { const errorDescription = urlParams.get('error_description') if (window.opener) { + // Use window.opener.origin instead of '*' for security + const targetOrigin = window.opener?.origin || '*' + if (subscriptionId) { window.opener.postMessage({ type: 'oauth_callback', success: true, subscriptionId, - }, '*') + }, targetOrigin) } else if (error) { window.opener.postMessage({ @@ -23,12 +26,12 @@ export const useOAuthCallback = () => { success: false, error, 
errorDescription, - }, '*') + }, targetOrigin) } else { window.opener.postMessage({ type: 'oauth_callback', - }, '*') + }, targetOrigin) } window.close() } diff --git a/web/i18n/ar-TN/common.json b/web/i18n/ar-TN/common.json index d015f1ae0b..998466c649 100644 --- a/web/i18n/ar-TN/common.json +++ b/web/i18n/ar-TN/common.json @@ -339,6 +339,9 @@ "modelProvider.callTimes": "أوقات الاتصال", "modelProvider.card.buyQuota": "شراء حصة", "modelProvider.card.callTimes": "أوقات الاتصال", + "modelProvider.card.modelAPI": "نماذج {{modelName}} تستخدم مفتاح API.", + "modelProvider.card.modelNotSupported": "نماذج {{modelName}} غير مثبتة.", + "modelProvider.card.modelSupported": "نماذج {{modelName}} تستخدم هذا الحصة.", "modelProvider.card.onTrial": "في التجربة", "modelProvider.card.paid": "مدفوع", "modelProvider.card.priorityUse": "أولوية الاستخدام", @@ -394,6 +397,7 @@ "modelProvider.quotaTip": "الرموز المجانية المتاحة المتبقية", "modelProvider.rerankModel.key": "نموذج إعادة الترتيب", "modelProvider.rerankModel.tip": "سيعيد نموذج إعادة الترتيب ترتيب قائمة المستندات المرشحة بناءً على المطابقة الدلالية مع استعلام المستخدم، مما يحسن نتائج الترتيب الدلالي", + "modelProvider.resetDate": "إعادة التعيين في {{date}}", "modelProvider.searchModel": "نموذج البحث", "modelProvider.selectModel": "اختر نموذجك", "modelProvider.selector.emptySetting": "يرجى الانتقال إلى الإعدادات للتكوين", diff --git a/web/i18n/de-DE/common.json b/web/i18n/de-DE/common.json index f54f6a939f..bd2d083fb0 100644 --- a/web/i18n/de-DE/common.json +++ b/web/i18n/de-DE/common.json @@ -339,6 +339,9 @@ "modelProvider.callTimes": "Anrufzeiten", "modelProvider.card.buyQuota": "Kontingent kaufen", "modelProvider.card.callTimes": "Anrufzeiten", + "modelProvider.card.modelAPI": "{{modelName}}-Modelle verwenden den API-Schlüssel.", + "modelProvider.card.modelNotSupported": "{{modelName}}-Modelle sind nicht installiert.", + "modelProvider.card.modelSupported": "{{modelName}}-Modelle verwenden dieses Kontingent.", 
"modelProvider.card.onTrial": "In Probe", "modelProvider.card.paid": "Bezahlt", "modelProvider.card.priorityUse": "Priorisierte Nutzung", @@ -394,6 +397,7 @@ "modelProvider.quotaTip": "Verbleibende verfügbare kostenlose Token", "modelProvider.rerankModel.key": "Rerank-Modell", "modelProvider.rerankModel.tip": "Rerank-Modell wird die Kandidatendokumentenliste basierend auf der semantischen Übereinstimmung mit der Benutzeranfrage neu ordnen und die Ergebnisse der semantischen Rangordnung verbessern", + "modelProvider.resetDate": "Zurücksetzen am {{date}}", "modelProvider.searchModel": "Suchmodell", "modelProvider.selectModel": "Wählen Sie Ihr Modell", "modelProvider.selector.emptySetting": "Bitte gehen Sie zu den Einstellungen, um zu konfigurieren", diff --git a/web/i18n/es-ES/common.json b/web/i18n/es-ES/common.json index ec08f11ed7..8175f97946 100644 --- a/web/i18n/es-ES/common.json +++ b/web/i18n/es-ES/common.json @@ -339,6 +339,9 @@ "modelProvider.callTimes": "Tiempos de llamada", "modelProvider.card.buyQuota": "Comprar Cuota", "modelProvider.card.callTimes": "Tiempos de llamada", + "modelProvider.card.modelAPI": "Los modelos {{modelName}} están usando la clave API.", + "modelProvider.card.modelNotSupported": "Los modelos {{modelName}} no están instalados.", + "modelProvider.card.modelSupported": "Los modelos {{modelName}} están usando esta cuota.", "modelProvider.card.onTrial": "En prueba", "modelProvider.card.paid": "Pagado", "modelProvider.card.priorityUse": "Uso prioritario", @@ -394,6 +397,7 @@ "modelProvider.quotaTip": "Tokens gratuitos restantes disponibles", "modelProvider.rerankModel.key": "Modelo de Reordenar", "modelProvider.rerankModel.tip": "El modelo de reordenar reordenará la lista de documentos candidatos basada en la coincidencia semántica con la consulta del usuario, mejorando los resultados de clasificación semántica", + "modelProvider.resetDate": "Restablecer el {{date}}", "modelProvider.searchModel": "Modelo de búsqueda", 
"modelProvider.selectModel": "Selecciona tu modelo", "modelProvider.selector.emptySetting": "Por favor ve a configuraciones para configurar", diff --git a/web/i18n/fa-IR/common.json b/web/i18n/fa-IR/common.json index 78f9b9e388..90ca2fbce3 100644 --- a/web/i18n/fa-IR/common.json +++ b/web/i18n/fa-IR/common.json @@ -339,6 +339,9 @@ "modelProvider.callTimes": "تعداد فراخوانی", "modelProvider.card.buyQuota": "خرید سهمیه", "modelProvider.card.callTimes": "تعداد فراخوانی", + "modelProvider.card.modelAPI": "مدل‌های {{modelName}} از کلید API استفاده می‌کنند.", + "modelProvider.card.modelNotSupported": "مدل‌های {{modelName}} نصب نشده‌اند.", + "modelProvider.card.modelSupported": "مدل‌های {{modelName}} از این سهمیه استفاده می‌کنند.", "modelProvider.card.onTrial": "در حال آزمایش", "modelProvider.card.paid": "پرداخت شده", "modelProvider.card.priorityUse": "استفاده با اولویت", @@ -394,6 +397,7 @@ "modelProvider.quotaTip": "توکن‌های رایگان باقی‌مانده در دسترس", "modelProvider.rerankModel.key": "مدل رتبه‌بندی مجدد", "modelProvider.rerankModel.tip": "مدل رتبه‌بندی مجدد، لیست اسناد کاندید را بر اساس تطابق معنایی با پرسش کاربر مرتب می‌کند و نتایج رتبه‌بندی معنایی را بهبود می‌بخشد", + "modelProvider.resetDate": "بازنشانی در {{date}}", "modelProvider.searchModel": "جستجوی مدل", "modelProvider.selectModel": "مدل خود را انتخاب کنید", "modelProvider.selector.emptySetting": "لطفاً به تنظیمات بروید تا پیکربندی کنید", diff --git a/web/i18n/fr-FR/common.json b/web/i18n/fr-FR/common.json index 7cc1af2d80..d2b4c70d7c 100644 --- a/web/i18n/fr-FR/common.json +++ b/web/i18n/fr-FR/common.json @@ -339,6 +339,9 @@ "modelProvider.callTimes": "Temps d'appel", "modelProvider.card.buyQuota": "Acheter Quota", "modelProvider.card.callTimes": "Temps d'appel", + "modelProvider.card.modelAPI": "Les modèles {{modelName}} utilisent la clé API.", + "modelProvider.card.modelNotSupported": "Les modèles {{modelName}} ne sont pas installés.", + "modelProvider.card.modelSupported": "Les modèles {{modelName}} 
utilisent ce quota.", "modelProvider.card.onTrial": "En Essai", "modelProvider.card.paid": "Payé", "modelProvider.card.priorityUse": "Utilisation prioritaire", @@ -394,6 +397,7 @@ "modelProvider.quotaTip": "Tokens gratuits restants disponibles", "modelProvider.rerankModel.key": "Modèle de Réorganisation", "modelProvider.rerankModel.tip": "Le modèle de réorganisation réorganisera la liste des documents candidats en fonction de la correspondance sémantique avec la requête de l'utilisateur, améliorant ainsi les résultats du classement sémantique.", + "modelProvider.resetDate": "Réinitialiser le {{date}}", "modelProvider.searchModel": "Modèle de recherche", "modelProvider.selectModel": "Sélectionnez votre modèle", "modelProvider.selector.emptySetting": "Veuillez aller dans les paramètres pour configurer", diff --git a/web/i18n/hi-IN/common.json b/web/i18n/hi-IN/common.json index 4670d5a545..c7b2402f81 100644 --- a/web/i18n/hi-IN/common.json +++ b/web/i18n/hi-IN/common.json @@ -339,6 +339,9 @@ "modelProvider.callTimes": "कॉल समय", "modelProvider.card.buyQuota": "कोटा खरीदें", "modelProvider.card.callTimes": "कॉल समय", + "modelProvider.card.modelAPI": "{{modelName}} मॉडल API कुंजी का उपयोग कर रहे हैं।", + "modelProvider.card.modelNotSupported": "{{modelName}} मॉडल इंस्टॉल नहीं हैं।", + "modelProvider.card.modelSupported": "{{modelName}} मॉडल इस कोटा का उपयोग कर रहे हैं।", "modelProvider.card.onTrial": "परीक्षण पर", "modelProvider.card.paid": "भुगतान किया हुआ", "modelProvider.card.priorityUse": "प्राथमिकता उपयोग", @@ -394,6 +397,7 @@ "modelProvider.quotaTip": "बचे हुए उपलब्ध मुफ्त टोकन", "modelProvider.rerankModel.key": "रीरैंक मॉडल", "modelProvider.rerankModel.tip": "रीरैंक मॉडल उपयोगकर्ता प्रश्न के साथ सांविधिक मेल के आधार पर उम्मीदवार दस्तावेज़ सूची को पुनः क्रमित करेगा, सांविधिक रैंकिंग के परिणामों में सुधार करेगा।", + "modelProvider.resetDate": "{{date}} को रीसेट करें", "modelProvider.searchModel": "खोज मॉडल", "modelProvider.selectModel": "अपने मॉडल का चयन करें", 
"modelProvider.selector.emptySetting": "कॉन्फ़िगर करने के लिए कृपया सेटिंग्स पर जाएं", diff --git a/web/i18n/id-ID/common.json b/web/i18n/id-ID/common.json index ede4d3ae44..541ee74b10 100644 --- a/web/i18n/id-ID/common.json +++ b/web/i18n/id-ID/common.json @@ -339,6 +339,9 @@ "modelProvider.callTimes": "Waktu panggilan", "modelProvider.card.buyQuota": "Beli Kuota", "modelProvider.card.callTimes": "Waktu panggilan", + "modelProvider.card.modelAPI": "Model {{modelName}} menggunakan Kunci API.", + "modelProvider.card.modelNotSupported": "Model {{modelName}} tidak terpasang.", + "modelProvider.card.modelSupported": "Model {{modelName}} menggunakan kuota ini.", "modelProvider.card.onTrial": "Sedang Diadili", "modelProvider.card.paid": "Dibayar", "modelProvider.card.priorityUse": "Penggunaan prioritas", @@ -394,6 +397,7 @@ "modelProvider.quotaTip": "Token gratis yang masih tersedia", "modelProvider.rerankModel.key": "Peringkat ulang Model", "modelProvider.rerankModel.tip": "Model rerank akan menyusun ulang daftar dokumen kandidat berdasarkan kecocokan semantik dengan kueri pengguna, meningkatkan hasil peringkat semantik", + "modelProvider.resetDate": "Setel ulang pada {{date}}", "modelProvider.searchModel": "Model pencarian", "modelProvider.selectModel": "Pilih model Anda", "modelProvider.selector.emptySetting": "Silakan buka pengaturan untuk mengonfigurasi", diff --git a/web/i18n/it-IT/common.json b/web/i18n/it-IT/common.json index 737ef923b1..49e14591a7 100644 --- a/web/i18n/it-IT/common.json +++ b/web/i18n/it-IT/common.json @@ -339,6 +339,9 @@ "modelProvider.callTimes": "Numero di chiamate", "modelProvider.card.buyQuota": "Acquista Quota", "modelProvider.card.callTimes": "Numero di chiamate", + "modelProvider.card.modelAPI": "I modelli {{modelName}} stanno utilizzando la chiave API.", + "modelProvider.card.modelNotSupported": "I modelli {{modelName}} non sono installati.", + "modelProvider.card.modelSupported": "I modelli {{modelName}} stanno utilizzando questa 
quota.", "modelProvider.card.onTrial": "In Prova", "modelProvider.card.paid": "Pagato", "modelProvider.card.priorityUse": "Uso prioritario", @@ -394,6 +397,7 @@ "modelProvider.quotaTip": "Token gratuiti rimanenti disponibili", "modelProvider.rerankModel.key": "Modello di Rerank", "modelProvider.rerankModel.tip": "Il modello di rerank riordinerà la lista dei documenti candidati basandosi sulla corrispondenza semantica con la query dell'utente, migliorando i risultati del ranking semantico", + "modelProvider.resetDate": "Ripristina il {{date}}", "modelProvider.searchModel": "Modello di ricerca", "modelProvider.selectModel": "Seleziona il tuo modello", "modelProvider.selector.emptySetting": "Per favore vai alle impostazioni per configurare", diff --git a/web/i18n/ko-KR/common.json b/web/i18n/ko-KR/common.json index 5640cb353d..a8ae974530 100644 --- a/web/i18n/ko-KR/common.json +++ b/web/i18n/ko-KR/common.json @@ -339,6 +339,9 @@ "modelProvider.callTimes": "호출 횟수", "modelProvider.card.buyQuota": "Buy Quota", "modelProvider.card.callTimes": "호출 횟수", + "modelProvider.card.modelAPI": "{{modelName}} 모델이 API 키를 사용하고 있습니다.", + "modelProvider.card.modelNotSupported": "{{modelName}} 모델이 설치되지 않았습니다.", + "modelProvider.card.modelSupported": "{{modelName}} 모델이 이 할당량을 사용하고 있습니다.", "modelProvider.card.onTrial": "트라이얼 중", "modelProvider.card.paid": "유료", "modelProvider.card.priorityUse": "우선 사용", @@ -394,6 +397,7 @@ "modelProvider.quotaTip": "남은 무료 토큰 사용 가능", "modelProvider.rerankModel.key": "재랭크 모델", "modelProvider.rerankModel.tip": "재랭크 모델은 사용자 쿼리와의 의미적 일치를 기반으로 후보 문서 목록을 재배열하여 의미적 순위를 향상시킵니다.", + "modelProvider.resetDate": "{{date}}에 재설정", "modelProvider.searchModel": "검색 모델", "modelProvider.selectModel": "모델 선택", "modelProvider.selector.emptySetting": "설정으로 이동하여 구성하세요", diff --git a/web/i18n/pl-PL/common.json b/web/i18n/pl-PL/common.json index ae654e04ac..963ecf865d 100644 --- a/web/i18n/pl-PL/common.json +++ b/web/i18n/pl-PL/common.json @@ -339,6 +339,9 @@ 
"modelProvider.callTimes": "Czasy wywołań", "modelProvider.card.buyQuota": "Kup limit", "modelProvider.card.callTimes": "Czasy wywołań", + "modelProvider.card.modelAPI": "Modele {{modelName}} używają klucza API.", + "modelProvider.card.modelNotSupported": "Modele {{modelName}} nie są zainstalowane.", + "modelProvider.card.modelSupported": "Modele {{modelName}} używają tego limitu.", "modelProvider.card.onTrial": "Na próbę", "modelProvider.card.paid": "Płatny", "modelProvider.card.priorityUse": "Używanie z priorytetem", @@ -394,6 +397,7 @@ "modelProvider.quotaTip": "Pozostałe dostępne darmowe tokeny", "modelProvider.rerankModel.key": "Model ponownego rankingu", "modelProvider.rerankModel.tip": "Model ponownego rankingu zmieni kolejność listy dokumentów kandydatów na podstawie semantycznego dopasowania z zapytaniem użytkownika, poprawiając wyniki rankingu semantycznego", + "modelProvider.resetDate": "Reset {{date}}", "modelProvider.searchModel": "Model wyszukiwania", "modelProvider.selectModel": "Wybierz swój model", "modelProvider.selector.emptySetting": "Przejdź do ustawień, aby skonfigurować", diff --git a/web/i18n/pt-BR/common.json b/web/i18n/pt-BR/common.json index 2e7f49de7e..7efc250349 100644 --- a/web/i18n/pt-BR/common.json +++ b/web/i18n/pt-BR/common.json @@ -339,6 +339,9 @@ "modelProvider.callTimes": "Chamadas", "modelProvider.card.buyQuota": "Comprar Quota", "modelProvider.card.callTimes": "Chamadas", + "modelProvider.card.modelAPI": "Os modelos {{modelName}} estão usando a Chave API.", + "modelProvider.card.modelNotSupported": "Os modelos {{modelName}} não estão instalados.", + "modelProvider.card.modelSupported": "Os modelos {{modelName}} estão usando esta cota.", "modelProvider.card.onTrial": "Em Teste", "modelProvider.card.paid": "Pago", "modelProvider.card.priorityUse": "Uso prioritário", @@ -394,6 +397,7 @@ "modelProvider.quotaTip": "Tokens gratuitos disponíveis restantes", "modelProvider.rerankModel.key": "Modelo de Reordenação", 
"modelProvider.rerankModel.tip": "O modelo de reordenaenação reorganizará a lista de documentos candidatos com base na correspondência semântica com a consulta do usuário, melhorando os resultados da classificação semântica", + "modelProvider.resetDate": "Redefinir em {{date}}", "modelProvider.searchModel": "Modelo de pesquisa", "modelProvider.selectModel": "Selecione seu modelo", "modelProvider.selector.emptySetting": "Por favor, vá para configurações para configurar", diff --git a/web/i18n/ro-RO/common.json b/web/i18n/ro-RO/common.json index c21e755b3c..bafe3542dc 100644 --- a/web/i18n/ro-RO/common.json +++ b/web/i18n/ro-RO/common.json @@ -339,6 +339,9 @@ "modelProvider.callTimes": "Apeluri", "modelProvider.card.buyQuota": "Cumpără cotă", "modelProvider.card.callTimes": "Apeluri", + "modelProvider.card.modelAPI": "Modelele {{modelName}} folosesc cheia API.", + "modelProvider.card.modelNotSupported": "Modelele {{modelName}} nu sunt instalate.", + "modelProvider.card.modelSupported": "Modelele {{modelName}} folosesc această cotă.", "modelProvider.card.onTrial": "În probă", "modelProvider.card.paid": "Plătit", "modelProvider.card.priorityUse": "Utilizare prioritară", @@ -394,6 +397,7 @@ "modelProvider.quotaTip": "Jetoane gratuite disponibile rămase", "modelProvider.rerankModel.key": "Model de reordonare", "modelProvider.rerankModel.tip": "Modelul de reordonare va reordona lista de documente candidate pe baza potrivirii semantice cu interogarea utilizatorului, îmbunătățind rezultatele clasificării semantice", + "modelProvider.resetDate": "Resetare la {{date}}", "modelProvider.searchModel": "Model de căutare", "modelProvider.selectModel": "Selectați modelul dvs.", "modelProvider.selector.emptySetting": "Vă rugăm să mergeți la setări pentru a configura", diff --git a/web/i18n/ru-RU/common.json b/web/i18n/ru-RU/common.json index e763a7ec2a..0210db777f 100644 --- a/web/i18n/ru-RU/common.json +++ b/web/i18n/ru-RU/common.json @@ -339,6 +339,9 @@ "modelProvider.callTimes": 
"Количество вызовов", "modelProvider.card.buyQuota": "Купить квоту", "modelProvider.card.callTimes": "Количество вызовов", + "modelProvider.card.modelAPI": "Модели {{modelName}} используют API-ключ.", + "modelProvider.card.modelNotSupported": "Модели {{modelName}} не установлены.", + "modelProvider.card.modelSupported": "Модели {{modelName}} используют эту квоту.", "modelProvider.card.onTrial": "Пробная версия", "modelProvider.card.paid": "Платный", "modelProvider.card.priorityUse": "Приоритетное использование", @@ -394,6 +397,7 @@ "modelProvider.quotaTip": "Оставшиеся доступные бесплатные токены", "modelProvider.rerankModel.key": "Модель повторного ранжирования", "modelProvider.rerankModel.tip": "Модель повторного ранжирования изменит порядок списка документов-кандидатов на основе семантического соответствия запросу пользователя, улучшая результаты семантического ранжирования", + "modelProvider.resetDate": "Сброс {{date}}", "modelProvider.searchModel": "Поиск модели", "modelProvider.selectModel": "Выберите свою модель", "modelProvider.selector.emptySetting": "Пожалуйста, перейдите в настройки для настройки", diff --git a/web/i18n/sl-SI/common.json b/web/i18n/sl-SI/common.json index d092fe10c8..c33686ac03 100644 --- a/web/i18n/sl-SI/common.json +++ b/web/i18n/sl-SI/common.json @@ -339,6 +339,9 @@ "modelProvider.callTimes": "Število klicev", "modelProvider.card.buyQuota": "Kupi kvoto", "modelProvider.card.callTimes": "Časi klicev", + "modelProvider.card.modelAPI": "Modeli {{modelName}} uporabljajo API ključ.", + "modelProvider.card.modelNotSupported": "Modeli {{modelName}} niso nameščeni.", + "modelProvider.card.modelSupported": "Modeli {{modelName}} uporabljajo to kvoto.", "modelProvider.card.onTrial": "Na preizkusu", "modelProvider.card.paid": "Plačano", "modelProvider.card.priorityUse": "Prednostna uporaba", @@ -394,6 +397,7 @@ "modelProvider.quotaTip": "Preostali razpoložljivi brezplačni žetoni", "modelProvider.rerankModel.key": "Model za prerazvrstitev", 
"modelProvider.rerankModel.tip": "Model za prerazvrstitev bo prerazporedil seznam kandidatskih dokumentov na podlagi semantične ujemanja z uporabniško poizvedbo, s čimer se izboljšajo rezultati semantičnega razvrščanja.", + "modelProvider.resetDate": "Ponastavi {{date}}", "modelProvider.searchModel": "Model iskanja", "modelProvider.selectModel": "Izberite svoj model", "modelProvider.selector.emptySetting": "Prosimo, pojdite v nastavitve za konfiguracijo", diff --git a/web/i18n/th-TH/common.json b/web/i18n/th-TH/common.json index 9a38f7f683..2a6b575618 100644 --- a/web/i18n/th-TH/common.json +++ b/web/i18n/th-TH/common.json @@ -339,6 +339,9 @@ "modelProvider.callTimes": "เวลาโทร", "modelProvider.card.buyQuota": "ซื้อโควต้า", "modelProvider.card.callTimes": "เวลาโทร", + "modelProvider.card.modelAPI": "โมเดล {{modelName}} กำลังใช้คีย์ API", + "modelProvider.card.modelNotSupported": "โมเดล {{modelName}} ไม่ได้ติดตั้ง", + "modelProvider.card.modelSupported": "โมเดล {{modelName}} กำลังใช้โควต้านี้", "modelProvider.card.onTrial": "ทดลองใช้", "modelProvider.card.paid": "จ่าย", "modelProvider.card.priorityUse": "ลําดับความสําคัญในการใช้งาน", @@ -394,6 +397,7 @@ "modelProvider.quotaTip": "โทเค็นฟรีที่เหลืออยู่", "modelProvider.rerankModel.key": "จัดอันดับโมเดลใหม่", "modelProvider.rerankModel.tip": "โมเดล Rerank จะจัดลําดับรายการเอกสารผู้สมัครใหม่ตามการจับคู่ความหมายกับการสืบค้นของผู้ใช้ ซึ่งช่วยปรับปรุงผลลัพธ์ของการจัดอันดับความหมาย", + "modelProvider.resetDate": "รีเซ็ตเมื่อ {{date}}", "modelProvider.searchModel": "ค้นหารุ่น", "modelProvider.selectModel": "เลือกรุ่นของคุณ", "modelProvider.selector.emptySetting": "โปรดไปที่การตั้งค่าเพื่อกําหนดค่า", diff --git a/web/i18n/tr-TR/common.json b/web/i18n/tr-TR/common.json index 0ee51e161c..c45b453180 100644 --- a/web/i18n/tr-TR/common.json +++ b/web/i18n/tr-TR/common.json @@ -339,6 +339,9 @@ "modelProvider.callTimes": "Çağrı Süreleri", "modelProvider.card.buyQuota": "Kota Satın Al", "modelProvider.card.callTimes": "Çağrı 
Süreleri", + "modelProvider.card.modelAPI": "{{modelName}} modelleri API Anahtarını kullanıyor.", + "modelProvider.card.modelNotSupported": "{{modelName}} modelleri kurulu değil.", + "modelProvider.card.modelSupported": "{{modelName}} modelleri bu kotayı kullanıyor.", "modelProvider.card.onTrial": "Deneme Sürümünde", "modelProvider.card.paid": "Ücretli", "modelProvider.card.priorityUse": "Öncelikli Kullan", @@ -394,6 +397,7 @@ "modelProvider.quotaTip": "Kalan kullanılabilir ücretsiz tokenler", "modelProvider.rerankModel.key": "Yeniden Sıralama Modeli", "modelProvider.rerankModel.tip": "Yeniden sıralama modeli, kullanıcı sorgusuyla anlam eşleştirmesine dayalı olarak aday belge listesini yeniden sıralayacak ve anlam sıralama sonuçlarını iyileştirecektir.", + "modelProvider.resetDate": "{{date}} tarihinde sıfırla", "modelProvider.searchModel": "Model ara", "modelProvider.selectModel": "Modelinizi seçin", "modelProvider.selector.emptySetting": "Lütfen ayarlara gidip yapılandırın", diff --git a/web/i18n/uk-UA/common.json b/web/i18n/uk-UA/common.json index ddec8637e1..e9e810da45 100644 --- a/web/i18n/uk-UA/common.json +++ b/web/i18n/uk-UA/common.json @@ -339,6 +339,9 @@ "modelProvider.callTimes": "Кількість викликів", "modelProvider.card.buyQuota": "Придбати квоту", "modelProvider.card.callTimes": "Кількість викликів", + "modelProvider.card.modelAPI": "Моделі {{modelName}} використовують API-ключ.", + "modelProvider.card.modelNotSupported": "Моделі {{modelName}} не встановлено.", + "modelProvider.card.modelSupported": "Моделі {{modelName}} використовують цю квоту.", "modelProvider.card.onTrial": "У пробному періоді", "modelProvider.card.paid": "Оплачено", "modelProvider.card.priorityUse": "Пріоритетне використання", @@ -394,6 +397,7 @@ "modelProvider.quotaTip": "Залишилося доступних безкоштовних токенів", "modelProvider.rerankModel.key": "Модель повторного ранжування", "modelProvider.rerankModel.tip": "Модель повторного ранжування змінить порядок списку 
документів-кандидатів на основі семантичної відповідності запиту користувача, покращуючи результати семантичного ранжування.", + "modelProvider.resetDate": "Скидання {{date}}", "modelProvider.searchModel": "Пошукова модель", "modelProvider.selectModel": "Виберіть свою модель", "modelProvider.selector.emptySetting": "Перейдіть до налаштувань, щоб налаштувати", diff --git a/web/i18n/vi-VN/common.json b/web/i18n/vi-VN/common.json index f8fa9c07d5..1fec0e10e2 100644 --- a/web/i18n/vi-VN/common.json +++ b/web/i18n/vi-VN/common.json @@ -339,6 +339,9 @@ "modelProvider.callTimes": "Số lần gọi", "modelProvider.card.buyQuota": "Mua Quota", "modelProvider.card.callTimes": "Số lần gọi", + "modelProvider.card.modelAPI": "Các mô hình {{modelName}} đang sử dụng Khóa API.", + "modelProvider.card.modelNotSupported": "Các mô hình {{modelName}} chưa được cài đặt.", + "modelProvider.card.modelSupported": "Các mô hình {{modelName}} đang sử dụng hạn mức này.", "modelProvider.card.onTrial": "Thử nghiệm", "modelProvider.card.paid": "Đã thanh toán", "modelProvider.card.priorityUse": "Ưu tiên sử dụng", @@ -394,6 +397,7 @@ "modelProvider.quotaTip": "Số lượng mã thông báo miễn phí còn lại", "modelProvider.rerankModel.key": "Mô hình Sắp xếp lại", "modelProvider.rerankModel.tip": "Mô hình sắp xếp lại sẽ sắp xếp lại danh sách tài liệu ứng cử viên dựa trên sự phù hợp ngữ nghĩa với truy vấn của người dùng, cải thiện kết quả của việc xếp hạng ngữ nghĩa", + "modelProvider.resetDate": "Đặt lại vào {{date}}", "modelProvider.searchModel": "Mô hình tìm kiếm", "modelProvider.selectModel": "Chọn mô hình của bạn", "modelProvider.selector.emptySetting": "Vui lòng vào cài đặt để cấu hình", diff --git a/web/i18n/zh-Hant/common.json b/web/i18n/zh-Hant/common.json index 8fe3e5bd07..52be863c6d 100644 --- a/web/i18n/zh-Hant/common.json +++ b/web/i18n/zh-Hant/common.json @@ -339,6 +339,9 @@ "modelProvider.callTimes": "呼叫次數", "modelProvider.card.buyQuota": "購買額度", "modelProvider.card.callTimes": "呼叫次數", + 
"modelProvider.card.modelAPI": "{{modelName}} 模型正在使用 API Key。", + "modelProvider.card.modelNotSupported": "{{modelName}} 模型未安裝。", + "modelProvider.card.modelSupported": "{{modelName}} 模型正在使用此配額。", "modelProvider.card.onTrial": "試用中", "modelProvider.card.paid": "已購買", "modelProvider.card.priorityUse": "優先使用", @@ -394,6 +397,7 @@ "modelProvider.quotaTip": "剩餘免費額度", "modelProvider.rerankModel.key": "Rerank 模型", "modelProvider.rerankModel.tip": "重排序模型將根據候選文件列表與使用者問題語義匹配度進行重新排序,從而改進語義排序的結果", + "modelProvider.resetDate": "於 {{date}} 重置", "modelProvider.searchModel": "搜尋模型", "modelProvider.selectModel": "選擇您的模型", "modelProvider.selector.emptySetting": "請前往設定進行配置",