mirror of https://github.com/langgenius/dify.git
302 lines
11 KiB
TypeScript
302 lines
11 KiB
TypeScript
/**
|
|
* Integration Test: Create Dataset Flow
|
|
*
|
|
* Tests cross-module data flow: step-one data → step-two hooks → creation params → API call
|
|
* Validates data contracts between steps.
|
|
*/
|
|
|
|
import type { CustomFile } from '@/models/datasets'
|
|
import type { RetrievalConfig } from '@/types/app'
|
|
import { act, renderHook } from '@testing-library/react'
|
|
import { beforeEach, describe, expect, it, vi } from 'vitest'
|
|
import { ChunkingMode, DataSourceType, ProcessMode } from '@/models/datasets'
|
|
import { RETRIEVE_METHOD } from '@/types/app'
|
|
|
|
const mockCreateFirstDocument = vi.fn()
|
|
const mockCreateDocument = vi.fn()
|
|
vi.mock('@/service/knowledge/use-create-dataset', () => ({
|
|
useCreateFirstDocument: () => ({ mutateAsync: mockCreateFirstDocument, isPending: false }),
|
|
useCreateDocument: () => ({ mutateAsync: mockCreateDocument, isPending: false }),
|
|
getNotionInfo: (pages: { page_id: string }[], credentialId: string) => ({
|
|
workspace_id: 'ws-1',
|
|
pages: pages.map(p => p.page_id),
|
|
notion_credential_id: credentialId,
|
|
}),
|
|
getWebsiteInfo: (opts: { websitePages: { url: string }[], websiteCrawlProvider: string }) => ({
|
|
urls: opts.websitePages.map(p => p.url),
|
|
only_main_content: true,
|
|
provider: opts.websiteCrawlProvider,
|
|
}),
|
|
}))
|
|
|
|
vi.mock('@/service/knowledge/use-dataset', () => ({
|
|
useInvalidDatasetList: () => vi.fn(),
|
|
}))
|
|
|
|
vi.mock('@/app/components/base/toast', () => ({
|
|
default: { notify: vi.fn() },
|
|
}))
|
|
|
|
vi.mock('@/app/components/base/amplitude', () => ({
|
|
trackEvent: vi.fn(),
|
|
}))
|
|
|
|
// Import hooks after mocks
|
|
const { useSegmentationState, DEFAULT_SEGMENT_IDENTIFIER, DEFAULT_MAXIMUM_CHUNK_LENGTH, DEFAULT_OVERLAP }
|
|
= await import('@/app/components/datasets/create/step-two/hooks')
|
|
const { useDocumentCreation, IndexingType }
|
|
= await import('@/app/components/datasets/create/step-two/hooks')
|
|
|
|
const createMockFile = (overrides?: Partial<CustomFile>): CustomFile => ({
|
|
id: 'file-1',
|
|
name: 'test.txt',
|
|
type: 'text/plain',
|
|
size: 1024,
|
|
extension: '.txt',
|
|
mime_type: 'text/plain',
|
|
created_at: 0,
|
|
created_by: '',
|
|
...overrides,
|
|
} as CustomFile)
|
|
|
|
describe('Create Dataset Flow - Cross-Step Data Contract', () => {
|
|
beforeEach(() => {
|
|
vi.clearAllMocks()
|
|
})
|
|
|
|
describe('Step-One → Step-Two: Segmentation Defaults', () => {
|
|
it('should initialise with correct default segmentation values', () => {
|
|
const { result } = renderHook(() => useSegmentationState())
|
|
expect(result.current.segmentIdentifier).toBe(DEFAULT_SEGMENT_IDENTIFIER)
|
|
expect(result.current.maxChunkLength).toBe(DEFAULT_MAXIMUM_CHUNK_LENGTH)
|
|
expect(result.current.overlap).toBe(DEFAULT_OVERLAP)
|
|
expect(result.current.segmentationType).toBe(ProcessMode.general)
|
|
})
|
|
|
|
it('should produce valid process rule for general chunking', () => {
|
|
const { result } = renderHook(() => useSegmentationState())
|
|
const processRule = result.current.getProcessRule(ChunkingMode.text)
|
|
|
|
// mode should be segmentationType = ProcessMode.general = 'custom'
|
|
expect(processRule.mode).toBe('custom')
|
|
expect(processRule.rules.segmentation).toEqual({
|
|
separator: '\n\n', // unescaped from \\n\\n
|
|
max_tokens: DEFAULT_MAXIMUM_CHUNK_LENGTH,
|
|
chunk_overlap: DEFAULT_OVERLAP,
|
|
})
|
|
// rules is empty initially since no default config loaded
|
|
expect(processRule.rules.pre_processing_rules).toEqual([])
|
|
})
|
|
|
|
it('should produce valid process rule for parent-child chunking', () => {
|
|
const { result } = renderHook(() => useSegmentationState())
|
|
const processRule = result.current.getProcessRule(ChunkingMode.parentChild)
|
|
|
|
expect(processRule.mode).toBe('hierarchical')
|
|
expect(processRule.rules.parent_mode).toBe('paragraph')
|
|
expect(processRule.rules.segmentation).toEqual({
|
|
separator: '\n\n',
|
|
max_tokens: 1024,
|
|
})
|
|
expect(processRule.rules.subchunk_segmentation).toEqual({
|
|
separator: '\n',
|
|
max_tokens: 512,
|
|
})
|
|
})
|
|
})
|
|
|
|
describe('Step-Two → Creation API: Params Building', () => {
|
|
it('should build valid creation params for file upload workflow', () => {
|
|
const files = [createMockFile()]
|
|
const { result: segResult } = renderHook(() => useSegmentationState())
|
|
const { result: creationResult } = renderHook(() =>
|
|
useDocumentCreation({
|
|
dataSourceType: DataSourceType.FILE,
|
|
files,
|
|
notionPages: [],
|
|
notionCredentialId: '',
|
|
websitePages: [],
|
|
}),
|
|
)
|
|
|
|
const processRule = segResult.current.getProcessRule(ChunkingMode.text)
|
|
const retrievalConfig: RetrievalConfig = {
|
|
search_method: RETRIEVE_METHOD.semantic,
|
|
reranking_enable: false,
|
|
reranking_model: { reranking_provider_name: '', reranking_model_name: '' },
|
|
top_k: 3,
|
|
score_threshold_enabled: false,
|
|
score_threshold: 0,
|
|
}
|
|
|
|
const params = creationResult.current.buildCreationParams(
|
|
ChunkingMode.text,
|
|
'English',
|
|
processRule,
|
|
retrievalConfig,
|
|
{ provider: 'openai', model: 'text-embedding-ada-002' },
|
|
IndexingType.QUALIFIED,
|
|
)
|
|
|
|
expect(params).not.toBeNull()
|
|
// File IDs come from file.id (not file.file.id)
|
|
expect(params!.data_source.type).toBe(DataSourceType.FILE)
|
|
expect(params!.data_source.info_list.file_info_list?.file_ids).toContain('file-1')
|
|
|
|
expect(params!.indexing_technique).toBe(IndexingType.QUALIFIED)
|
|
expect(params!.doc_form).toBe(ChunkingMode.text)
|
|
expect(params!.doc_language).toBe('English')
|
|
expect(params!.embedding_model).toBe('text-embedding-ada-002')
|
|
expect(params!.embedding_model_provider).toBe('openai')
|
|
expect(params!.process_rule.mode).toBe('custom')
|
|
})
|
|
|
|
it('should validate params: overlap must not exceed maxChunkLength', () => {
|
|
const { result } = renderHook(() =>
|
|
useDocumentCreation({
|
|
dataSourceType: DataSourceType.FILE,
|
|
files: [createMockFile()],
|
|
notionPages: [],
|
|
notionCredentialId: '',
|
|
websitePages: [],
|
|
}),
|
|
)
|
|
|
|
// validateParams returns false (invalid) when overlap > maxChunkLength for general mode
|
|
const isValid = result.current.validateParams({
|
|
segmentationType: 'general',
|
|
maxChunkLength: 100,
|
|
limitMaxChunkLength: 4000,
|
|
overlap: 200, // overlap > maxChunkLength
|
|
indexType: IndexingType.QUALIFIED,
|
|
embeddingModel: { provider: 'openai', model: 'text-embedding-ada-002' },
|
|
rerankModelList: [],
|
|
retrievalConfig: {
|
|
search_method: RETRIEVE_METHOD.semantic,
|
|
reranking_enable: false,
|
|
reranking_model: { reranking_provider_name: '', reranking_model_name: '' },
|
|
top_k: 3,
|
|
score_threshold_enabled: false,
|
|
score_threshold: 0,
|
|
},
|
|
})
|
|
expect(isValid).toBe(false)
|
|
})
|
|
|
|
it('should validate params: maxChunkLength must not exceed limit', () => {
|
|
const { result } = renderHook(() =>
|
|
useDocumentCreation({
|
|
dataSourceType: DataSourceType.FILE,
|
|
files: [createMockFile()],
|
|
notionPages: [],
|
|
notionCredentialId: '',
|
|
websitePages: [],
|
|
}),
|
|
)
|
|
|
|
const isValid = result.current.validateParams({
|
|
segmentationType: 'general',
|
|
maxChunkLength: 5000,
|
|
limitMaxChunkLength: 4000, // limit < maxChunkLength
|
|
overlap: 50,
|
|
indexType: IndexingType.QUALIFIED,
|
|
embeddingModel: { provider: 'openai', model: 'text-embedding-ada-002' },
|
|
rerankModelList: [],
|
|
retrievalConfig: {
|
|
search_method: RETRIEVE_METHOD.semantic,
|
|
reranking_enable: false,
|
|
reranking_model: { reranking_provider_name: '', reranking_model_name: '' },
|
|
top_k: 3,
|
|
score_threshold_enabled: false,
|
|
score_threshold: 0,
|
|
},
|
|
})
|
|
expect(isValid).toBe(false)
|
|
})
|
|
})
|
|
|
|
describe('Full Flow: Segmentation State → Process Rule → Creation Params Consistency', () => {
|
|
it('should keep segmentation values consistent across getProcessRule and buildCreationParams', () => {
|
|
const files = [createMockFile()]
|
|
const { result: segResult } = renderHook(() => useSegmentationState())
|
|
const { result: creationResult } = renderHook(() =>
|
|
useDocumentCreation({
|
|
dataSourceType: DataSourceType.FILE,
|
|
files,
|
|
notionPages: [],
|
|
notionCredentialId: '',
|
|
websitePages: [],
|
|
}),
|
|
)
|
|
|
|
// Change segmentation settings
|
|
act(() => {
|
|
segResult.current.setMaxChunkLength(2048)
|
|
segResult.current.setOverlap(100)
|
|
})
|
|
|
|
const processRule = segResult.current.getProcessRule(ChunkingMode.text)
|
|
expect(processRule.rules.segmentation.max_tokens).toBe(2048)
|
|
expect(processRule.rules.segmentation.chunk_overlap).toBe(100)
|
|
|
|
const params = creationResult.current.buildCreationParams(
|
|
ChunkingMode.text,
|
|
'Chinese',
|
|
processRule,
|
|
{
|
|
search_method: RETRIEVE_METHOD.semantic,
|
|
reranking_enable: false,
|
|
reranking_model: { reranking_provider_name: '', reranking_model_name: '' },
|
|
top_k: 3,
|
|
score_threshold_enabled: false,
|
|
score_threshold: 0,
|
|
},
|
|
{ provider: 'openai', model: 'text-embedding-ada-002' },
|
|
IndexingType.QUALIFIED,
|
|
)
|
|
|
|
expect(params).not.toBeNull()
|
|
expect(params!.process_rule.rules.segmentation.max_tokens).toBe(2048)
|
|
expect(params!.process_rule.rules.segmentation.chunk_overlap).toBe(100)
|
|
expect(params!.doc_language).toBe('Chinese')
|
|
})
|
|
|
|
it('should support parent-child mode through the full pipeline', () => {
|
|
const files = [createMockFile()]
|
|
const { result: segResult } = renderHook(() => useSegmentationState())
|
|
const { result: creationResult } = renderHook(() =>
|
|
useDocumentCreation({
|
|
dataSourceType: DataSourceType.FILE,
|
|
files,
|
|
notionPages: [],
|
|
notionCredentialId: '',
|
|
websitePages: [],
|
|
}),
|
|
)
|
|
|
|
const processRule = segResult.current.getProcessRule(ChunkingMode.parentChild)
|
|
const params = creationResult.current.buildCreationParams(
|
|
ChunkingMode.parentChild,
|
|
'English',
|
|
processRule,
|
|
{
|
|
search_method: RETRIEVE_METHOD.semantic,
|
|
reranking_enable: false,
|
|
reranking_model: { reranking_provider_name: '', reranking_model_name: '' },
|
|
top_k: 3,
|
|
score_threshold_enabled: false,
|
|
score_threshold: 0,
|
|
},
|
|
{ provider: 'openai', model: 'text-embedding-ada-002' },
|
|
IndexingType.QUALIFIED,
|
|
)
|
|
|
|
expect(params).not.toBeNull()
|
|
expect(params!.doc_form).toBe(ChunkingMode.parentChild)
|
|
expect(params!.process_rule.mode).toBe('hierarchical')
|
|
expect(params!.process_rule.rules.parent_mode).toBe('paragraph')
|
|
expect(params!.process_rule.rules.subchunk_segmentation).toBeDefined()
|
|
})
|
|
})
|
|
})
|