feat: knowledge pipeline (#25360)

Signed-off-by: -LAN- <laipz8200@outlook.com>
Co-authored-by: twwu <twwu@dify.ai>
Co-authored-by: crazywoola <100913391+crazywoola@users.noreply.github.com>
Co-authored-by: jyong <718720800@qq.com>
Co-authored-by: Wu Tianwei <30284043+WTW0313@users.noreply.github.com>
Co-authored-by: QuantumGhost <obelisk.reg+git@gmail.com>
Co-authored-by: lyzno1 <yuanyouhuilyz@gmail.com>
Co-authored-by: quicksand <quicksandzn@gmail.com>
Co-authored-by: Jyong <76649700+JohnJyong@users.noreply.github.com>
Co-authored-by: lyzno1 <92089059+lyzno1@users.noreply.github.com>
Co-authored-by: zxhlyh <jasonapring2015@outlook.com>
Co-authored-by: Yongtao Huang <yongtaoh2022@gmail.com>
Co-authored-by: autofix-ci[bot] <114827586+autofix-ci[bot]@users.noreply.github.com>
Co-authored-by: Joel <iamjoel007@gmail.com>
Co-authored-by: Copilot <175728472+Copilot@users.noreply.github.com>
Co-authored-by: nite-knite <nkCoding@gmail.com>
Co-authored-by: Hanqing Zhao <sherry9277@gmail.com>
Co-authored-by: gemini-code-assist[bot] <176961590+gemini-code-assist[bot]@users.noreply.github.com>
Co-authored-by: Harry <xh001x@hotmail.com>
This commit is contained in:
-LAN-
2025-09-18 12:49:10 +08:00
committed by GitHub
parent 7dadb33003
commit 85cda47c70
1772 changed files with 102407 additions and 31710 deletions

View File

@@ -1,9 +1,12 @@
import type { DataSourceNotionPage, DataSourceProvider } from './common'
import type { AppIconType, AppMode, RetrievalConfig } from '@/types/app'
import type { AppIconType, AppMode, RetrievalConfig, TransferMethod } from '@/types/app'
import type { Tag } from '@/app/components/base/tag-management/constant'
import type { IndexingType } from '@/app/components/datasets/create/step-two'
import type { MetadataFilteringVariableType } from '@/app/components/workflow/nodes/knowledge-retrieval/types'
import type { MetadataItemWithValue } from '@/app/components/datasets/metadata/types'
import { ExternalKnowledgeBase, General, ParentChild, Qa } from '@/app/components/base/icons/src/public/knowledge/dataset-card'
import { GeneralChunk, ParentChildChunk, QuestionAndAnswer } from '@/app/components/base/icons/src/vender/knowledge'
import type { DatasourceType } from './pipeline'
export enum DataSourceType {
FILE = 'upload_file',
@@ -21,6 +24,7 @@ export enum ChunkingMode {
text = 'text_model', // General text
qa = 'qa_model', // General QA
parentChild = 'hierarchical_model', // Parent-Child
// graph = 'graph', // todo: Graph RAG
}
export type MetadataInDoc = {
@@ -30,11 +34,18 @@ export type MetadataInDoc = {
name: string
}
/**
 * Icon metadata shared by datasets and pipeline templates.
 * NOTE(review): whether `icon` holds an emoji name or an image id appears to
 * depend on `icon_type` — confirm against AppIconType's variants.
 */
export type IconInfo = {
icon: string
icon_background?: string
icon_type: AppIconType
icon_url?: string
}
export type DataSet = {
id: string
name: string
icon: string
icon_background: string
indexing_status: DocumentIndexingStatus
icon_info: IconInfo
description: string
permission: DatasetPermission
data_source_type: DataSourceType
@@ -45,6 +56,8 @@ export type DataSet = {
app_count: number
doc_form: ChunkingMode
document_count: number
total_document_count: number
total_available_documents?: number
word_count: number
provider: string
embedding_model: string
@@ -67,6 +80,11 @@ export type DataSet = {
}
built_in_field_enabled: boolean
doc_metadata?: MetadataInDoc[]
keyword_number?: number
pipeline_id?: string
is_published?: boolean // Indicates if the pipeline is published
runtime_mode: 'rag_pipeline' | 'general'
enable_api: boolean
}
export type ExternalAPIItem = {
@@ -136,11 +154,22 @@ export type CrawlOptions = {
/** A single page returned by a website crawl. */
export type CrawlResultItem = {
title: string
// NOTE(review): both `markdown` and `content` are present — presumably
// `content` supersedes `markdown`; confirm which one consumers should read.
markdown: string
content: string
description: string
source_url: string
}
/** Full crawl result: pages plus the reported crawl duration. */
export type CrawlResult = {
data: CrawlResultItem[]
// Duration of the crawl; the API may return it as a number or a string.
time_consuming: number | string
}
/** UI state machine for the crawl flow. */
export enum CrawlStep {
init = 'init',
running = 'running',
finished = 'finished',
}
export type FileItem = {
fileID: string
file: CustomFile
@@ -159,6 +188,14 @@ export type FetchDatasetsParams = {
}
}
/**
 * Query params for fetching the dataset list.
 * NOTE(review): `initialPage` is camelCase while the other keys are
 * snake_case — verify this matches the actual API query parameter name.
 */
export type DatasetListRequest = {
initialPage: number
tag_ids?: string[]
limit: number
include_all?: boolean
keyword?: string
}
export type DataSetListResponse = {
data: DataSet[]
has_more: boolean
@@ -272,7 +309,7 @@ export const DisplayStatusList = [
export type DocumentDisplayStatus = typeof DisplayStatusList[number]
export type DataSourceInfo = {
export type LegacyDataSourceInfo = {
upload_file: {
id: string
name: string
@@ -288,18 +325,60 @@ export type DataSourceInfo = {
provider?: DataSourceProvider
job_id: string
url: string
credential_id?: string
}
/** Datasource payload for a locally uploaded file. */
export type LocalFileInfo = {
extension: string
mime_type: string
name: string
related_id: string
size: number
transfer_method: TransferMethod
url: string
}
/** Datasource payload for a crawled web page. */
export type WebsiteCrawlInfo = {
content: string
credential_id: string
description: string
source_url: string
title: string
}
/** Datasource payload for an online document (Notion-style page). */
export type OnlineDocumentInfo = {
credential_id: string
workspace_id: string
page: {
last_edited_time: string
page_icon: DataSourceNotionPage['page_icon']
page_id: string
page_name: string
parent_id: string
type: string
},
}
/**
 * Datasource payload for an online-drive item.
 * NOTE(review): `type` is a literal union here, while pipeline.ts declares an
 * OnlineDriveFileType enum that also includes 'bucket' — consider unifying.
 */
export type OnlineDriveInfo = {
bucket: string
credential_id: string
id: string
name: string
type: 'file' | 'folder'
}
export type DataSourceInfo = LegacyDataSourceInfo | LocalFileInfo | OnlineDocumentInfo | WebsiteCrawlInfo
export type InitialDocumentDetail = {
id: string
batch: string
position: number
dataset_id: string
data_source_type: DataSourceType
data_source_type: DataSourceType | DatasourceType
data_source_info: DataSourceInfo
dataset_process_rule_id: string
name: string
created_from: 'api' | 'web'
created_from: 'rag-pipeline' | 'api' | 'web'
created_by: string
created_at: number
indexing_status: DocumentIndexingStatus
@@ -313,7 +392,6 @@ export type InitialDocumentDetail = {
export type SimpleDocumentDetail = InitialDocumentDetail & {
enabled: boolean
word_count: number
is_qa: boolean // TODO waiting for backend to add this field
error?: string | null
archived: boolean
updated_at: number
@@ -338,7 +416,7 @@ export type DocumentListResponse = {
export type DocumentReq = {
original_document_id?: string
indexing_technique?: string
indexing_technique?: IndexingType
doc_form: ChunkingMode
doc_language: string
process_rule: ProcessRule
@@ -374,6 +452,7 @@ export type DataSource = {
/** Notion import request: which workspace, which pages, and the credential to use. */
export type NotionInfo = {
workspace_id: string
pages: DataSourceNotionPage[]
credential_id: string
}
export type NotionPage = {
page_id: string
@@ -688,3 +767,47 @@ export type BatchImportResponse = {
job_id: string
job_status: string
}
// Chunking-mode → icon component (variant with background), plus the
// 'external' knowledge-base card which is not a ChunkingMode.
export const DOC_FORM_ICON_WITH_BG: Record<ChunkingMode | 'external', React.ComponentType<{ className: string }>> = {
[ChunkingMode.text]: General,
[ChunkingMode.qa]: Qa,
[ChunkingMode.parentChild]: ParentChild,
// [ChunkingMode.graph]: Graph, // todo: Graph RAG
external: ExternalKnowledgeBase,
}
// Chunking-mode → plain icon component (no background variant).
export const DOC_FORM_ICON: Record<ChunkingMode.text | ChunkingMode.qa | ChunkingMode.parentChild, React.ComponentType<{ className: string }>> = {
[ChunkingMode.text]: GeneralChunk,
[ChunkingMode.qa]: QuestionAndAnswer,
[ChunkingMode.parentChild]: ParentChildChunk,
}
// Chunking-mode → i18n/display key.
export const DOC_FORM_TEXT: Record<ChunkingMode, string> = {
[ChunkingMode.text]: 'general',
[ChunkingMode.qa]: 'qa',
[ChunkingMode.parentChild]: 'parentChild',
// [ChunkingMode.graph]: 'graph', // todo: Graph RAG
}
/** Request body for creating a dataset, optionally seeded from DSL YAML. */
export type CreateDatasetReq = {
yaml_content?: string
}
/** Response returned after a dataset is created. */
export type CreateDatasetResponse = {
id: string
name: string
description: string
permission: DatasetPermission
indexing_technique: IndexingType
created_by: string
created_at: number
updated_by: string
updated_at: number
pipeline_id: string
dataset_id: string
}
/** Identifies one indexing batch within a dataset for status polling. */
export type IndexingStatusBatchRequest = {
datasetId: string
batchId: string
}

View File

@@ -285,8 +285,13 @@ export type WorkflowRunDetailResponse = {
viewport?: Viewport
}
inputs: string
inputs_truncated: boolean
status: 'running' | 'succeeded' | 'failed' | 'stopped'
outputs?: string
outputs_truncated: boolean
outputs_full_content?: {
download_url: string
}
error?: string
elapsed_time?: number
total_tokens?: number

301
web/models/pipeline.ts Normal file
View File

@@ -0,0 +1,301 @@
import type { Edge, EnvironmentVariable, Node, SupportUploadFileTypes } from '@/app/components/workflow/types'
import type { DSLImportMode, DSLImportStatus } from './app'
import type { ChunkingMode, DatasetPermission, DocumentIndexingStatus, FileIndexingEstimateResponse, IconInfo } from './datasets'
import type { Dependency } from '@/app/components/plugins/types'
import type { AppIconSelection } from '@/app/components/base/app-icon-picker'
import type { Viewport } from 'reactflow'
import type { TransferMethod } from '@/types/app'
import { BaseFieldType } from '@/app/components/base/form/form-scenarios/base/types'
import type { NodeRunResult } from '@/types/workflow'
/** Kinds of datasource a knowledge pipeline can ingest from. */
export enum DatasourceType {
localFile = 'local_file',
onlineDocument = 'online_document',
websiteCrawl = 'website_crawl',
onlineDrive = 'online_drive',
}
/** Filter for listing pipeline templates: built-in vs user-created. */
export type PipelineTemplateListParams = {
type: 'built-in' | 'customized'
}
/** Summary card for one pipeline template. */
export type PipelineTemplate = {
id: string
name: string
icon: IconInfo
description: string
position: number
chunk_structure: ChunkingMode
}
export type PipelineTemplateListResponse = {
pipeline_templates: PipelineTemplate[]
}
export type PipelineTemplateByIdRequest = {
template_id: string
type: 'built-in' | 'customized'
}
/** Full template detail, including its DSL and workflow graph. */
export type PipelineTemplateByIdResponse = {
id: string
name: string
icon_info: IconInfo
description: string
chunk_structure: ChunkingMode
export_data: string // DSL content
graph: {
nodes: Node[]
edges: Edge[]
viewport: Viewport
}
created_by: string
}
/** Form state for the create-pipeline dialog (client-side only). */
export type CreateFormData = {
name: string
appIcon: AppIconSelection
description: string
permission: DatasetPermission
selectedMemberIDs: string[]
}
export type UpdateTemplateInfoRequest = {
template_id: string
name: string
icon_info: IconInfo
description: string
}
export type UpdateTemplateInfoResponse = {
pipeline_id: string
name: string
icon: IconInfo
description: string
position: number
}
/** Only a status code is returned on template deletion. */
export type DeleteTemplateResponse = {
code: number
}
/** Exported template DSL, returned as raw YAML text. */
export type ExportTemplateDSLResponse = {
data: string
}
/** Import a pipeline from DSL: inline YAML, a URL, or onto an existing pipeline. */
export type ImportPipelineDSLRequest = {
mode: DSLImportMode
yaml_content?: string
yaml_url?: string
pipeline_id?: string
}
export type ImportPipelineDSLResponse = {
id: string
status: DSLImportStatus
pipeline_id: string
dataset_id: string
current_dsl_version: string
imported_dsl_version: string
}
/** Result of confirming a pending DSL import (same shape minus `id`, plus `error`). */
export type ImportPipelineDSLConfirmResponse = {
status: DSLImportStatus
pipeline_id: string
dataset_id: string
current_dsl_version: string
imported_dsl_version: string
error: string
}
/** Plugin dependencies referenced by the DSL but missing from the workspace. */
export type PipelineCheckDependenciesResponse = {
leaked_dependencies: Dependency[]
}
/** Input widget types available to pipeline user-input variables. */
export enum PipelineInputVarType {
textInput = 'text-input',
paragraph = 'paragraph',
select = 'select',
number = 'number',
singleFile = 'file',
multiFiles = 'file-list',
checkbox = 'checkbox',
}
// Maps each pipeline input type to the generic base-form field type that
// renders it. Exhaustive over PipelineInputVarType (enforced by the Record type).
export const VAR_TYPE_MAP: Record<PipelineInputVarType, BaseFieldType> = {
[PipelineInputVarType.textInput]: BaseFieldType.textInput,
[PipelineInputVarType.paragraph]: BaseFieldType.paragraph,
[PipelineInputVarType.select]: BaseFieldType.select,
[PipelineInputVarType.singleFile]: BaseFieldType.file,
[PipelineInputVarType.multiFiles]: BaseFieldType.fileList,
[PipelineInputVarType.number]: BaseFieldType.numberInput,
[PipelineInputVarType.checkbox]: BaseFieldType.checkbox,
}
/**
 * Declaration of one user-input variable exposed by a RAG pipeline.
 * File-related fields only apply when `type` is a file input.
 */
export type RAGPipelineVariable = {
belong_to_node_id: string // indicates belong to which node or 'shared'
type: PipelineInputVarType
label: string
variable: string
max_length?: number
default_value?: string
placeholder?: string
unit?: string
required: boolean
tooltips?: string
options?: string[]
allowed_file_upload_methods?: TransferMethod[]
allowed_file_types?: SupportUploadFileTypes[]
allowed_file_extensions?: string[]
}
/** Same variable shape without node ownership, for node-local forms. */
export type InputVar = Omit<RAGPipelineVariable, 'belong_to_node_id'>
export type RAGPipelineVariables = RAGPipelineVariable[]
/** Request to fetch the processing-stage input variables of one pipeline node. */
export type PipelineProcessingParamsRequest = {
pipeline_id: string
node_id: string
}
export type PipelineProcessingParamsResponse = {
variables: RAGPipelineVariables
}
export type PipelinePreProcessingParamsRequest = {
pipeline_id: string
node_id: string
}
export type PipelinePreProcessingParamsResponse = {
variables: RAGPipelineVariables
}
/** Detail of the currently published version of a pipeline's workflow. */
export type PublishedPipelineInfoResponse = {
id: string
graph: {
nodes: Node[]
edges: Edge[]
viewport: Viewport
}
created_at: number
created_by: {
id: string
name: string
email: string
}
// Hash of the published graph, used for change detection.
hash: string
updated_at: number
updated_by: {
id: string
name: string
email: string
},
environment_variables?: EnvironmentVariable[]
rag_pipeline_variables?: RAGPipelineVariables
version: string
marked_name: string
marked_comment: string
}
/**
 * Run a published pipeline against one or more datasource items.
 * `is_preview` selects a dry-run (estimate) instead of real ingestion.
 */
export type PublishedPipelineRunRequest = {
pipeline_id: string
inputs: Record<string, any>
start_node_id: string
datasource_type: DatasourceType
datasource_info_list: Array<Record<string, any>>
original_document_id?: string
is_preview: boolean
}
/** Result of a preview (dry) run of a published pipeline. */
export type PublishedPipelineRunPreviewResponse = {
// NOTE(review): `task_iod` looks like a typo of `task_id` — verify against
// the backend payload; renaming here would break existing readers, so only
// flagging it.
task_iod: string
workflow_run_id: string
data: {
id: string
status: string
created_at: number
elapsed_time: number
error: string
finished_at: number
outputs: FileIndexingEstimateResponse
total_steps: number
total_tokens: number
workflow_id: string
}
}
/** Result of a real (non-preview) run: the batch plus the created documents. */
export type PublishedPipelineRunResponse = {
batch: string
dataset: {
chunk_structure: ChunkingMode
description: string
id: string
name: string
}
documents: InitialDocumentDetail[]
}
/**
 * Per-document creation status for a pipeline run.
 * NOTE(review): shadows the `InitialDocumentDetail` exported from
 * ./datasets — consider renaming one to avoid import confusion.
 * NOTE(review): `enable` is presumably the backend's field name; confirm it
 * is not `enabled` as used elsewhere.
 */
export type InitialDocumentDetail = {
data_source_info: Record<string, any>
data_source_type: DatasourceType
enable: boolean
error: string
id: string
indexing_status: DocumentIndexingStatus
name: string
position: number
}
/** Identifies one document whose pipeline execution log is requested. */
export type PipelineExecutionLogRequest = {
dataset_id: string
document_id: string
}
/** Execution log entry: which datasource node ran and with what inputs. */
export type PipelineExecutionLogResponse = {
datasource_info: Record<string, any>
datasource_type: DatasourceType
input_data: Record<string, any>
datasource_node_id: string
}
/**
 * Request to preview an online document's content.
 * NOTE(review): mixed ID casing (`workspaceID`/`pageID` vs `pipelineId`/
 * `credentialId`) — worth normalizing if these are client-side params.
 */
export type OnlineDocumentPreviewRequest = {
workspaceID: string
pageID: string
pageType: string
pipelineId: string
datasourceNodeId: string
credentialId: string
}
export type OnlineDocumentPreviewResponse = {
content: string
}
/** Result of converting a legacy dataset into a RAG pipeline. */
export type ConversionResponse = {
pipeline_id: string
dataset_id: string
status: 'success' | 'failed'
}
/**
 * Entry kinds when browsing an online drive.
 * NOTE(review): datasets.ts OnlineDriveInfo uses the literal union
 * 'file' | 'folder' instead of this enum — consider unifying.
 */
export enum OnlineDriveFileType {
file = 'file',
folder = 'folder',
bucket = 'bucket',
}
/** One entry listed while browsing an online drive (size absent for folders/buckets). */
export type OnlineDriveFile = {
id: string
name: string
size?: number
type: OnlineDriveFileType
}
/** Request to single-step run one datasource node of a pipeline (debug/test run). */
export type DatasourceNodeSingleRunRequest = {
pipeline_id: string
start_node_id: string
start_node_title: string
datasource_type: DatasourceType
datasource_info: Record<string, any>
}
/** Single-run result; reuses the generic workflow node run result shape. */
export type DatasourceNodeSingleRunResponse = NodeRunResult