Skip to content

Commit

Permalink
Merge pull request #864 from bjwswang/main
Browse files Browse the repository at this point in the history
fix: add batchsize in embedding options
  • Loading branch information
bjwswang authored Mar 15, 2024
2 parents 9fd5da9 + 7db85f5 commit c1a50c1
Show file tree
Hide file tree
Showing 11 changed files with 144 additions and 15 deletions.
10 changes: 6 additions & 4 deletions api/base/v1alpha1/knowledgebase.go
Original file line number Diff line number Diff line change
Expand Up @@ -9,20 +9,22 @@ import (
const (
// UpdateSourceFileAnnotationKey is the key of the update source file annotation
UpdateSourceFileAnnotationKey = Group + "/update-source-file-time"
// NOTE(review): DefaultChunkSize and DefaultChunkOverlap are each listed twice below
// (1024/300 and 100/10). This looks like the removed and added lines of a diff rendered
// without +/- markers — confirm which pair is current before relying on either value.
// NOTE(review): the +kubebuilder:default marker on EmbeddingOptions.ChunkOverlap reads 30,
// while the newer DefaultChunkOverlap here is 10 — confirm which default is intended.

// DefaultChunkSize is the value EmbeddingOptions() substitutes when the spec's ChunkSize is 0.
DefaultChunkSize = 1024
// DefaultChunkOverlap is the value EmbeddingOptions() substitutes when the spec's ChunkOverlap is nil.
DefaultChunkOverlap = 100
DefaultChunkSize = 300
DefaultChunkOverlap = 10
// DefaultBatchSize is the value EmbeddingOptions() substitutes when the spec's BatchSize is 0.
DefaultBatchSize = 10
)

// EmbeddingOptions returns a copy of the knowledge base's embedding options
// in which every unset field has been replaced by its package-level default:
//
//   - a zero ChunkSize becomes DefaultChunkSize,
//   - a nil ChunkOverlap becomes a pointer to DefaultChunkOverlap,
//   - a zero BatchSize becomes DefaultBatchSize.
//
// The defaults are applied to the returned copy only; no field of kb.Spec is
// reassigned by this method.
func (kb *KnowledgeBase) EmbeddingOptions() EmbeddingOptions {
	resolved := kb.Spec.EmbeddingOptions

	if resolved.ChunkSize == 0 {
		resolved.ChunkSize = DefaultChunkSize
	}
	if resolved.BatchSize == 0 {
		resolved.BatchSize = DefaultBatchSize
	}
	// ChunkOverlap is a pointer so that an explicit 0 can be told apart from "not set".
	if resolved.ChunkOverlap == nil {
		resolved.ChunkOverlap = pointer.IntPtr(DefaultChunkOverlap)
	}

	return resolved
}

Expand Down
7 changes: 5 additions & 2 deletions api/base/v1alpha1/knowledgebase_types.go
Original file line number Diff line number Diff line change
Expand Up @@ -39,11 +39,14 @@ type KnowledgeBaseSpec struct {

// NOTE(review): ChunkSize and ChunkOverlap each carry two +kubebuilder:default markers
// below (1024/300 and 100/30). This looks like the removed and added lines of a diff
// rendered without +/- markers — confirm which marker is current.
// NOTE(review): the newer ChunkOverlap marker (30) differs from the DefaultChunkOverlap
// constant (10) used by the Go-side defaulting — confirm which value is intended.

// EmbeddingOptions groups the ChunkSize, ChunkOverlap and BatchSize settings of a
// knowledge base. Every field is optional in JSON (omitempty) and carries a
// +kubebuilder:default marker; ChunkOverlap is a pointer, so nil and 0 are distinct.
type EmbeddingOptions struct {
// ChunkSize for text splitter
// +kubebuilder:default=1024
// +kubebuilder:default=300
ChunkSize int `json:"chunkSize,omitempty"`
// ChunkOverlap for text splitter
// +kubebuilder:default=100
// +kubebuilder:default=30
ChunkOverlap *int `json:"chunkOverlap,omitempty"`
// BatchSize for text splitter
// +kubebuilder:default=10
BatchSize int `json:"batchSize,omitempty"`
}

type FileGroupDetail struct {
Expand Down
87 changes: 85 additions & 2 deletions apiserver/graph/generated/generated.go

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

6 changes: 6 additions & 0 deletions apiserver/graph/generated/models_gen.go

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

4 changes: 4 additions & 0 deletions apiserver/graph/schema/knowledgebase.gql
Original file line number Diff line number Diff line change
Expand Up @@ -17,6 +17,7 @@ query listKnowledgeBases($input: ListKnowledgeBaseInput!){
description
chunkSize
chunkOverlap
batchSize
status
reason
message
Expand Down Expand Up @@ -67,6 +68,7 @@ query getKnowledgeBase($name: String!, $namespace: String!) {
description
chunkSize
chunkOverlap
batchSize
status
reason
message
Expand Down Expand Up @@ -115,6 +117,7 @@ mutation createKnowledgeBase($input: CreateKnowledgeBaseInput!) {
description
chunkSize
chunkOverlap
batchSize
status
reason
message
Expand Down Expand Up @@ -163,6 +166,7 @@ mutation updateKnowledgeBase($input: UpdateKnowledgeBaseInput) {
description
chunkSize
chunkOverlap
batchSize
status
reason
message
Expand Down
12 changes: 12 additions & 0 deletions apiserver/graph/schema/knowledgebase.graphqls
Original file line number Diff line number Diff line change
Expand Up @@ -135,6 +135,10 @@ type KnowledgeBase {
chunkOverlap为知识库作文档拆分时相邻块的交集
"""
chunkOverlap: Int
"""
batchSize为知识库做批量处理时的批次大小
"""
batchSize: Int

"""
知识库整体连接状态
Expand Down Expand Up @@ -192,6 +196,10 @@ input CreateKnowledgeBaseInput{
chunkOverlap为知识库作文档拆分时相邻块的交集
"""
chunkOverlap: Int
"""
batchSize为知识库做批量处理时的批次大小
"""
batchSize: Int
}

"""知识库更新的输入"""
Expand Down Expand Up @@ -222,6 +230,10 @@ input UpdateKnowledgeBaseInput {
chunkOverlap为知识库作文档拆分时相邻块的交集
"""
chunkOverlap: Int
"""
batchSize为知识库做批量处理时的批次大小
"""
batchSize: Int
}

"""知识库分页列表查询的输入"""
Expand Down
10 changes: 10 additions & 0 deletions apiserver/pkg/knowledgebase/knowledgebase.go
Original file line number Diff line number Diff line change
Expand Up @@ -128,6 +128,8 @@ func knowledgebase2model(ctx context.Context, c client.Client, knowledgebase *v1
FileGroupDetails: filegroupdetails,
ChunkSize: &embeddingOptions.ChunkSize,
ChunkOverlap: embeddingOptions.ChunkOverlap,
BatchSize: &embeddingOptions.BatchSize,

// Status info
Status: &status,
Reason: &reason,
Expand Down Expand Up @@ -174,6 +176,10 @@ func CreateKnowledgeBase(ctx context.Context, c client.Client, input generated.C
if input.ChunkOverlap != nil {
chunkOverlap = input.ChunkOverlap
}
batchSize := v1alpha1.DefaultBatchSize
if input.BatchSize != nil {
batchSize = *input.BatchSize
}

knowledgebase := &v1alpha1.KnowledgeBase{
ObjectMeta: metav1.ObjectMeta{
Expand All @@ -195,6 +201,7 @@ func CreateKnowledgeBase(ctx context.Context, c client.Client, input generated.C
EmbeddingOptions: v1alpha1.EmbeddingOptions{
ChunkSize: chunkSize,
ChunkOverlap: chunkOverlap,
BatchSize: batchSize,
},
},
}
Expand Down Expand Up @@ -267,6 +274,9 @@ func UpdateKnowledgeBase(ctx context.Context, c client.Client, input *generated.
if input.ChunkOverlap != nil {
kb.Spec.ChunkOverlap = input.ChunkOverlap
}
if input.BatchSize != nil {
kb.Spec.BatchSize = *input.BatchSize
}

err = c.Update(ctx, kb)
if err != nil {
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -39,12 +39,16 @@ spec:
spec:
description: KnowledgeBaseSpec defines the desired state of KnowledgeBase
properties:
batchSize:
default: 10
description: BatchSize for text splitter
type: integer
chunkOverlap:
default: 100
default: 30
description: ChunkOverlap for text splitter
type: integer
chunkSize:
default: 1024
default: 300
description: ChunkSize for text splitter
type: integer
creator:
Expand Down
Loading

0 comments on commit c1a50c1

Please sign in to comment.