Skip to content

Commit

Permalink
fix(kb): issue of chunking (#34)
Browse files Browse the repository at this point in the history
Because

there are some bugs in chunk process
1. wrong chunk dest
2. kb collection invalid

This commit
fix them
  • Loading branch information
Yougigun committed Jul 9, 2024
1 parent 5409db3 commit 66307c7
Show file tree
Hide file tree
Showing 12 changed files with 227 additions and 107 deletions.
1 change: 1 addition & 0 deletions cmd/main/main.go
Original file line number Diff line number Diff line change
Expand Up @@ -43,6 +43,7 @@ import (
"github.com/instill-ai/artifact-backend/pkg/repository"
servicePkg "github.com/instill-ai/artifact-backend/pkg/service"
"github.com/instill-ai/artifact-backend/pkg/usage"
// "github.com/instill-ai/artifact-backend/pkg/worker"

grpcclient "github.com/instill-ai/artifact-backend/pkg/client/grpc"
httpclient "github.com/instill-ai/artifact-backend/pkg/client/http"
Expand Down
2 changes: 1 addition & 1 deletion config/config.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -22,7 +22,7 @@ database:
host: pg-sql
port: 5432
name: artifact
version: 5
version: 8
timezone: Etc/UTC
pool:
idleconnections: 5
Expand Down
24 changes: 15 additions & 9 deletions pkg/db/migration/000007_create_text_chunk_table.up.sql
Original file line number Diff line number Diff line change
@@ -1,28 +1,34 @@
BEGIN;

CREATE TABLE text_chunk (
uid UUID PRIMARY KEY DEFAULT gen_random_uuid(),
source_uid UUID NOT NULL,
source_table VARCHAR(255) NOT NULL,
start INT NOT NULL,
end INT NOT NULL,
start_pos INT NOT NULL,
end_pos INT NOT NULL,
content_dest VARCHAR(255) NOT NULL,
tokens INT NOT NULL,
retrievable BOOLEAN NOT NULL DEFAULT true,
order INT NOT NULL create_time TIMESTAMP NOT NULL DEFAULT CURRENT_TIMESTAMP,
update_time TIMESTAMP NOT NULL DEFAULT CURRENT_TIMESTAMP,
in_order INT NOT NULL,
create_time TIMESTAMP NOT NULL DEFAULT CURRENT_TIMESTAMP,
update_time TIMESTAMP NOT NULL DEFAULT CURRENT_TIMESTAMP
);

-- Create indexes
CREATE UNIQUE INDEX idx_unique_source_table_uid_start_end ON text_chunk (source_table, source_uid, start,end
);
CREATE UNIQUE INDEX idx_unique_source_table_uid_start_end ON text_chunk (source_table, source_uid, start_pos, end_pos);

-- Comments for the table and columns
COMMENT ON TABLE text_chunk IS 'Table to store text chunks with metadata';
COMMENT ON COLUMN text_chunk.uid IS 'Unique identifier for the text chunk';
COMMENT ON COLUMN text_chunk.source_uid IS 'Source unique identifier, references source table''s uid field';
COMMENT ON COLUMN text_chunk.source_table IS 'Name of the source table';
COMMENT ON COLUMN text_chunk.start IS 'Start position of the text chunk';
COMMENT ON COLUMN text_chunk.
end IS 'End position of the text chunk';
COMMENT ON COLUMN text_chunk.start_pos IS 'Start position of the text chunk';
COMMENT ON COLUMN text_chunk.end_pos IS 'End position of the text chunk';
COMMENT ON COLUMN text_chunk.content_dest IS 'dest of the text chunk''s content in file store';
COMMENT ON COLUMN text_chunk.tokens IS 'Number of tokens in the text chunk';
COMMENT ON COLUMN text_chunk.retrievable IS 'Flag indicating if the chunk is retrievable';
COMMENT ON COLUMN text_chunk.in_order IS 'Order of the text chunk';
COMMENT ON COLUMN text_chunk.create_time IS 'Timestamp when the record was created';
COMMENT ON COLUMN text_chunk.update_time IS 'Timestamp when the record was last updated';

COMMIT;
3 changes: 2 additions & 1 deletion pkg/db/migration/000008_create_embedding_table.up.sql
Original file line number Diff line number Diff line change
Expand Up @@ -5,6 +5,7 @@ CREATE TABLE embedding (
source_uid UUID NOT NULL,
source_table VARCHAR(255) NOT NULL,
vector JSONB NOT NULL,
collection VARCHAR(255) NOT NULL,
create_time TIMESTAMP NOT NULL DEFAULT CURRENT_TIMESTAMP,
update_time TIMESTAMP NOT NULL DEFAULT CURRENT_TIMESTAMP
);
Expand All @@ -15,7 +16,7 @@ COMMENT ON TABLE embedding IS 'Table to store embeddings with metadata';
COMMENT ON COLUMN embedding.uid IS 'Unique identifier for the embedding';
COMMENT ON COLUMN embedding.source_uid IS 'Source unique identifier, references source table''s uid field';
COMMENT ON COLUMN embedding.source_table IS 'Name of the source table';
COMMENT ON COLUMN embedding.embedding_dest IS 'Destination of the embedding''s content in vector store';
COMMENT ON COLUMN embedding.collection IS 'Destination of the embedding''s content in vector store';
COMMENT ON COLUMN embedding.create_time IS 'Timestamp when the record was created';
COMMENT ON COLUMN embedding.update_time IS 'Timestamp when the record was last updated';
COMMIT;
31 changes: 14 additions & 17 deletions pkg/handler/knowledgebasefiles.go
Original file line number Diff line number Diff line change
Expand Up @@ -27,6 +27,10 @@ func (ph *PublicHandler) UploadKnowledgeBaseFile(ctx context.Context, req *artif
return nil, err
}

if strings.Contains(req.File.Name, "/") {
return nil, fmt.Errorf("file name cannot contain '/'. err: %w", customerror.ErrInvalidArgument)
}

// TODO: ACL - check if the creator can upload file to this knowledge base. ACL.
// .....

Expand All @@ -43,21 +47,12 @@ func (ph *PublicHandler) UploadKnowledgeBaseFile(ctx context.Context, req *artif

// upload file to minio
var kb *repository.KnowledgeBase
var filePathName string
{
kb, err = ph.service.Repository.GetKnowledgeBaseByOwnerAndID(ctx, ownerUID, req.KbId)
if err != nil {
return nil, fmt.Errorf("failed to get knowledge base by owner and id. err: %w", err)
}
// check if the name has "/" which may cause folder creation in minio
if strings.Contains(req.File.Name, "/") {
return nil, fmt.Errorf("file name cannot contain '/'. err: %w", customerror.ErrInvalidArgument)
}
filePathName = kb.UID.String() + "/" + req.File.Name
err = ph.service.MinIO.UploadBase64File(ctx, filePathName, req.File.Content, fileTypeConvertToMime(req.File.Type))
if err != nil {
return nil, err
}

}

// create metadata in db
Expand All @@ -74,23 +69,25 @@ func (ph *PublicHandler) UploadKnowledgeBaseFile(ctx context.Context, req *artif
log.Error("failed to parse owner uid", zap.Error(err))
return nil, err
}
destination := ph.service.MinIO.GetUploadedFilePathInKnowledgeBase(kb.UID.String(), req.File.Name)
kbFile := repository.KnowledgeBaseFile{
Name: req.File.Name,
Type: artifactpb.FileType_name[int32(req.File.Type)],
Owner: ownerUIDUuid,
CreatorUID: creatorUID,
KnowledgeBaseUID: kb.UID,
Destination: filePathName,
Destination: destination,
ProcessStatus: artifactpb.FileProcessStatus_name[int32(artifactpb.FileProcessStatus_FILE_PROCESS_STATUS_NOTSTARTED)],
}
res, err = ph.service.Repository.CreateKnowledgeBaseFile(ctx, kbFile)
if err != nil {
err := ph.service.MinIO.DeleteFile(ctx, filePathName)
res, err = ph.service.Repository.CreateKnowledgeBaseFile(ctx, kbFile, func(FileUID string) error {
// upload file to minio
err = ph.service.MinIO.UploadBase64File(ctx, destination, req.File.Content, fileTypeConvertToMime(req.File.Type))
if err != nil {
log.Error("failed to delete file in minio", zap.Error(err))
return err
}
return nil, err
}

return nil
})
}

return &artifactpb.UploadKnowledgeBaseFileResponse{
Expand Down
13 changes: 9 additions & 4 deletions pkg/milvus/milvus.go
Original file line number Diff line number Diff line change
Expand Up @@ -21,6 +21,8 @@ type MilvusClientI interface {
ListEmbeddings(ctx context.Context, collectionName string) ([]Embedding, error)
SearchEmbeddings(ctx context.Context, collectionName string, vectors [][]float32, topK int) ([][]Embedding, error)
DeleteEmbedding(ctx context.Context, collectionName string, embeddingUID []string) error
// GetKnowledgeBaseCollectionName returns the collection name for a knowledge base
GetKnowledgeBaseCollectionName(kbUID string) string
Close()
}

Expand Down Expand Up @@ -81,7 +83,7 @@ func (m *MilvusClient) GetHealth(ctx context.Context) (bool, error) {
// CreateKnowledgeBaseCollection
func (m *MilvusClient) CreateKnowledgeBaseCollection(ctx context.Context, kbUID string) error {
logger, _ := logger.GetZapLogger(ctx)
collectionName := getKnowledgeBaseCollectionName(kbUID)
collectionName := m.GetKnowledgeBaseCollectionName(kbUID)

// 1. Check if the collection already exists
has, err := m.c.HasCollection(ctx, collectionName)
Expand Down Expand Up @@ -129,7 +131,7 @@ func (m *MilvusClient) CreateKnowledgeBaseCollection(ctx context.Context, kbUID

// InsertVectorsToKnowledgeBaseCollection
func (m *MilvusClient) InsertVectorsToKnowledgeBaseCollection(ctx context.Context, kbUID string, embeddings []Embedding) error {
collectionName := getKnowledgeBaseCollectionName(kbUID)
collectionName := m.GetKnowledgeBaseCollectionName(kbUID)

// Check if the collection exists
has, err := m.c.HasCollection(ctx, collectionName)
Expand Down Expand Up @@ -380,7 +382,10 @@ func (m *MilvusClient) Close() {

const kbCollectionPrefix = "kb_"

// getKnowledgeBaseCollectionName
func getKnowledgeBaseCollectionName(kbUID string) string {
// GetKnowledgeBaseCollectionName returns the collection name for a knowledge base
func (m *MilvusClient) GetKnowledgeBaseCollectionName(kbUID string) string {
// collection name can only contain numbers, letters and underscores: invalid parameter
// turn kbUID(uuid) into a valid collection name
kbUID = strings.ReplaceAll(kbUID, "-", "_")
return kbCollectionPrefix + kbUID
}
22 changes: 20 additions & 2 deletions pkg/minio/knowledgebase.go
Original file line number Diff line number Diff line change
Expand Up @@ -13,11 +13,17 @@ type KnowledgeBaseI interface {
SaveConvertedFile(ctx context.Context, kbUID, convertedFileUID, fileExt string, content []byte) error
// SaveChunks saves batch of chunks(text files) to MinIO.
SaveChunks(ctx context.Context, kbUID string, chunks map[ChunkUIDType]ChunkContentType) error
// GetUploadedFilePathInKnowledgeBase returns the path of the uploaded file in MinIO.
GetUploadedFilePathInKnowledgeBase(kbUID, dest string) string
// GetConvertedFilePathInKnowledgeBase returns the path of the converted file in MinIO.
GetConvertedFilePathInKnowledgeBase(kbUID, ConvertedFileUID, fileExt string) string
// GetChunkPathInKnowledgeBase returns the path of the chunk in MinIO.
GetChunkPathInKnowledgeBase(kbUID, chunkUID string) string
}

// SaveConvertedFile saves a converted file to MinIO with the appropriate MIME type.
func (m *Minio) SaveConvertedFile(ctx context.Context, kbUID, convertedFileUID, fileExt string, content []byte) error {
filePathName := GetConvertedFilePathInKnowledgeBase(kbUID, convertedFileUID, fileExt)
filePathName := m.GetConvertedFilePathInKnowledgeBase(kbUID, convertedFileUID, fileExt)
mimeType := "application/octet-stream"
if fileExt == "md" {
mimeType = "text/markdown"
Expand All @@ -41,7 +47,7 @@ func (m *Minio) SaveChunks(ctx context.Context, kbUID string, chunks map[ChunkUI
wg.Add(1)
go func(chunkUID ChunkUIDType, chunkContent ChunkContentType) {
defer wg.Done()
filePathName := GetChunkPathInKnowledgeBase(kbUID, string(chunkUID))
filePathName := m.GetChunkPathInKnowledgeBase(kbUID, string(chunkUID))

err := m.UploadBase64File(ctx, filePathName, base64.StdEncoding.EncodeToString(chunkContent), "text/plain")
if err != nil {
Expand All @@ -61,3 +67,15 @@ func (m *Minio) SaveChunks(ctx context.Context, kbUID string, chunks map[ChunkUI
}
return nil
}

func (m *Minio) GetUploadedFilePathInKnowledgeBase(kbUID, dest string) string {
return kbUID + "/uploaded-file/" + dest
}

func (m *Minio) GetConvertedFilePathInKnowledgeBase(kbUID, ConvertedFileUID, fileExt string) string {
return kbUID + "/converted-file/" + ConvertedFileUID + "." + fileExt
}

func (m *Minio) GetChunkPathInKnowledgeBase(kbUID, chunkUID string) string {
return kbUID + "/chunk/" + chunkUID + ".txt"
}
Loading

0 comments on commit 66307c7

Please sign in to comment.