Skip to content

Commit

Permalink
feat(kb): add file metadata (#37)
Browse files Browse the repository at this point in the history
Because

file catalog need to show file size, tokens #, and chunks #.

This commit

add those info in file list api
  • Loading branch information
Yougigun committed Jul 12, 2024
1 parent 8eccb43 commit 51113ce
Show file tree
Hide file tree
Showing 10 changed files with 946 additions and 11 deletions.
12 changes: 5 additions & 7 deletions cmd/main/main.go
Original file line number Diff line number Diff line change
Expand Up @@ -43,7 +43,7 @@ import (
"github.com/instill-ai/artifact-backend/pkg/repository"
servicePkg "github.com/instill-ai/artifact-backend/pkg/service"
"github.com/instill-ai/artifact-backend/pkg/usage"
// "github.com/instill-ai/artifact-backend/pkg/worker"
"github.com/instill-ai/artifact-backend/pkg/worker"

grpcclient "github.com/instill-ai/artifact-backend/pkg/client/grpc"
httpclient "github.com/instill-ai/artifact-backend/pkg/client/http"
Expand Down Expand Up @@ -167,10 +167,9 @@ func main() {
}),
)

// file-to-embeddings worker pool
// TODO: uncomment when file-to-embeddings worker is tested
// wp := worker.NewFileToEmbWorkerPool(ctx, service, config.Config.FileToEmbeddingWorker.NumberOfWorkers)
// wp.Start(ctx)
// activate file-to-embeddings worker pool
wp := worker.NewFileToEmbWorkerPool(ctx, service, config.Config.FileToEmbeddingWorker.NumberOfWorkers)
wp.Start(ctx)

// Start usage reporter
var usg usage.Usage
Expand Down Expand Up @@ -263,8 +262,7 @@ func main() {
// }
logger.Info("Shutting down server...")
publicGrpcS.GracefulStop()
// TODO: uncomment when file-to-embeddings worker is tested
// wp.GraceFulStop()
wp.GraceFulStop()
}
}

Expand Down
2 changes: 1 addition & 1 deletion config/config.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -22,7 +22,7 @@ database:
host: pg-sql
port: 5432
name: artifact
version: 9
version: 10
timezone: Etc/UTC
pool:
idleconnections: 5
Expand Down
2 changes: 1 addition & 1 deletion go.mod
Original file line number Diff line number Diff line change
Expand Up @@ -15,7 +15,7 @@ require (
github.com/grpc-ecosystem/go-grpc-middleware v1.3.0
github.com/grpc-ecosystem/grpc-gateway/v2 v2.19.1
github.com/influxdata/influxdb-client-go/v2 v2.12.3
github.com/instill-ai/protogen-go v0.3.3-alpha.0.20240711114323-e16b79bfb545
github.com/instill-ai/protogen-go v0.3.3-alpha.0.20240711150749-e0fe2126f1cf
github.com/instill-ai/usage-client v0.3.0-alpha.0.20240319060111-4a3a39f2fd61
github.com/instill-ai/x v0.3.0-alpha.0.20231219052200-6230a89e386c
github.com/knadh/koanf v1.5.0
Expand Down
4 changes: 2 additions & 2 deletions go.sum
Original file line number Diff line number Diff line change
Expand Up @@ -342,8 +342,8 @@ github.com/influxdata/influxdb-client-go/v2 v2.12.3 h1:28nRlNMRIV4QbtIUvxhWqaxn0
github.com/influxdata/influxdb-client-go/v2 v2.12.3/go.mod h1:IrrLUbCjjfkmRuaCiGQg4m2GbkaeJDcuWoxiWdQEbA0=
github.com/influxdata/line-protocol v0.0.0-20200327222509-2487e7298839 h1:W9WBk7wlPfJLvMCdtV4zPulc4uCPrlywQOmbFOhgQNU=
github.com/influxdata/line-protocol v0.0.0-20200327222509-2487e7298839/go.mod h1:xaLFMmpvUxqXtVkUJfg9QmT88cDaCJ3ZKgdZ78oO8Qo=
github.com/instill-ai/protogen-go v0.3.3-alpha.0.20240711114323-e16b79bfb545 h1:W79+lcBA8DyqGo2gnDSh9pCwyru0oGWEjQxCQf6DqVs=
github.com/instill-ai/protogen-go v0.3.3-alpha.0.20240711114323-e16b79bfb545/go.mod h1:2blmpUwiTwxIDnrjIqT6FhR5ewshZZF554wzjXFvKpQ=
github.com/instill-ai/protogen-go v0.3.3-alpha.0.20240711150749-e0fe2126f1cf h1:LjGpUAK8zDoQW0hh+5B55qCQopmsi9W/pxI/xgTFFDs=
github.com/instill-ai/protogen-go v0.3.3-alpha.0.20240711150749-e0fe2126f1cf/go.mod h1:2blmpUwiTwxIDnrjIqT6FhR5ewshZZF554wzjXFvKpQ=
github.com/instill-ai/usage-client v0.3.0-alpha.0.20240319060111-4a3a39f2fd61 h1:smPTvmXDhn/QC7y/TPXyMTqbbRd0gvzmFgWBChwTfhE=
github.com/instill-ai/usage-client v0.3.0-alpha.0.20240319060111-4a3a39f2fd61/go.mod h1:/TAHs4ybuylk5icuy+MQtHRc4XUnIyXzeNKxX9qDFhw=
github.com/instill-ai/x v0.3.0-alpha.0.20231219052200-6230a89e386c h1:a2RVkpIV2QcrGnSHAou+t/L+vBsaIfFvk5inVg5Uh4s=
Expand Down
Original file line number Diff line number Diff line change
@@ -0,0 +1,9 @@
BEGIN;

-- Drop the index (if it was created in the up migration)
DROP INDEX IF EXISTS idx_knowledge_base_file_size;

-- Remove the column
ALTER TABLE knowledge_base_file DROP COLUMN IF EXISTS size;

COMMIT;
10 changes: 10 additions & 0 deletions pkg/db/migration/000010_add_file_size__in_kb_file_table.up.sql
Original file line number Diff line number Diff line change
@@ -0,0 +1,10 @@
BEGIN;

-- Add the new column 'size'
ALTER TABLE knowledge_base_file ADD COLUMN size BIGINT;

-- Add a comment for the new column
COMMENT ON COLUMN knowledge_base_file.size IS 'Size of the file in bytes';


COMMIT;
110 changes: 110 additions & 0 deletions pkg/handler/knowledgebasefiles.go
Original file line number Diff line number Diff line change
Expand Up @@ -2,6 +2,7 @@ package handler

import (
"context"
"errors"
"fmt"
"strings"

Expand All @@ -13,6 +14,7 @@ import (
mgmtpb "github.com/instill-ai/protogen-go/core/mgmt/v1beta"
"go.uber.org/zap"
"google.golang.org/protobuf/types/known/timestamppb"
"gorm.io/gorm"
)

func (ph *PublicHandler) UploadKnowledgeBaseFile(ctx context.Context, req *artifactpb.UploadKnowledgeBaseFileRequest) (*artifactpb.UploadKnowledgeBaseFileResponse, error) {
Expand Down Expand Up @@ -69,6 +71,7 @@ func (ph *PublicHandler) UploadKnowledgeBaseFile(ctx context.Context, req *artif
log.Error("failed to parse owner uid", zap.Error(err))
return nil, err
}
fileSize, _ := getFileSize(req.File.Content)
destination := ph.service.MinIO.GetUploadedFilePathInKnowledgeBase(kb.UID.String(), req.File.Name)
kbFile := repository.KnowledgeBaseFile{
Name: req.File.Name,
Expand All @@ -78,6 +81,7 @@ func (ph *PublicHandler) UploadKnowledgeBaseFile(ctx context.Context, req *artif
KnowledgeBaseUID: kb.UID,
Destination: destination,
ProcessStatus: artifactpb.FileProcessStatus_name[int32(artifactpb.FileProcessStatus_FILE_PROCESS_STATUS_NOTSTARTED)],
Size: fileSize,
}
res, err = ph.service.Repository.CreateKnowledgeBaseFile(ctx, kbFile, func(FileUID string) error {
// upload file to minio
Expand Down Expand Up @@ -105,10 +109,44 @@ func (ph *PublicHandler) UploadKnowledgeBaseFile(ctx context.Context, req *artif
CreateTime: timestamppb.New(*res.CreateTime),
UpdateTime: timestamppb.New(*res.UpdateTime),
ProcessStatus: artifactpb.FileProcessStatus_FILE_PROCESS_STATUS_NOTSTARTED,
Size: res.Size,
TotalChunks: 0,
TotalTokens: 0,
},
}, nil
}

// getFileSize returns the size of the file in bytes and a human-readable string
func getFileSize(base64String string) (int64, string) {
// Get the length of the base64 string
base64Length := len(base64String)

// Calculate the size of the decoded data
// The actual size is approximately 3/4 of the base64 string length
decodedSize := base64Length / 4 * 3

// Remove padding characters
if base64String[base64Length-1] == '=' {
decodedSize--
if base64String[base64Length-2] == '=' {
decodedSize--
}
}

// Convert to appropriate unit
const unit = 1024
if decodedSize < unit {
return int64(decodedSize), fmt.Sprintf("%d B", decodedSize)
}
div, exp := int64(unit), 0
for n := int64(decodedSize) / unit; n >= unit; n /= unit {
div *= unit
exp++
}
size := float64(decodedSize) / float64(div)
return int64(decodedSize), fmt.Sprintf("%.1f %cB", size, "KMGTPE"[exp])
}

func checkUploadKnowledgeBaseFileRequest(req *artifactpb.UploadKnowledgeBaseFileRequest) error {
if req.OwnerId == "" {
return fmt.Errorf("owner uid is required. err: %w", ErrCheckRequiredFields)
Expand Down Expand Up @@ -188,6 +226,24 @@ func (ph *PublicHandler) ListKnowledgeBaseFiles(ctx context.Context, req *artifa
log.Error("failed to list knowledge base files", zap.Error(err))
return nil, err
}
// get the tokens and chunks using the source table and source uid
sources, err := ph.findSourceTableAndSourceUIDByFileUID(ctx, kbFiles)
if err != nil {
log.Error("failed to find source table and source uid by file uid", zap.Error(err))
return nil, err
}
// TODO need test from file upload to file to embeded text then check the total tokens
totalTokens, err := ph.service.Repository.GetFilesTotalTokens(ctx, sources)
if err != nil {
log.Error("failed to get files total tokens", zap.Error(err))
return nil, err
}
// TODO get total chunks
totalChunks, err := ph.service.Repository.GetTotalChunksBySources(ctx, sources)
if err != nil {
log.Error("failed to get files total chunks", zap.Error(err))
return nil, err
}
totalSize = size
nextPageToken = nextToken
for _, kbFile := range kbFiles {
Expand All @@ -201,6 +257,9 @@ func (ph *PublicHandler) ListKnowledgeBaseFiles(ctx context.Context, req *artifa
CreateTime: timestamppb.New(*kbFile.CreateTime),
UpdateTime: timestamppb.New(*kbFile.UpdateTime),
ProcessStatus: artifactpb.FileProcessStatus(artifactpb.FileProcessStatus_value[kbFile.ProcessStatus]),
Size: kbFile.Size,
TotalChunks: int32(totalChunks[kbFile.UID]),
TotalTokens: int32(totalTokens[kbFile.UID]),
})
}
}
Expand All @@ -213,6 +272,52 @@ func (ph *PublicHandler) ListKnowledgeBaseFiles(ctx context.Context, req *artifa
}, nil
}

// findSourceTableAndSourceUIDByFiles find the source table and source uid by file uid.
func (ph *PublicHandler) findSourceTableAndSourceUIDByFileUID(ctx context.Context, files []repository.KnowledgeBaseFile) (
map[uuid.UUID]struct {
SourceTable string
SourceUID uuid.UUID
}, error) {
result := make(map[uuid.UUID]struct {
SourceTable string
SourceUID uuid.UUID
})
logger, _ := logger.GetZapLogger(ctx)
for _, file := range files {
// find the source table and source uid by file uid
// check if the file is is text or markdown
switch file.Type {
case artifactpb.FileType_FILE_TYPE_TEXT.String(), artifactpb.FileType_FILE_TYPE_MARKDOWN.String():
result[file.UID] = struct {
SourceTable string
SourceUID uuid.UUID
}{
SourceTable: ph.service.Repository.KnowledgeBaseFileTableName(),
SourceUID: file.UID,
}
case artifactpb.FileType_FILE_TYPE_PDF.String():
convertedFile, err := ph.service.Repository.GetConvertedFileByFileUID(ctx, file.UID)
if err != nil {
if errors.Is(err, gorm.ErrRecordNotFound) {
continue
} else {
logger.Error("failed to get converted file by file uid", zap.Error(err))
return nil, err
}
}
result[file.UID] = struct {
SourceTable string
SourceUID uuid.UUID
}{
SourceTable: ph.service.Repository.ConvertedFileTableName(),
SourceUID: convertedFile.UID,
}
}
}

return result, nil
}

func (ph *PublicHandler) DeleteKnowledgeBaseFile(
ctx context.Context,
req *artifactpb.DeleteKnowledgeBaseFileRequest) (
Expand All @@ -226,6 +331,11 @@ func (ph *PublicHandler) DeleteKnowledgeBaseFile(

// TODO: ACL - check if the uid can delete file. ACL.

// check if file uid is empty
if req.FileUid == "" {
return nil, fmt.Errorf("file uid is required. err: %w", customerror.ErrInvalidArgument)
}

err := ph.service.Repository.DeleteKnowledgeBaseFile(ctx, req.FileUid)
if err != nil {
return nil, err
Expand Down
Loading

0 comments on commit 51113ce

Please sign in to comment.