Skip to content

Commit 487d4f7

Browse files
committed
refactor(artifact): flatten protobuf packages and decouple files from knowledge bases (#321)
**Because**

- The protobuf package structure was nested (`artifact/artifact/v1alpha`) and needed flattening to `artifact/v1alpha` for consistency across the codebase
- Files were tightly coupled to a single knowledge base, limiting reusability
- Resource IDs needed to follow the AIP standard with unique constraints and proper scoping
- The System resource used RFC-1034 IDs instead of hash-based IDs like other resources

**This commit**

- Flattens protobuf imports from `protogen-go/artifact/artifact/v1alpha` to `protogen-go/artifact/v1alpha`
- Decouples files from knowledge bases: files are now namespace-scoped and can be associated with multiple KBs via a junction table (`file_knowledge_base`)
- Adds 7 database migrations:
  - `000048`: Add slug columns to knowledge_base and file tables
  - `000049`: Add unique constraints for resource IDs (AIP standard)
  - `000050`: Unify System schema to use hash-based IDs with `sys-` prefix
  - `000051`: Unify storage paths
  - `000052`: Rename chunk content destination
  - `000053`: Add ID column to chunk table with `chk-` prefix
  - `000054`: Create junction table and migrate file-KB relationships
- Renames `KnowledgeBaseFile*` to `File*` throughout repository and service layers
- Updates all integration tests to use the flattened proto paths and new file API structure
- Removes deprecated code: `pkg/usage/usage.go`, `pkg/client/http/registry.go`
- Updates handler parsing to support new file resource pattern: `namespaces/{namespace}/files/{file}`
1 parent 1f6fb10 commit 487d4f7

File tree

141 files changed

+48634
-37843
lines changed

Some content is hidden

Large commits have some content hidden by default. Use the search box below to find content that may be hidden.

141 files changed

+48634
-37843
lines changed

.env

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1,5 +1,5 @@
11
# version
2-
GOLANG_VERSION=1.24.6
2+
GOLANG_VERSION=1.25.6
33
K6_VERSION=1.0.0
44
XK6_VERSION=0.19.3
55
XK6_SQL_VERSION=1.0.4

.github/workflows/coverage.yml

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -31,7 +31,7 @@ jobs:
3131
go test -race ./... -coverprofile=coverage.txt -covermode=atomic
3232
3333
- name: Upload coverage report
34-
uses: codecov/codecov-action@v2
34+
uses: codecov/codecov-action@v5
3535
with:
3636
file: ./coverage.txt
3737
flags: unittests

.github/workflows/golangci-lint.yml

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -24,5 +24,5 @@ jobs:
2424
- name: golangci-lint
2525
uses: golangci/golangci-lint-action@v8
2626
with:
27-
version: v2.1.6
27+
version: v2.8.0
2828
args: --timeout=10m

Dockerfile

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1,4 +1,4 @@
1-
ARG GOLANG_VERSION=1.24.6
1+
ARG GOLANG_VERSION=1.25.6
22
FROM golang:${GOLANG_VERSION} AS build
33

44
WORKDIR /build

Dockerfile.dev

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1,4 +1,4 @@
1-
ARG GOLANG_VERSION=1.24.6
1+
ARG GOLANG_VERSION=1.25.6
22
FROM golang:${GOLANG_VERSION}-bullseye
33

44
ARG SERVICE_NAME SERVICE_VERSION

Makefile

Lines changed: 10 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -6,7 +6,9 @@
66
include .env
77
export
88

9-
# Integration test defaults (for running from host)
9+
# Integration test configuration
10+
# - From host: make integration-test (uses localhost:8080)
11+
# - In container: make integration-test API_GATEWAY_URL=api-gateway:8080 DB_HOST=pg_sql
1012
API_GATEWAY_PROTOCOL ?= http
1113
API_GATEWAY_URL ?= localhost:8080
1214
DB_HOST ?= localhost
@@ -94,11 +96,7 @@ unit-test: ## Run unit test
9496

9597
.PHONY: integration-test
9698
integration-test: ## Run integration tests (parallel by default, sequential if CI=true)
97-
@if [ -n "${API_GATEWAY_URL}" ]; then \
98-
echo "✓ Running tests through API Gateway: ${API_GATEWAY_URL}"; \
99-
else \
100-
echo "⚠ WARNING: No API_GATEWAY_URL set - using default localhost:8080"; \
101-
fi
99+
@echo "✓ Running tests via API Gateway: ${API_GATEWAY_URL}"
102100
@echo " DB_HOST: ${DB_HOST}"
103101
ifeq ($(CI),true)
104102
@echo "Running integration tests sequentially (CI mode)..."
@@ -108,18 +106,21 @@ ifeq ($(CI),true)
108106
integration-test/rest.js \
109107
integration-test/rest-object-storage.js \
110108
integration-test/rest-hash-based-ids.js \
109+
integration-test/rest-invariants.js \
111110
integration-test/rest-file-type.js \
112111
integration-test/rest-db.js \
113112
integration-test/rest-ai-client.js \
114113
integration-test/rest-kb-e2e-file-process.js \
115114
integration-test/rest-file-reprocess.js \
116115
integration-test/rest-kb-delete.js \
116+
integration-test/rest-namespace-permission.js \
117117
integration-test/grpc.js \
118118
integration-test/grpc-kb-update.js \
119119
integration-test/grpc-system-config-update.js \
120120
integration-test/grpc-system-admin.js; do \
121121
echo "Running $$test..." | tee -a /tmp/artifact-integration-test.log; \
122122
TEST_FOLDER_ABS_PATH=$(PWD) k6 run --address="" \
123+
-e CI=true \
123124
-e API_GATEWAY_PROTOCOL=$(API_GATEWAY_PROTOCOL) \
124125
-e API_GATEWAY_URL=$(API_GATEWAY_URL) \
125126
-e DB_HOST=$(DB_HOST) \
@@ -138,13 +139,15 @@ else
138139
{} --no-usage-report" ::: \
139140
integration-test/rest.js \
140141
integration-test/rest-object-storage.js \
141-
integration-test/rest-hash-based-ids.js 2>&1 | tee -a /tmp/artifact-integration-test.log
142+
integration-test/rest-hash-based-ids.js \
143+
integration-test/rest-invariants.js 2>&1 | tee -a /tmp/artifact-integration-test.log
142144
# Batch 2: File processing tests (heavy AI workload)
143145
@parallel --halt now,fail=1 --tag --line-buffer \
144146
"TEST_FOLDER_ABS_PATH=${PWD} k6 run --address=\"\" \
145147
-e API_GATEWAY_PROTOCOL=${API_GATEWAY_PROTOCOL} -e API_GATEWAY_URL=${API_GATEWAY_URL} \
146148
-e DB_HOST=${DB_HOST} \
147149
{} --no-usage-report" ::: \
150+
integration-test/rest-namespace-permission.js \
148151
integration-test/rest-file-type.js \
149152
integration-test/rest-db.js \
150153
integration-test/rest-ai-client.js 2>&1 | tee -a /tmp/artifact-integration-test.log

cmd/init/main.go

Lines changed: 41 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -2,6 +2,7 @@ package main
22

33
import (
44
"context"
5+
"fmt"
56
"log"
67
"time"
78

@@ -11,14 +12,39 @@ import (
1112
grpczap "github.com/grpc-ecosystem/go-grpc-middleware/logging/zap"
1213

1314
"github.com/instill-ai/artifact-backend/config"
15+
"github.com/instill-ai/artifact-backend/pkg/ai"
16+
"github.com/instill-ai/artifact-backend/pkg/ai/gemini"
17+
"github.com/instill-ai/artifact-backend/pkg/ai/openai"
1418
"github.com/instill-ai/artifact-backend/pkg/pipeline"
19+
"github.com/instill-ai/artifact-backend/pkg/repository"
1520

16-
pipelinepb "github.com/instill-ai/protogen-go/pipeline/pipeline/v1beta"
21+
database "github.com/instill-ai/artifact-backend/pkg/db"
22+
pipelinepb "github.com/instill-ai/protogen-go/pipeline/v1beta"
1723
clientx "github.com/instill-ai/x/client"
1824
clientgrpcx "github.com/instill-ai/x/client/grpc"
1925
logx "github.com/instill-ai/x/log"
2026
)
2127

28+
// DefaultSystemPresets defines the default system configurations to seed
29+
var DefaultSystemPresets = []repository.PresetSystem{
30+
{
31+
DisplayName: "OpenAI",
32+
Slug: "openai",
33+
ModelFamily: ai.ModelFamilyOpenAI,
34+
Dimensionality: openai.DefaultEmbeddingDimension,
35+
Description: "OpenAI embedding configuration (text-embedding-3-small, 1536 dimensions)",
36+
IsDefault: false,
37+
},
38+
{
39+
DisplayName: "Gemini",
40+
Slug: "gemini",
41+
ModelFamily: ai.ModelFamilyGemini,
42+
Dimensionality: gemini.DefaultEmbeddingDimension,
43+
Description: "Gemini embedding configuration (text-embedding-004, 3072 dimensions)",
44+
IsDefault: true, // Gemini is the default
45+
},
46+
}
47+
2248
func main() {
2349
ctx := context.Background()
2450

@@ -40,6 +66,19 @@ func main() {
4066
grpczap.ReplaceGrpcLoggerV2WithVerbosity(logger, 3) // verbosity 3 will avoid [transport] from emitting
4167
}
4268

69+
// Initialize database connection for system seeding
70+
db := database.GetSharedConnection()
71+
defer database.Close(db)
72+
73+
// Seed default systems
74+
logger.Info("Seeding default system configurations...")
75+
repo := repository.NewDBOnlyRepository(db)
76+
if err := repo.SeedDefaultSystems(ctx, DefaultSystemPresets); err != nil {
77+
logger.Fatal(fmt.Sprintf("failed to seed default systems: %v", err))
78+
}
79+
logger.Info("Default system configurations seeded successfully")
80+
81+
// Initialize pipeline client for preset pipelines
4382
pipelinePublicServiceClient, pipelinePublicClose, err := clientgrpcx.NewClient[pipelinepb.PipelinePublicServiceClient](
4483
clientgrpcx.WithServiceConfig(clientx.ServiceConfig{
4584
Host: config.Config.PipelineBackend.Host,
@@ -66,7 +105,7 @@ func main() {
66105
}
67106

68107
for _, pr := range pipeline.PresetPipelinesList {
69-
logger := logger.With(zap.String("id", pr.ID), zap.String("version", pr.Version))
108+
logger := logger.With(zap.String("slug", pr.Slug()), zap.String("displayName", pr.DisplayName), zap.String("version", pr.Version))
70109
if err := upserter.Upsert(ctx, pr); err != nil {
71110
logger.Error("Failed to add pipeline", zap.Error(err))
72111
continue

cmd/main/main.go

Lines changed: 3 additions & 42 deletions
Original file line numberDiff line numberDiff line change
@@ -39,15 +39,12 @@ import (
3939
"github.com/instill-ai/artifact-backend/pkg/repository"
4040
"github.com/instill-ai/artifact-backend/pkg/repository/object"
4141
"github.com/instill-ai/artifact-backend/pkg/service"
42-
"github.com/instill-ai/artifact-backend/pkg/usage"
43-
"github.com/instill-ai/artifact-backend/pkg/utils"
4442
"github.com/instill-ai/artifact-backend/pkg/worker"
4543

4644
database "github.com/instill-ai/artifact-backend/pkg/db"
47-
artifactpb "github.com/instill-ai/protogen-go/artifact/artifact/v1alpha"
48-
mgmtpb "github.com/instill-ai/protogen-go/core/mgmt/v1beta"
49-
usagepb "github.com/instill-ai/protogen-go/core/usage/v1beta"
50-
pipelinepb "github.com/instill-ai/protogen-go/pipeline/pipeline/v1beta"
45+
artifactpb "github.com/instill-ai/protogen-go/artifact/v1alpha"
46+
mgmtpb "github.com/instill-ai/protogen-go/mgmt/v1beta"
47+
pipelinepb "github.com/instill-ai/protogen-go/pipeline/v1beta"
5148
clientx "github.com/instill-ai/x/client"
5249
clientgrpcx "github.com/instill-ai/x/client/grpc"
5350
logx "github.com/instill-ai/x/log"
@@ -253,39 +250,6 @@ func main() {
253250
}),
254251
)
255252

256-
// Start usage reporter
257-
var usg usage.Usage
258-
if config.Config.Server.Usage.Enabled {
259-
usageServiceClient, usageServiceClientClose, err := clientgrpcx.NewClient[usagepb.UsageServiceClient](
260-
clientgrpcx.WithServiceConfig(clientx.ServiceConfig{
261-
Host: config.Config.Server.Usage.Host,
262-
PublicPort: config.Config.Server.Usage.Port,
263-
}),
264-
clientgrpcx.WithSetOTELClientHandler(config.Config.OTELCollector.Enable),
265-
)
266-
if err != nil {
267-
logger.Error("failed to create usage service client", zap.Error(err))
268-
}
269-
defer func() {
270-
if err := usageServiceClientClose(); err != nil {
271-
logger.Error("failed to close usage service client", zap.Error(err))
272-
}
273-
}()
274-
logger.Info("try to start usage reporter")
275-
go utils.GoRecover(func() {
276-
for {
277-
usg = usage.NewUsage(ctx, mgmtPrivateServiceClient, redisClient, usageServiceClient, serviceVersion)
278-
if usg != nil {
279-
usg.StartReporter(ctx)
280-
logger.Info("usage reporter started")
281-
break
282-
}
283-
logger.Warn("retry to start usage reporter after 5 minutes")
284-
time.Sleep(5 * time.Minute)
285-
}
286-
}, "Usage Reporter")
287-
}
288-
289253
dialOpts, err := clientgrpcx.NewClientOptionsAndCreds(
290254
clientgrpcx.WithServiceConfig(clientx.ServiceConfig{
291255
HTTPS: clientx.HTTPSConfig{
@@ -363,9 +327,6 @@ func main() {
363327
logger.Error(fmt.Sprintf("Fatal error: %v\n", err))
364328
os.Exit(1)
365329
case <-quitSig:
366-
if config.Config.Server.Usage.Enabled && usg != nil {
367-
usg.TriggerSingleReporter(ctx)
368-
}
369330
logger.Info("Shutting down server...")
370331
publicGrpcS.GracefulStop()
371332
logger.Info("server shutdown due to signal")

cmd/worker/main.go

Lines changed: 14 additions & 13 deletions
Original file line numberDiff line numberDiff line change
@@ -37,8 +37,8 @@ import (
3737

3838
database "github.com/instill-ai/artifact-backend/pkg/db"
3939
artifactworker "github.com/instill-ai/artifact-backend/pkg/worker"
40-
mgmtpb "github.com/instill-ai/protogen-go/core/mgmt/v1beta"
41-
pipelinepb "github.com/instill-ai/protogen-go/pipeline/pipeline/v1beta"
40+
mgmtpb "github.com/instill-ai/protogen-go/mgmt/v1beta"
41+
pipelinepb "github.com/instill-ai/protogen-go/pipeline/v1beta"
4242
clientgrpcx "github.com/instill-ai/x/client/grpc"
4343
logx "github.com/instill-ai/x/log"
4444
miniox "github.com/instill-ai/x/minio"
@@ -207,16 +207,17 @@ func main() {
207207

208208
// ===== CleanupKnowledgeBaseWorkflow Activities =====
209209
// Activities for cleaning up entire knowledge base resources
210-
w.RegisterActivity(cw.GetInProgressFileCountActivity) // Check for in-progress files before cleanup
211-
w.RegisterActivity(cw.DeleteKBFilesFromMinIOActivity) // Delete all KB files from MinIO
212-
w.RegisterActivity(cw.DropVectorDBCollectionActivity) // Drop Milvus collection for KB
213-
w.RegisterActivity(cw.DeleteKBFileRecordsActivity) // Delete all file records from database
214-
w.RegisterActivity(cw.DeleteKBConvertedFileRecordsActivity) // Delete all converted file records
215-
w.RegisterActivity(cw.DeleteKBTextChunkRecordsActivity) // Delete all chunk records
216-
w.RegisterActivity(cw.DeleteKBEmbeddingRecordsActivity) // Delete all embedding records
217-
w.RegisterActivity(cw.SoftDeleteKBRecordActivity) // Soft-delete the KB record itself
218-
w.RegisterActivity(cw.PurgeKBACLActivity) // Remove all ACL permissions for KB
219-
w.RegisterActivity(cw.ClearProductionKBRetentionActivity) // Clear retention field on production KB after rollback cleanup
210+
w.RegisterActivity(cw.GetInProgressFileCountActivity) // Check for in-progress files before cleanup
211+
w.RegisterActivity(cw.DeleteKBFilesFromMinIOActivity) // Delete all KB files from MinIO
212+
w.RegisterActivity(cw.DropVectorDBCollectionActivity) // Drop Milvus collection for KB
213+
w.RegisterActivity(cw.DeleteKBFileRecordsActivity) // Delete all file records from database
214+
w.RegisterActivity(cw.DeleteKBConvertedFileRecordsActivity) // Delete all converted file records
215+
w.RegisterActivity(cw.DeleteKBTextChunkRecordsActivity) // Delete all chunk records
216+
w.RegisterActivity(cw.DeleteKBEmbeddingRecordsActivity) // Delete all embedding records
217+
w.RegisterActivity(cw.SoftDeleteKBRecordActivity) // Soft-delete the KB record itself
218+
w.RegisterActivity(cw.PurgeKBACLActivity) // Remove all ACL permissions for KB
219+
w.RegisterActivity(cw.ClearProductionKBRetentionActivity) // Clear retention field on production KB after rollback cleanup
220+
w.RegisterActivity(cw.CheckRollbackRetentionExpiredActivity) // Check if rollback KB retention period has expired
220221

221222
// ===== RAG update activities =====
222223
// Activities for knowledge base update workflow (6-phase)
@@ -264,7 +265,7 @@ func main() {
264265
// Chunking Phase - Combined content and summary chunking (sequential after parallel AI operations)
265266
w.RegisterActivity(cw.DeleteOldTextChunksActivity) // Delete old text chunk records before creating new ones
266267
w.RegisterActivity(cw.ChunkContentActivity) // Split markdown content into semantic chunks
267-
w.RegisterActivity(cw.SaveTextChunksActivity) // Persist chunks to database and MinIO storage
268+
w.RegisterActivity(cw.SaveChunksActivity) // Persist chunks to database and MinIO storage
268269

269270
// Embedding Phase - Vector embedding generation and storage
270271
w.RegisterActivity(cw.EmbedAndSaveChunksActivity) // Combined: query chunks, generate embeddings, save to DB/Milvus

0 commit comments

Comments
 (0)