Skip to content

Commit e854a02

Browse files
authored
feat: update knowledge base processing to use KnowledgeBase object and enhance configuration (#46)
* feat: update knowledge base processing to use KnowledgeBase object and enhance configuration
1 parent d0bac68 commit e854a02

File tree

10 files changed

+50
-83
lines changed

10 files changed

+50
-83
lines changed

backend/pom.xml

Lines changed: 7 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -75,6 +75,13 @@
7575
<type>pom</type>
7676
<scope>import</scope>
7777
</dependency>
78+
<dependency>
79+
<groupId>com.google.protobuf</groupId>
80+
<artifactId>protobuf-bom</artifactId>
81+
<version>3.25.5</version>
82+
<type>pom</type>
83+
<scope>import</scope>
84+
</dependency>
7885
<dependency>
7986
<groupId>org.springframework.cloud</groupId>
8087
<artifactId>spring-cloud-dependencies</artifactId>
@@ -165,12 +172,6 @@
165172
</exclusions>
166173
</dependency>
167174

168-
<dependency>
169-
<groupId>org.springframework.ai</groupId>
170-
<artifactId>spring-ai-starter-mcp-server-webmvc</artifactId>
171-
<version>${spring-ai.version}</version>
172-
</dependency>
173-
174175
<dependency>
175176
<groupId>com.baomidou</groupId>
176177
<artifactId>mybatis-plus-spring-boot3-starter</artifactId>

backend/services/main-application/src/main/resources/application.yml

Lines changed: 2 additions & 10 deletions
Original file line numberDiff line numberDiff line change
@@ -167,13 +167,5 @@ datamate:
167167

168168
# RAG配置
169169
rag:
170-
embedding:
171-
model: ${RAG_EMBEDDING_MODEL:text-embedding-ada-002}
172-
api-key: ${RAG_API_KEY:}
173-
dimension: ${RAG_DIMENSION:1536}
174-
chunk:
175-
size: ${RAG_CHUNK_SIZE:512}
176-
overlap: ${RAG_CHUNK_OVERLAP:50}
177-
retrieval:
178-
top-k: ${RAG_TOP_K:5}
179-
score-threshold: ${RAG_SCORE_THRESHOLD:0.7}
170+
milvus-host: ${MILVUS_HOST:milvus-standalone}
171+
milvus-port: ${MILVUS_PORT:19530}

backend/services/rag-indexer-service/pom.xml

Lines changed: 6 additions & 9 deletions
Original file line numberDiff line numberDiff line change
@@ -98,15 +98,12 @@
9898
<dependency>
9999
<groupId>dev.langchain4j</groupId>
100100
<artifactId>langchain4j-milvus</artifactId>
101-
</dependency>
102-
103-
<dependency>
104-
<groupId>dev.langchain4j</groupId>
105-
<artifactId>langchain4j-embeddings-all-minilm-l6-v2</artifactId>
106-
</dependency>
107-
<dependency>
108-
<groupId>org.testcontainers</groupId>
109-
<artifactId>milvus</artifactId>
101+
<exclusions>
102+
<exclusion>
103+
<groupId>com.google.protobuf</groupId>
104+
<artifactId>protobuf-java</artifactId>
105+
</exclusion>
106+
</exclusions>
110107
</dependency>
111108
</dependencies>
112109

backend/services/rag-indexer-service/src/main/java/com/dataengine/rag/indexer/RagApplication.java

Lines changed: 0 additions & 17 deletions
This file was deleted.

backend/services/rag-indexer-service/src/main/java/com/datamate/rag/indexer/application/KnowledgeBaseService.java

Lines changed: 3 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -93,13 +93,13 @@ public void addFiles(AddFilesReq request) {
9393
List<RagFile> ragFiles = request.getFiles().stream().map(fileInfo -> {
9494
RagFile ragFile = new RagFile();
9595
ragFile.setKnowledgeBaseId(knowledgeBase.getId());
96-
ragFile.setFileId(fileInfo.fileId());
97-
ragFile.setFileName(fileInfo.fileName());
96+
ragFile.setFileId(fileInfo.id());
97+
ragFile.setFileName(fileInfo.name());
9898
ragFile.setStatus(FileStatus.UNPROCESSED);
9999
return ragFile;
100100
}).toList();
101101
ragFileRepository.saveBatch(ragFiles, 100);
102-
eventPublisher.publishEvent(new DataInsertedEvent(knowledgeBase.getId(), request.getProcessType()));
102+
eventPublisher.publishEvent(new DataInsertedEvent(knowledgeBase, request.getProcessType()));
103103
}
104104

105105
public PagedResponse<RagFile> listFiles(String knowledgeBaseId, RagFileReq request) {
Original file line numberDiff line numberDiff line change
@@ -1,5 +1,6 @@
11
package com.datamate.rag.indexer.infrastructure.event;
22

3+
import com.datamate.rag.indexer.domain.model.KnowledgeBase;
34
import com.datamate.rag.indexer.interfaces.dto.ProcessType;
45

56
/**
@@ -8,5 +9,5 @@
89
* @author dallas
910
* @since 2025-10-29
1011
*/
11-
public record DataInsertedEvent(String knowledgeBaseId, ProcessType processType) {
12+
public record DataInsertedEvent(KnowledgeBase knowledgeBase, ProcessType processType) {
1213
}

backend/services/rag-indexer-service/src/main/java/com/datamate/rag/indexer/infrastructure/event/RagEtlService.java

Lines changed: 25 additions & 19 deletions
Original file line numberDiff line numberDiff line change
@@ -23,12 +23,11 @@
2323
import dev.langchain4j.data.embedding.Embedding;
2424
import dev.langchain4j.data.segment.TextSegment;
2525
import dev.langchain4j.model.embedding.EmbeddingModel;
26-
import dev.langchain4j.model.output.Response;
2726
import dev.langchain4j.store.embedding.EmbeddingStore;
2827
import dev.langchain4j.store.embedding.milvus.MilvusEmbeddingStore;
2928
import lombok.RequiredArgsConstructor;
3029
import lombok.extern.slf4j.Slf4j;
31-
import org.jetbrains.annotations.NotNull;
30+
import org.springframework.beans.factory.annotation.Value;
3231
import org.springframework.scheduling.annotation.Async;
3332
import org.springframework.stereotype.Service;
3433
import org.springframework.transaction.event.TransactionPhase;
@@ -52,6 +51,11 @@
5251
public class RagEtlService {
5352
private static final Semaphore SEMAPHORE = new Semaphore(10);
5453

54+
@Value("${datamate.rag.milvus-host}")
55+
private String milvusHost;
56+
@Value("${datamate.rag.milvus-port}")
57+
private int milvusPort;
58+
5559
private final RagFileRepository ragFileRepository;
5660

5761
private final DatasetFileRepository datasetFileRepository;
@@ -64,7 +68,7 @@ public class RagEtlService {
6468
@TransactionalEventListener(phase = TransactionPhase.AFTER_COMMIT)
6569
public void processAfterCommit(DataInsertedEvent event) {
6670
// 执行 RAG 处理流水线
67-
List<RagFile> ragFiles = ragFileRepository.findByKnowledgeBaseId(event.knowledgeBaseId());
71+
List<RagFile> ragFiles = ragFileRepository.findByKnowledgeBaseId(event.knowledgeBase().getId());
6872

6973
ragFiles.forEach(ragFile -> {
7074
try {
@@ -74,12 +78,13 @@ public void processAfterCommit(DataInsertedEvent event) {
7478
// 执行 RAG 处理流水线
7579
ragFile.setStatus(FileStatus.PROCESSING);
7680
ragFileRepository.updateById(ragFile);
77-
processRagFile(ragFile, event.processType());
81+
processRagFile(ragFile, event);
7882
// 更新文件状态为已处理
7983
ragFile.setStatus(FileStatus.PROCESSED);
8084
ragFileRepository.updateById(ragFile);
8185
} catch (Exception e) {
8286
// 处理异常
87+
log.error("Error processing RAG file: {}", ragFile.getFileId(), e);
8388
ragFile.setStatus(FileStatus.PROCESS_FAILED);
8489
ragFileRepository.updateById(ragFile);
8590
} finally {
@@ -93,31 +98,31 @@ public void processAfterCommit(DataInsertedEvent event) {
9398
);
9499
}
95100

96-
private void processRagFile(RagFile ragFile, ProcessType processType) {
101+
private void processRagFile(RagFile ragFile, DataInsertedEvent event) {
97102
DatasetFile file = datasetFileRepository.getById(ragFile.getFileId());
98103
// 使用文档解析器解析文档
99104
DocumentParser parser = documentParser(file.getFileType());
100105
// 从文件系统读取文档
101106
Document document = FileSystemDocumentLoader.loadDocument(file.getFilePath(), parser);
102107
// 对html文档进行转换
103108
if (Arrays.asList("html", "htm").contains(file.getFileType().toLowerCase())) {
104-
document= new HtmlToTextDocumentTransformer().transform(document);
109+
document = new HtmlToTextDocumentTransformer().transform(document);
105110
}
106111
// 使用文档分块器对文档进行分块
107-
DocumentSplitter splitter = documentSplitter(processType);
112+
DocumentSplitter splitter = documentSplitter(event.processType());
108113
List<TextSegment> split = splitter.split(document);
109114

110115
// 更新分块数量
111116
ragFile.setChunkCount(split.size());
112117
ragFileRepository.updateById(ragFile);
113118

114119
// 调用模型客户端获取嵌入模型
115-
ModelConfig model = modelConfigRepository.getById("1");
120+
ModelConfig model = modelConfigRepository.getById(event.knowledgeBase().getEmbeddingModel());
116121
EmbeddingModel embeddingModel = ModelClient.invokeEmbeddingModel(model);
117122
// 调用嵌入模型获取嵌入向量
118-
Response<@NotNull List<Embedding>> response = embeddingModel.embedAll(split);
123+
List<Embedding> content = embeddingModel.embedAll(split).content();
119124
// 存储嵌入向量到 Milvus
120-
embeddingStore().addAll(response.content(), split);
125+
embeddingStore(embeddingModel, ragFile.getKnowledgeBaseId()).addAll(content, split);
121126
}
122127

123128
/**
@@ -139,19 +144,20 @@ public DocumentParser documentParser(String fileType) {
139144

140145
public DocumentSplitter documentSplitter(ProcessType processType) {
141146
return switch (processType) {
142-
case CHAPTER_CHUNK -> new DocumentByParagraphSplitter(1000, 100);
143-
case PARAGRAPH_CHUNK -> new DocumentByLineSplitter(1000, 100);
144-
case LENGTH_CHUNK -> new DocumentBySentenceSplitter(1000, 100);
145-
case CUSTOM_SEPARATOR_CHUNK -> new DocumentByWordSplitter(1000, 100);
146-
case DEFAULT_CHUNK -> new DocumentByRegexSplitter("\\n\\n", "",1000, 100);
147+
case PARAGRAPH_CHUNK -> new DocumentByParagraphSplitter(1000, 100);
148+
case CHAPTER_CHUNK -> new DocumentByLineSplitter(1000, 100);
149+
case CUSTOM_SEPARATOR_CHUNK -> new DocumentBySentenceSplitter(1000, 100);
150+
case LENGTH_CHUNK -> new DocumentByWordSplitter(1000, 100);
151+
case DEFAULT_CHUNK -> new DocumentByLineSplitter(1000, 100);
147152
};
148153
}
149154

150-
public EmbeddingStore<TextSegment> embeddingStore() {
155+
public EmbeddingStore<TextSegment> embeddingStore(EmbeddingModel embeddingModel, String knowledgeBaseId) {
151156
return MilvusEmbeddingStore.builder()
152-
.uri("http://milvus:19530")
153-
.collectionName("rag_embeddings")
154-
.dimension(1536)
157+
.host(milvusHost)
158+
.port(milvusPort)
159+
.collectionName("datamate_" + knowledgeBaseId)
160+
.dimension(embeddingModel.dimension())
155161
.build();
156162
}
157163
}

backend/services/rag-indexer-service/src/main/java/com/datamate/rag/indexer/interfaces/EmbeddingController.java

Lines changed: 0 additions & 8 deletions
This file was deleted.

backend/services/rag-indexer-service/src/main/java/com/datamate/rag/indexer/interfaces/KnowledgeBaseController.java

Lines changed: 4 additions & 9 deletions
Original file line numberDiff line numberDiff line change
@@ -4,14 +4,14 @@
44
import com.datamate.rag.indexer.domain.model.KnowledgeBase;
55
import com.datamate.rag.indexer.domain.model.RagChunk;
66
import com.datamate.rag.indexer.domain.model.RagFile;
7-
import com.datamate.common.infrastructure.common.Response;
87
import com.datamate.common.interfaces.PagedResponse;
98
import com.datamate.common.interfaces.PagingQuery;
109
import com.datamate.rag.indexer.interfaces.dto.*;
10+
import jakarta.validation.Valid;
1111
import lombok.RequiredArgsConstructor;
1212
import org.springframework.web.bind.annotation.*;
1313

14-
import javax.validation.Valid;
14+
1515

1616
/**
1717
* 知识库控制器
@@ -21,15 +21,10 @@
2121
*/
2222
@RestController
2323
@RequiredArgsConstructor
24-
@RequestMapping("/v1/knowledge-base")
24+
@RequestMapping("/knowledge-base")
2525
public class KnowledgeBaseController {
2626
private final KnowledgeBaseService knowledgeBaseService;
2727

28-
@GetMapping(path = "/test1")
29-
public String test() {
30-
return "test1";
31-
}
32-
3328
/**
3429
* 创建知识库
3530
*
@@ -105,7 +100,7 @@ public void addFiles(@PathVariable("knowledgeBaseId") String knowledgeBaseId,
105100
*/
106101
@GetMapping("/{knowledgeBaseId}/files")
107102
public PagedResponse<RagFile> listFiles(@PathVariable("knowledgeBaseId") String knowledgeBaseId,
108-
@RequestBody @Valid RagFileReq request) {
103+
RagFileReq request) {
109104
return knowledgeBaseService.listFiles(knowledgeBaseId, request);
110105
}
111106

backend/services/rag-indexer-service/src/main/java/com/datamate/rag/indexer/interfaces/dto/AddFilesReq.java

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -18,6 +18,6 @@ public class AddFilesReq {
1818
private ProcessType processType;
1919
private List<FileInfo> files;
2020

21-
public record FileInfo(String fileId, String fileName) {
21+
public record FileInfo(String id, String name) {
2222
}
2323
}

0 commit comments

Comments
 (0)