Skip to content

Commit 4017f71

Browse files
committed
Merge branch 'master-jdk17' of https://gitee.com/zhijiantianya/yudao-cloud
2 parents 84834c7 + ec8577b commit 4017f71

File tree

29 files changed

+956
-33
lines changed

29 files changed

+956
-33
lines changed

yudao-framework/yudao-spring-boot-starter-biz-tenant/src/main/java/cn/iocoder/yudao/framework/tenant/core/redis/TenantRedisCacheManager.java

Lines changed: 6 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -1,6 +1,7 @@
11
package cn.iocoder.yudao.framework.tenant.core.redis;
22

33
import cn.hutool.core.collection.CollUtil;
4+
import cn.hutool.core.util.StrUtil;
45
import cn.iocoder.yudao.framework.redis.core.TimeoutRedisCacheManager;
56
import cn.iocoder.yudao.framework.tenant.core.context.TenantContextHolder;
67
import lombok.extern.slf4j.Slf4j;
@@ -21,6 +22,8 @@
2122
@Slf4j
2223
public class TenantRedisCacheManager extends TimeoutRedisCacheManager {
2324

25+
private static final String SPLIT = "#";
26+
2427
private final Set<String> ignoreCaches;
2528

2629
public TenantRedisCacheManager(RedisCacheWriter cacheWriter,
@@ -32,15 +35,16 @@ public TenantRedisCacheManager(RedisCacheWriter cacheWriter,
3235

3336
@Override
3437
public Cache getCache(String name) {
38+
String[] names = StrUtil.splitToArray(name, SPLIT);
3539
// 如果开启多租户,则 name 拼接租户后缀
3640
if (!TenantContextHolder.isIgnore()
3741
&& TenantContextHolder.getTenantId() != null
38-
&& !CollUtil.contains(ignoreCaches, name)) {
42+
&& !CollUtil.contains(ignoreCaches, names[0])) {
3943
name = name + ":" + TenantContextHolder.getTenantId();
4044
}
4145

4246
// 继续基于父方法
4347
return super.getCache(name);
4448
}
4549

46-
}
50+
}
Lines changed: 53 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,53 @@
1+
package cn.iocoder.yudao.module.ai.enums;
2+
3+
import lombok.AllArgsConstructor;
4+
import lombok.Getter;
5+
6+
/**
7+
* AI 知识库文档切片策略枚举
8+
*
9+
* @author runzhen
10+
*/
11+
@AllArgsConstructor
12+
@Getter
13+
public enum AiDocumentSplitStrategyEnum {
14+
15+
/**
16+
* 自动识别文档类型并选择最佳切片策略
17+
*/
18+
AUTO("auto", "自动识别"),
19+
20+
/**
21+
* 基于 Token 数量机械切分(默认策略)
22+
*/
23+
TOKEN("token", "Token 切分"),
24+
25+
/**
26+
* 按段落切分(以双换行符为分隔)
27+
*/
28+
PARAGRAPH("paragraph", "段落切分"),
29+
30+
/**
31+
* Markdown QA 格式专用切片器
32+
* 识别二级标题作为问题,保持问答对完整性
33+
* 长答案智能切分但保留问题作为上下文
34+
*/
35+
MARKDOWN_QA("markdown_qa", "Markdown QA 切分"),
36+
37+
/**
38+
* 语义化切分,保留句子完整性
39+
* 在段落和句子边界处切分,避免截断
40+
*/
41+
SEMANTIC("semantic", "语义切分");
42+
43+
/**
44+
* 策略代码
45+
*/
46+
private final String code;
47+
48+
/**
49+
* 策略名称
50+
*/
51+
private final String name;
52+
53+
}

yudao-module-ai/yudao-module-ai-server/pom.xml

Lines changed: 7 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -20,7 +20,8 @@
2020
</description>
2121
<properties>
2222
<spring-ai.version>1.1.0</spring-ai.version>
23-
<alibaba-ai.version>1.1.0.0-M5</alibaba-ai.version>
23+
<!-- https://mvnrepository.com/artifact/com.alibaba.cloud.ai/spring-ai-alibaba -->
24+
<alibaba-ai.version>1.1.0.0-RC1</alibaba-ai.version>
2425
<tinyflow.version>1.2.6</tinyflow.version>
2526
</properties>
2627

@@ -262,6 +263,11 @@
262263
<groupId>com.agentsflex</groupId>
263264
<artifactId>agents-flex-store-elasticsearch</artifactId>
264265
</exclusion>
266+
<exclusion>
267+
<!-- 解决 https://t.zsxq.com/pCBZC 问题 -->
268+
<groupId>com.agentsflex</groupId>
269+
<artifactId>agents-flex-search-engine-es</artifactId>
270+
</exclusion>
265271
<exclusion>
266272
<!-- TODO @芋艿:暂时移除 groovy,和 iot 冲突 -->
267273
<groupId>org.codehaus.groovy</groupId>

yudao-module-ai/yudao-module-ai-server/src/main/java/cn/iocoder/yudao/module/ai/controller/admin/knowledge/vo/segment/AiKnowledgeSegmentPageReqVO.java

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -11,7 +11,7 @@
1111
public class AiKnowledgeSegmentPageReqVO extends PageParam {
1212

1313
@Schema(description = "文档编号", example = "1")
14-
private Integer documentId;
14+
private Long documentId;
1515

1616
@Schema(description = "分段内容关键字", example = "Java 开发")
1717
private String content;

yudao-module-ai/yudao-module-ai-server/src/main/java/cn/iocoder/yudao/module/ai/service/knowledge/AiKnowledgeSegmentServiceImpl.java

Lines changed: 106 additions & 8 deletions
Original file line numberDiff line numberDiff line change
@@ -4,6 +4,7 @@
44
import cn.hutool.core.collection.ListUtil;
55
import cn.hutool.core.util.ObjUtil;
66
import cn.hutool.core.util.StrUtil;
7+
78
import cn.iocoder.yudao.framework.common.enums.CommonStatusEnum;
89
import cn.iocoder.yudao.framework.common.pojo.PageResult;
910
import cn.iocoder.yudao.framework.common.util.object.BeanUtils;
@@ -15,8 +16,11 @@
1516
import cn.iocoder.yudao.module.ai.dal.dataobject.knowledge.AiKnowledgeDocumentDO;
1617
import cn.iocoder.yudao.module.ai.dal.dataobject.knowledge.AiKnowledgeSegmentDO;
1718
import cn.iocoder.yudao.module.ai.dal.mysql.knowledge.AiKnowledgeSegmentMapper;
19+
import cn.iocoder.yudao.module.ai.enums.AiDocumentSplitStrategyEnum;
1820
import cn.iocoder.yudao.module.ai.service.knowledge.bo.AiKnowledgeSegmentSearchReqBO;
1921
import cn.iocoder.yudao.module.ai.service.knowledge.bo.AiKnowledgeSegmentSearchRespBO;
22+
import cn.iocoder.yudao.module.ai.service.knowledge.splitter.MarkdownQaSplitter;
23+
import cn.iocoder.yudao.module.ai.service.knowledge.splitter.SemanticTextSplitter;
2024
import cn.iocoder.yudao.module.ai.service.model.AiModelService;
2125
import com.alibaba.cloud.ai.dashscope.rerank.DashScopeRerankOptions;
2226
import com.alibaba.cloud.ai.model.RerankModel;
@@ -39,8 +43,7 @@
3943

4044
import static cn.iocoder.yudao.framework.common.exception.util.ServiceExceptionUtil.exception;
4145
import static cn.iocoder.yudao.framework.common.util.collection.CollectionUtils.convertList;
42-
import static cn.iocoder.yudao.module.ai.enums.ErrorCodeConstants.KNOWLEDGE_SEGMENT_CONTENT_TOO_LONG;
43-
import static cn.iocoder.yudao.module.ai.enums.ErrorCodeConstants.KNOWLEDGE_SEGMENT_NOT_EXISTS;
46+
import static cn.iocoder.yudao.module.ai.enums.ErrorCodeConstants.*;
4447
import static org.springframework.ai.vectorstore.SearchRequest.SIMILARITY_THRESHOLD_ACCEPT_ALL;
4548

4649
/**
@@ -95,8 +98,9 @@ public void createKnowledgeSegmentBySplitContent(Long documentId, String content
9598
AiKnowledgeDO knowledgeDO = knowledgeService.validateKnowledgeExists(documentDO.getKnowledgeId());
9699
VectorStore vectorStore = getVectorStoreById(knowledgeDO);
97100

98-
// 2. 文档切片
99-
List<Document> documentSegments = splitContentByToken(content, documentDO.getSegmentMaxTokens());
101+
// 2. 文档切片(使用自动检测策略)
102+
List<Document> documentSegments = splitContentByStrategy(content, documentDO.getSegmentMaxTokens(),
103+
AiDocumentSplitStrategyEnum.AUTO, documentDO.getUrl());
100104

101105
// 3.1 存储切片
102106
List<AiKnowledgeSegmentDO> segmentDOs = convertList(documentSegments, segment -> {
@@ -295,8 +299,10 @@ public List<AiKnowledgeSegmentDO> splitContent(String url, Integer segmentMaxTok
295299
// 1. 读取 URL 内容
296300
String content = knowledgeDocumentService.readUrl(url);
297301

298-
// 2. 文档切片
299-
List<Document> documentSegments = splitContentByToken(content, segmentMaxTokens);
302+
// 2.1 自动检测文档类型并选择策略
303+
AiDocumentSplitStrategyEnum strategy = detectDocumentStrategy(content, url);
304+
// 2.2 文档切片
305+
List<Document> documentSegments = splitContentByStrategy(content, segmentMaxTokens, strategy, url);
300306

301307
// 3. 转换为段落对象
302308
return convertList(documentSegments, segment -> {
@@ -333,11 +339,103 @@ private VectorStore getVectorStoreById(Long knowledgeId) {
333339
return getVectorStoreById(knowledge);
334340
}
335341

336-
private static List<Document> splitContentByToken(String content, Integer segmentMaxTokens) {
337-
TextSplitter textSplitter = buildTokenTextSplitter(segmentMaxTokens);
342+
/**
343+
* 根据策略切分内容
344+
*
345+
* @param content 文档内容
346+
* @param segmentMaxTokens 分段的最大 Token 数
347+
* @param strategy 切片策略
348+
* @param url 文档 URL(用于自动检测文件类型)
349+
* @return 切片后的文档列表
350+
*/
351+
@SuppressWarnings("EnhancedSwitchMigration")
352+
private List<Document> splitContentByStrategy(String content, Integer segmentMaxTokens,
353+
AiDocumentSplitStrategyEnum strategy, String url) {
354+
// 自动检测策略
355+
if (strategy == AiDocumentSplitStrategyEnum.AUTO) {
356+
strategy = detectDocumentStrategy(content, url);
357+
log.info("[splitContentByStrategy][自动检测到文档策略: {}]", strategy.getName());
358+
}
359+
// 根据策略切分
360+
TextSplitter textSplitter;
361+
switch (strategy) {
362+
case MARKDOWN_QA:
363+
textSplitter = new MarkdownQaSplitter(segmentMaxTokens);
364+
break;
365+
case SEMANTIC:
366+
textSplitter = new SemanticTextSplitter(segmentMaxTokens);
367+
break;
368+
case PARAGRAPH:
369+
textSplitter = new SemanticTextSplitter(segmentMaxTokens, 0); // 段落切分,无重叠
370+
break;
371+
case TOKEN:
372+
default:
373+
textSplitter = buildTokenTextSplitter(segmentMaxTokens);
374+
break;
375+
}
376+
// 执行切分
338377
return textSplitter.apply(Collections.singletonList(new Document(content)));
339378
}
340379

380+
/**
381+
* 自动检测文档类型并选择切片策略
382+
*
383+
* @param content 文档内容
384+
* @param url 文档 URL
385+
* @return 推荐的切片策略
386+
*/
387+
private AiDocumentSplitStrategyEnum detectDocumentStrategy(String content, String url) {
388+
if (StrUtil.isEmpty(content)) {
389+
return AiDocumentSplitStrategyEnum.TOKEN;
390+
}
391+
// 1. 检测 Markdown QA 格式
392+
if (isMarkdownQaFormat(content, url)) {
393+
return AiDocumentSplitStrategyEnum.MARKDOWN_QA;
394+
}
395+
// 2. 检测普通 Markdown 文档
396+
if (isMarkdownDocument(url)) {
397+
return AiDocumentSplitStrategyEnum.SEMANTIC;
398+
}
399+
// 3. 默认使用语义切分(比 Token 切分更智能)
400+
return AiDocumentSplitStrategyEnum.SEMANTIC;
401+
}
402+
403+
/**
404+
* 检测是否为 Markdown QA 格式
405+
* 特征:包含多个二级标题(## )且标题后紧跟答案内容
406+
*/
407+
private boolean isMarkdownQaFormat(String content, String url) {
408+
// 文件扩展名判断
409+
if (StrUtil.isNotEmpty(url) && !url.toLowerCase().endsWith(".md")) {
410+
return false;
411+
}
412+
413+
// 统计二级标题数量
414+
long h2Count = content.lines()
415+
.filter(line -> line.trim().startsWith("## "))
416+
.count();
417+
418+
// 要求一:至少包含 2 个二级标题才认为是 QA 格式
419+
if (h2Count < 2) {
420+
return false;
421+
}
422+
423+
// 要求二:检查标题占比(QA 文档标题行数相对较多),如果二级标题占比超过 10%,认为是 QA 格式
424+
long totalLines = content.lines().count();
425+
double h2Ratio = (double) h2Count / totalLines;
426+
return h2Ratio > 0.1;
427+
}
428+
429+
/**
430+
* 检测是否为 Markdown 文档
431+
*/
432+
private boolean isMarkdownDocument(String url) {
433+
return StrUtil.endWithAnyIgnoreCase(url, ".md", ".markdown");
434+
}
435+
436+
/**
437+
* 构建基于 Token 的文本切片器(原有逻辑保留)
438+
*/
341439
private static TextSplitter buildTokenTextSplitter(Integer segmentMaxTokens) {
342440
return TokenTextSplitter.builder()
343441
.withChunkSize(segmentMaxTokens)

0 commit comments

Comments
 (0)