
Commit 9f575e6

feat[batch]: Apply Spring Batch
1 parent 1b0e0b2 commit 9f575e6

File tree

2 files changed: 233 additions, 0 deletions

BatchScheduler.java
DataVectorizationJobConfig.java
BatchScheduler.java: 25 additions, 0 deletions

@@ -0,0 +1,25 @@
package com.ai.lawyer.global.batch;

/*
// Imports required when this class is re-enabled:
import java.time.LocalDateTime;

import lombok.RequiredArgsConstructor;
import lombok.extern.slf4j.Slf4j;
import org.springframework.batch.core.Job;
import org.springframework.batch.core.JobParameters;
import org.springframework.batch.core.JobParametersBuilder;
import org.springframework.batch.core.launch.JobLauncher;
import org.springframework.scheduling.annotation.EnableScheduling;
import org.springframework.scheduling.annotation.Scheduled;
import org.springframework.stereotype.Component;

@Slf4j
@Component
@EnableScheduling
@RequiredArgsConstructor
public class BatchScheduler {

    private final JobLauncher jobLauncher;
    private final Job dataVectorizationJob;

    // Six-field Spring cron (second minute hour day month weekday):
    // "0 * * * * *" fires once a minute, "0 0 2 * * *" fires daily at 02:00.
    @Scheduled(cron = "#{${batch.scheduler.run-every-minute} ? '0 * * * * *' : '0 0 2 * * *'}")
    public void runVectorizationJob() {
        log.info("Running vectorization scheduler for all data (precedents, statutes)...");
        try {
            // Unique parameter so each scheduled run creates a new JobInstance
            JobParameters jobParameters = new JobParametersBuilder()
                    .addString("requestDate", LocalDateTime.now().toString())
                    .toJobParameters();

            jobLauncher.run(dataVectorizationJob, jobParameters); // launch the job
        } catch (Exception e) {
            log.error("Error while running the full data vectorization batch job", e);
        }
    }
}*/
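
Both classes pull their knobs from application configuration: the scheduler's SpEL cron toggle (batch.scheduler.run-every-minute) and the per-run page sizes used below (batch.page.size.precedent, batch.page.size.law). A minimal sketch of the expected properties; only the keys appear in this commit, the values and YAML layout are assumptions:

batch:
  scheduler:
    run-every-minute: false   # true: fire every minute (e.g. for testing); false: daily at 02:00
  page:
    size:
      precedent: 100          # assumed value: precedents vectorized per run
      law: 50                 # assumed value: statutes vectorized per run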
DataVectorizationJobConfig.java: 208 additions, 0 deletions

@@ -0,0 +1,208 @@
package com.ai.lawyer.global.batch;

/*
// Imports required when this class is re-enabled (plus the project-local
// entities Precedent, Law, Jang, Jo, Hang, Ho and their repositories):
import jakarta.persistence.EntityManagerFactory;

import java.util.ArrayList;
import java.util.HashMap;
import java.util.List;
import java.util.Map;
import java.util.stream.Collectors;

import lombok.RequiredArgsConstructor;
import lombok.extern.slf4j.Slf4j;
import org.springframework.ai.document.Document;
import org.springframework.ai.transformer.splitter.TokenTextSplitter;
import org.springframework.ai.vectorstore.VectorStore;
import org.springframework.batch.core.Job;
import org.springframework.batch.core.Step;
import org.springframework.batch.core.job.builder.JobBuilder;
import org.springframework.batch.core.repository.JobRepository;
import org.springframework.batch.core.step.builder.StepBuilder;
import org.springframework.batch.item.ItemProcessor;
import org.springframework.batch.item.ItemWriter;
import org.springframework.batch.item.database.JpaPagingItemReader;
import org.springframework.batch.item.database.builder.JpaPagingItemReaderBuilder;
import org.springframework.beans.factory.annotation.Value;
import org.springframework.context.annotation.Bean;
import org.springframework.context.annotation.Configuration;
import org.springframework.core.task.TaskExecutor;
import org.springframework.scheduling.concurrent.ThreadPoolTaskExecutor;
import org.springframework.transaction.PlatformTransactionManager;

@Slf4j
@Configuration
@RequiredArgsConstructor
public class DataVectorizationJobConfig {

    private final JobRepository jobRepository;
    private final PlatformTransactionManager transactionManager;
    private final EntityManagerFactory entityManagerFactory;
    private final VectorStore vectorStore;

    private final JangRepository jangRepository;
    private final JoRepository joRepository;
    private final HangRepository hangRepository;
    private final HoRepository hoRepository;

    private final TokenTextSplitter tokenSplitter = TokenTextSplitter.builder()
            .withChunkSize(800)
            .withMinChunkSizeChars(0)
            .withMinChunkLengthToEmbed(5)
            .withMaxNumChunks(10000)
            .withKeepSeparator(true)
            .build();

    private static final int CHUNK_SIZE = 10; // items read per batch chunk

    @Value("${batch.page.size.precedent}")
    private int precedentPageSize; // precedents to process per day

    @Value("${batch.page.size.law}")
    private int lawPageSize; // statutes to process per day

    @Bean
    public TaskExecutor taskExecutor() {
        ThreadPoolTaskExecutor executor = new ThreadPoolTaskExecutor();
        executor.setCorePoolSize(10);
        executor.setMaxPoolSize(20);
        executor.setQueueCapacity(100);
        executor.setThreadNamePrefix("async-thread-");
        executor.initialize();
        return executor;
    }

    // -------------- Job definition: vectorize all data --------------
    @Bean
    public Job dataVectorizationJob() {
        return new JobBuilder("dataVectorizationJob", jobRepository)
                .start(precedentVectorizationStep()) // vectorize precedents
                .next(lawVectorizationStep())        // then vectorize statutes
                .build();
    }

    // -------------- Precedent vectorization ---------------
    @Bean
    public Step precedentVectorizationStep() {
        // Note: this logs when the bean is created, not when the step runs.
        log.info(">>>>>> starting precedent vectorization");
        return new StepBuilder("precedentVectorizationStep", jobRepository)
                .<Precedent, List<Document>>chunk(CHUNK_SIZE, transactionManager)
                .reader(precedentItemReader())
                .processor(precedentItemProcessor())
                .writer(documentItemWriter())
                // Caution: JpaPagingItemReader is not thread-safe, so a
                // multi-threaded step may read pages twice or skip them.
                .taskExecutor(taskExecutor())
                .build();
    }

    @Bean
    public JpaPagingItemReader<Precedent> precedentItemReader() {
        return new JpaPagingItemReaderBuilder<Precedent>()
                .name("precedentItemReader")
                .entityManagerFactory(entityManagerFactory)
                .pageSize(CHUNK_SIZE)
                .maxItemCount(precedentPageSize)
                .queryString("SELECT p FROM Precedent p ORDER BY p.id ASC")
                .build();
    }

    @Bean
    public ItemProcessor<Precedent, List<Document>> precedentItemProcessor() {

        return precedent -> {
            String content = precedent.getPrecedentContent();
            if (content == null || content.isBlank()) return null; // null filters the item out

            Document originalDoc = new Document(content, Map.of(
                    "type", "판례", // precedent
                    "caseNumber", precedent.getCaseNumber(),
                    "court", precedent.getCourtName(),
                    "caseName", precedent.getCaseName()
            ));

            List<Document> chunkDocs = tokenSplitter.split(originalDoc);
            List<Document> finalChunks = new ArrayList<>();

            // Add an index to each chunk's metadata so chunks can be told apart
            for (int i = 0; i < chunkDocs.size(); i++) {
                Document chunk = chunkDocs.get(i);
                Map<String, Object> newMetadata = new HashMap<>(chunk.getMetadata());
                newMetadata.put("chunkIndex", i);
                finalChunks.add(new Document(chunk.getText(), newMetadata));
            }
            return finalChunks;
        };
    }

    // -------------- Statute vectorization ---------------
    @Bean
    public Step lawVectorizationStep() {
        // Note: this logs when the bean is created, not when the step runs.
        log.info(">>>>>> starting statute vectorization");
        return new StepBuilder("lawVectorizationStep", jobRepository)
                .<Law, List<Document>>chunk(CHUNK_SIZE, transactionManager) // statutes handled CHUNK_SIZE (10) at a time
                .reader(lawItemReader())
                .processor(lawItemProcessor())
                .writer(documentItemWriter())
                .taskExecutor(taskExecutor())
                .build();
    }

    @Bean
    public JpaPagingItemReader<Law> lawItemReader() {
        return new JpaPagingItemReaderBuilder<Law>()
                .name("lawItemReader")
                .entityManagerFactory(entityManagerFactory)
                .pageSize(CHUNK_SIZE)
                .maxItemCount(lawPageSize)
                .queryString("SELECT l FROM Law l ORDER BY l.id ASC")
                .build();
    }

    @Bean
    public ItemProcessor<Law, List<Document>> lawItemProcessor() {
        return law -> {
            List<Document> finalChunks = new ArrayList<>();

            List<Jang> jangs = jangRepository.findByLaw(law);

            for (Jang jang : jangs) {

                StringBuilder contentBuilder = new StringBuilder();

                contentBuilder.append(law.getLawName()).append("\n");

                if (jang.getContent() != null && !jang.getContent().isBlank()) {
                    contentBuilder.append(jang.getContent()).append("\n");
                }

                // Walk the statute hierarchy: Jang (chapter) > Jo (article)
                // > Hang (paragraph) > Ho (subparagraph)
                List<Jo> jos = joRepository.findByJang(jang);
                for (Jo jo : jos) {

                    if (jo.getContent() != null && !jo.getContent().isBlank()) {
                        contentBuilder.append(jo.getContent()).append("\n");
                    }

                    List<Hang> hangs = hangRepository.findByJo(jo);
                    for (Hang hang : hangs) {
                        if (hang.getContent() != null && !hang.getContent().isBlank()) {
                            contentBuilder.append(hang.getContent()).append("\n");
                        }

                        List<Ho> hos = hoRepository.findByHang(hang);
                        for (Ho ho : hos) {
                            if (ho.getContent() != null && !ho.getContent().isBlank()) {
                                contentBuilder.append(ho.getContent()).append("\n");
                            }
                        }
                    }
                }

                // === One document per Jang (chapter) ===
                String finalContent = contentBuilder.toString();

                if (!finalContent.isBlank()) {
                    Map<String, Object> metadata = new HashMap<>();
                    metadata.put("type", "법령"); // statute
                    metadata.put("lawName", law.getLawName());
                    metadata.put("jangId", jang.getId());

                    Document originalDoc = new Document(finalContent, metadata);

                    List<Document> chunkDocs = tokenSplitter.split(originalDoc);

                    for (int i = 0; i < chunkDocs.size(); i++) {
                        Document chunk = chunkDocs.get(i);
                        Map<String, Object> newMetadata = new HashMap<>(chunk.getMetadata());
                        newMetadata.put("chunkIndex", i);
                        finalChunks.add(new Document(chunk.getText(), newMetadata));
                    }
                }
            }

            return finalChunks.isEmpty() ? null : finalChunks;
        };
    }

    @Bean
    public ItemWriter<List<Document>> documentItemWriter() {
        return chunk -> {
            // Flatten the per-item document lists in this chunk into one batch
            List<Document> totalDocuments = chunk.getItems().stream()
                    .flatMap(List::stream)
                    .collect(Collectors.toList());

            if (!totalDocuments.isEmpty()) {
                vectorStore.add(totalDocuments);
                log.info(">>>>>> Saved {} Document chunks to the vector store.", totalDocuments.size());
            }
        };
    }
}*/
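
The writer tags every chunk with "type" and "chunkIndex" metadata before it reaches the vector store. For context, a minimal retrieval sketch (not part of this commit) of how a consumer could filter on that metadata; PrecedentSearchExample is a hypothetical class, and the SearchRequest builder style assumes a recent Spring AI version:

import java.util.List;

import org.springframework.ai.document.Document;
import org.springframework.ai.vectorstore.SearchRequest;
import org.springframework.ai.vectorstore.VectorStore;

public class PrecedentSearchExample {

    private final VectorStore vectorStore;

    public PrecedentSearchExample(VectorStore vectorStore) {
        this.vectorStore = vectorStore;
    }

    public List<Document> searchPrecedents(String question) {
        // Filter on the "type" metadata written by precedentItemProcessor;
        // "chunkIndex" lets a caller order the chunks of a single case.
        return vectorStore.similaritySearch(SearchRequest.builder()
                .query(question)
                .topK(5)
                .filterExpression("type == '판례'")
                .build());
    }
}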
