Skip to content

Commit 2edf688

Browse files
authored
feat/OPS-289 : dataprocessor 서비스 생성 (#59)
1 parent 02f6310 commit 2edf688

File tree

15 files changed

+170
-90
lines changed

15 files changed

+170
-90
lines changed
Lines changed: 12 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,12 @@
1+
package org.tuna.zoopzoop.backend.domain.datasource.ai.dto;
2+
3+
import java.time.LocalDate;
4+
5+
/**
 * Immutable carrier for the article fields obtained through AI extraction.
 *
 * @param title           article title
 * @param dataCreatedDate date on which the article was written
 * @param content         text that has to be handed to the AI
 * @param imageUrl        thumbnail image URL
 * @param sources         origin of the article
 */
public record AiExtractorDto(
        String title,
        LocalDate dataCreatedDate,
        String content,
        String imageUrl,
        String sources
) {
}
Lines changed: 25 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,25 @@
1+
package org.tuna.zoopzoop.backend.domain.datasource.ai.prompt;
2+
3+
/**
 * Prompt templates sent to the chat model.
 */
public class AiPrompt {
    /**
     * Template asking the model to pull article fields out of raw HTML and answer with JSON only.
     * It contains a single {@code %s} placeholder that receives the HTML.
     *
     * <p>The JSON keys listed here must match the component names of the record the reply is
     * bound to ({@code title}, {@code dataCreatedDate}, {@code content}, {@code imageUrl},
     * {@code sources}). JSON property matching is case-sensitive by default, so a key spelled
     * {@code datacreatedDate} would not populate {@code dataCreatedDate}.
     */
    public static final String EXTRACTION = """
            아래 HTML 전문에서 필요한 정보를 JSON 형식으로 추출해 주세요.
            반환 JSON 구조:
            {
            "title": "제목",
            "dataCreatedDate": "작성일자 (YYYY-MM-DD)",
            "content": "본문 내용",
            "imageUrl": "썸네일 이미지 URL",
            "sources": "출판사 이름 or 서비스 이름 or 도메인 이름"
            }

            HTML 전문:
            %s

            - 반드시 JSON 형식으로만 출력해 주세요.
            - 해당정보가 없으면 반드시 빈 문자열로 출력해 주세요.
            """;

    /**
     * Placeholder text for the summary / tag / category prompt; the actual prompt wording is
     * not written yet.
     */
    public static final String SUMMARY_TAG_CATEGORY = """
            내용 요약, 태그 요약, 카테고리 선정 프롬프트
            """;
}
Lines changed: 7 additions & 64 deletions
Original file line numberDiff line numberDiff line change
@@ -1,79 +1,22 @@
11
package org.tuna.zoopzoop.backend.domain.datasource.ai.service;
22

3-
import com.fasterxml.jackson.core.type.TypeReference;
4-
import com.fasterxml.jackson.databind.ObjectMapper;
53
import lombok.RequiredArgsConstructor;
64
import org.springframework.ai.chat.client.ChatClient;
75
import org.springframework.stereotype.Service;
8-
import org.tuna.zoopzoop.backend.domain.datasource.repository.TagRepository;
9-
10-
import java.util.HashSet;
11-
import java.util.List;
12-
import java.util.Map;
13-
import java.util.Set;
6+
import org.tuna.zoopzoop.backend.domain.datasource.ai.dto.AiExtractorDto;
7+
import org.tuna.zoopzoop.backend.domain.datasource.ai.prompt.AiPrompt;
148

159
@Service
1610
@RequiredArgsConstructor
1711
public class AiService {
1812
private final ChatClient chatClient;
19-
private final TagRepository tagRepository;
20-
21-
public Set<String> duplicateTag() {
22-
Set<String> existingTags = new HashSet<>();
23-
existingTags.addAll(tagRepository.findAllTagNames());
24-
return existingTags;
25-
}
26-
27-
public Map<String, Object> summarizeAndTag(String text) {
28-
Set<String> existingTags = duplicateTag();
29-
String tagsForPrompt = String.join(", ", existingTags);
3013

31-
String prompt = """
32-
본문 요약 프롬프트:
33-
아래 본문을 무조건 50자 이상, 100자 이하로 요약해주세요.
34-
35-
핵심 태그 프롬프트:
36-
이미 존재하는 태그 목록은 다음과 같습니다:
37-
[%s]
38-
39-
본문을 요약하고, 해당 본문과 관련된 태그 3~5개를 생성하세요.
40-
- 태그는 반드시 본문과 관련된 것만 선택하세요.
41-
- 기존 태그 중 본문과 관련 없는 것은 포함하지 마세요.
42-
- 새로운 태그는 본문에 꼭 필요한 경우에만 생성하세요.
43-
- 결과는 JSON 형식으로만 출력하세요.
44-
45-
본문:
46-
%s
47-
48-
예시 출력:
49-
{
50-
"summary": "...",
51-
"tags": ["...", "..."]
52-
}
53-
""".formatted(tagsForPrompt, text);
54-
55-
String response = chatClient.prompt()
56-
.user(prompt)
14+
public AiExtractorDto extract(String rawHtml) {
15+
AiExtractorDto response = chatClient.prompt()
16+
.user(AiPrompt.EXTRACTION.formatted(rawHtml))
5717
.call()
58-
.content();
59-
60-
// JSON 시작/끝만 추출
61-
int start = response.indexOf("{");
62-
int end = response.lastIndexOf("}") + 1;
63-
if (start >= 0 && end > start) {
64-
response = response.substring(start, end);
65-
}
66-
67-
try {
68-
ObjectMapper mapper = new ObjectMapper();
69-
Map<String, Object> map = mapper.readValue(response, new TypeReference<Map<String, Object>>() {});
70-
71-
String summary = (String) map.get("summary");
72-
List<String> tags = (List<String>) map.get("tags");
18+
.entity(AiExtractorDto.class);
7319

74-
return map;
75-
} catch (Exception e) {
76-
throw new RuntimeException("AI 응답 파싱 실패: " + response, e);
77-
}
20+
return response;
7821
}
7922
}

src/main/java/org/tuna/zoopzoop/backend/domain/datasource/crawler/controller/CrawlerTestController.java

Lines changed: 3 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -6,16 +6,18 @@
66
import org.springframework.web.bind.annotation.RequestParam;
77
import org.springframework.web.bind.annotation.RestController;
88
import org.tuna.zoopzoop.backend.domain.datasource.crawler.service.CrawlerManagerService;
9+
import org.tuna.zoopzoop.backend.domain.datasource.dataprocessor.service.DataProcessorService;
910
import org.tuna.zoopzoop.backend.domain.datasource.dto.ArticleData;
1011

1112
@RestController
1213
@RequestMapping("api/v1")
1314
@RequiredArgsConstructor
1415
public class CrawlerTestController {
1516
private final CrawlerManagerService crawlerManagerService;
17+
private final DataProcessorService dataProcessorService;
1618

1719
@GetMapping("/crawl")
1820
public ArticleData crawl(@RequestParam String url) throws Exception {
19-
return crawlerManagerService.extractContent(url);
21+
return dataProcessorService.process(url);
2022
}
2123
}
Lines changed: 10 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,10 @@
1+
package org.tuna.zoopzoop.backend.domain.datasource.crawler.dto;
2+
3+
/**
 * Couples a crawler payload with a marker telling which kind of result it is.
 *
 * @param <T>  type of the carried payload
 * @param type {@link CrawlerType#SPECIFIC} or {@link CrawlerType#UNSPECIFIC}
 * @param data the carried payload
 */
public record CrawlerResult<T>(CrawlerType type, T data) {

    /** Marker distinguishing the two kinds of crawler result. */
    public enum CrawlerType {
        SPECIFIC,
        UNSPECIFIC
    }
}
Lines changed: 12 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,12 @@
1+
package org.tuna.zoopzoop.backend.domain.datasource.crawler.dto;
2+
3+
import java.time.LocalDate;
4+
5+
/**
 * Immutable carrier for the article fields of a crawled site.
 *
 * @param title           article title
 * @param dataCreatedDate date on which the article was written
 * @param content         text that has to be handed to the AI
 * @param imageUrl        thumbnail image URL
 * @param sources         origin of the article
 */
public record SpecificSiteDto(
        String title,
        LocalDate dataCreatedDate,
        String content,
        String imageUrl,
        String sources
) {
}
Lines changed: 6 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,6 @@
1+
package org.tuna.zoopzoop.backend.domain.datasource.crawler.dto;
2+
3+
/**
 * Immutable carrier for a page taken from a site that is not specifically handled.
 *
 * @param rawHtml full HTML text of the page
 */
public record UnspecificSiteDto(String rawHtml) {
}
Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -1,12 +1,12 @@
11
package org.tuna.zoopzoop.backend.domain.datasource.crawler.service;
22

33
import org.jsoup.nodes.Document;
4-
import org.tuna.zoopzoop.backend.domain.datasource.dto.ArticleData;
4+
import org.tuna.zoopzoop.backend.domain.datasource.crawler.dto.CrawlerResult;
55

66
import java.time.LocalDate;
77

88
public interface Crawler {
99
boolean supports(String domain);
10-
ArticleData extract(Document doc);
10+
CrawlerResult<?> extract(Document doc);
1111
LocalDate transLocalDate(String rawDate);
1212
}

src/main/java/org/tuna/zoopzoop/backend/domain/datasource/crawler/service/CrawlerManagerService.java

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -4,7 +4,7 @@
44
import org.jsoup.Jsoup;
55
import org.jsoup.nodes.Document;
66
import org.springframework.stereotype.Service;
7-
import org.tuna.zoopzoop.backend.domain.datasource.dto.ArticleData;
7+
import org.tuna.zoopzoop.backend.domain.datasource.crawler.dto.CrawlerResult;
88

99
import java.io.IOException;
1010
import java.util.List;
@@ -14,7 +14,7 @@
1414
public class CrawlerManagerService {
1515
private final List<Crawler> crawlers;
1616

17-
public ArticleData extractContent(String url) throws IOException {
17+
public CrawlerResult<?> extractContent(String url) throws IOException {
1818
Document doc = Jsoup.connect(url)
1919
.userAgent("Mozilla/5.0")
2020
.timeout(10000)

src/main/java/org/tuna/zoopzoop/backend/domain/datasource/crawler/service/GenericCrawler.java

Lines changed: 13 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -4,7 +4,8 @@
44
import org.springframework.core.Ordered;
55
import org.springframework.core.annotation.Order;
66
import org.springframework.stereotype.Component;
7-
import org.tuna.zoopzoop.backend.domain.datasource.dto.ArticleData;
7+
import org.tuna.zoopzoop.backend.domain.datasource.crawler.dto.CrawlerResult;
8+
import org.tuna.zoopzoop.backend.domain.datasource.crawler.dto.UnspecificSiteDto;
89

910
import java.time.LocalDate;
1011

@@ -17,8 +18,17 @@ public boolean supports(String url) {
1718
}
1819

1920
@Override
20-
public ArticleData extract(Document doc) {
21-
return new ArticleData(null, null, null, null, null, doc.outerHtml());
21+
public CrawlerResult<?> extract(Document doc) {
22+
// 불필요한 태그 제거
23+
doc.select("script, style, noscript, iframe, nav, header, footer, form, aside, meta, link").remove();
24+
25+
// 본문만 가져오기 (HTML)
26+
String cleanHtml = doc.body().html();
27+
28+
return new CrawlerResult<>(
29+
CrawlerResult.CrawlerType.UNSPECIFIC,
30+
new UnspecificSiteDto(cleanHtml)
31+
);
2232
}
2333

2434
@Override

0 commit comments

Comments
 (0)