Skip to content

Commit 6907daf

Browse files
committed
refactor/OPS-290 : 데이터소스 정제화 기능 고도화
1 parent bb0312d commit 6907daf

File tree

16 files changed

+523
-174
lines changed

16 files changed

+523
-174
lines changed

build.gradle

Lines changed: 4 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -125,6 +125,10 @@ dependencies {
125125

126126
// Awaitility (비동기 테스트 지원)
127127
testImplementation 'org.awaitility:awaitility:4.2.0'
128+
129+
// retry (ai retry용)
130+
implementation 'org.springframework.retry:spring-retry'
131+
implementation 'org.springframework:spring-aspects'
128132
}
129133

130134
dependencyManagement {

gradlew

100644100755
File mode changed.

src/main/java/org/tuna/zoopzoop/backend/domain/datasource/ai/prompt/AiPrompt.java

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -41,7 +41,7 @@ public class AiPrompt {
4141
- 제공된 태그와 중복 가능하다.
4242
- 필요하면 새로운 태그를 만들어도 된다.
4343
4. 출력은 반드시 아래 JSON 형식으로 해라. Markdown 문법(```)은 쓰지 마라.
44-
- 해당 정보가 없으면 null 말고 무조건 문자열로 출력해줘라.
44+
- 해당 정보가 없을 시 summary하고 content는 문자열, category는 null로 출력해줘라.
4545
4646
[출력 JSON 형식]
4747
{

src/main/java/org/tuna/zoopzoop/backend/domain/datasource/ai/service/AiService.java

Lines changed: 39 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -1,32 +1,59 @@
11
package org.tuna.zoopzoop.backend.domain.datasource.ai.service;
22

3+
import com.fasterxml.jackson.core.JsonParseException;
4+
import com.fasterxml.jackson.core.JsonProcessingException;
35
import lombok.RequiredArgsConstructor;
46
import org.springframework.ai.chat.client.ChatClient;
7+
import org.springframework.retry.annotation.Backoff;
8+
import org.springframework.retry.annotation.Recover;
9+
import org.springframework.retry.annotation.Retryable;
510
import org.springframework.stereotype.Service;
611
import org.tuna.zoopzoop.backend.domain.datasource.ai.dto.AiExtractorDto;
712
import org.tuna.zoopzoop.backend.domain.datasource.ai.dto.AnalyzeContentDto;
813
import org.tuna.zoopzoop.backend.domain.datasource.ai.prompt.AiPrompt;
914
import org.tuna.zoopzoop.backend.domain.datasource.entity.Tag;
10-
import org.tuna.zoopzoop.backend.domain.datasource.repository.TagRepository;
1115

16+
import java.util.ArrayList;
1217
import java.util.List;
1318
import java.util.stream.Collectors;
1419

1520
@Service
1621
@RequiredArgsConstructor
1722
public class AiService {
1823
private final ChatClient chatClient;
19-
private final TagRepository tagRepository;
2024

25+
@Retryable(
26+
maxAttempts = 3,
27+
backoff = @Backoff(delay = 500),
28+
retryFor = {JsonParseException.class, JsonProcessingException.class}
29+
)
2130
public AiExtractorDto extract(String rawHtml) {
31+
System.out.println("AI 추출 시도");
2232
AiExtractorDto response = chatClient.prompt()
2333
.user(AiPrompt.EXTRACTION.formatted(rawHtml))
2434
.call()
2535
.entity(AiExtractorDto.class);
2636

37+
System.out.println("AI 추출 성공!");
2738
return response;
2839
}
2940

41+
@Recover
42+
public AiExtractorDto extractRecover(Exception e, String rawHtml) {
43+
return new AiExtractorDto(
44+
"",
45+
null,
46+
"",
47+
"",
48+
""
49+
);
50+
}
51+
52+
@Retryable(
53+
maxAttempts = 3,
54+
backoff = @Backoff(delay = 500),
55+
retryFor = {JsonParseException.class, JsonProcessingException.class}
56+
)
3057
public AnalyzeContentDto analyzeContent(String content, List<Tag> tagList) {
3158
// JSON 배열 문자열로 변환
3259
String tags = tagList.stream()
@@ -41,4 +68,14 @@ public AnalyzeContentDto analyzeContent(String content, List<Tag> tagList) {
4168

4269
return response;
4370
}
71+
72+
@Recover
73+
public AnalyzeContentDto analyzeContentRecover(Exception e, String content, List<Tag> tagList) {
74+
return new AnalyzeContentDto(
75+
"",
76+
null,
77+
new ArrayList<>()
78+
);
79+
}
80+
4481
}

src/main/java/org/tuna/zoopzoop/backend/domain/datasource/crawler/service/Crawler.java

Lines changed: 1 addition & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -3,11 +3,10 @@
33
import org.jsoup.nodes.Document;
44
import org.tuna.zoopzoop.backend.domain.datasource.crawler.dto.CrawlerResult;
55

6-
import java.io.IOException;
76
import java.time.LocalDate;
87

98
public interface Crawler {
109
boolean supports(String domain);
11-
CrawlerResult<?> extract(Document doc) throws IOException;
10+
CrawlerResult<?> extract(Document doc);
1211
LocalDate transLocalDate(String rawDate);
1312
}
Lines changed: 2 additions & 8 deletions
Original file line numberDiff line numberDiff line change
@@ -1,30 +1,24 @@
11
package org.tuna.zoopzoop.backend.domain.datasource.crawler.service;
22

33
import lombok.RequiredArgsConstructor;
4-
import org.jsoup.Jsoup;
54
import org.jsoup.nodes.Document;
65
import org.springframework.stereotype.Service;
76
import org.tuna.zoopzoop.backend.domain.datasource.crawler.dto.CrawlerResult;
87

9-
import java.io.IOException;
108
import java.util.List;
119

1210
@Service
1311
@RequiredArgsConstructor
1412
public class CrawlerManagerService {
1513
private final List<Crawler> crawlers;
1614

17-
public CrawlerResult<?> extractContent(String url) throws IOException {
18-
Document doc = Jsoup.connect(url)
19-
.userAgent("Mozilla/5.0")
20-
.timeout(10000)
21-
.get();
22-
15+
public CrawlerResult<?> extractContent(String url, Document doc) {
2316
for (Crawler crawler : crawlers) {
2417
if (crawler.supports(url)) {
2518
return crawler.extract(doc);
2619
}
2720
}
21+
2822
return null;
2923
}
3024
}

src/main/java/org/tuna/zoopzoop/backend/domain/datasource/crawler/service/GenericCrawler.java

Lines changed: 22 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -19,11 +19,30 @@ public boolean supports(String url) {
1919

2020
@Override
2121
public CrawlerResult<?> extract(Document doc) {
22-
// 불필요한 태그 제거
23-
doc.select("script, style, noscript, meta, link").remove();
22+
// img 태그
23+
doc.select("img[src]").forEach(el ->
24+
el.attr("src", el.absUrl("src"))
25+
);
26+
27+
// meta 태그 (Open Graph, Twitter Card 등)
28+
doc.select("meta[content]").forEach(meta -> {
29+
String absUrl = meta.absUrl("content");
30+
if (!absUrl.isEmpty() && !absUrl.equals(meta.attr("content"))) {
31+
meta.attr("content", absUrl);
32+
}
33+
});
2434

2535
// 본문만 가져오기 (HTML)
26-
String cleanHtml = doc.body().html();
36+
String cleanHtml = doc.body().html()
37+
.replaceAll("<script[^>]*>.*?</script>", "")
38+
.replaceAll("<style[^>]*>.*?</style>", "")
39+
// 주석 제거
40+
.replaceAll("<!--.*?-->", "")
41+
// 연속된 공백 제거
42+
.replaceAll("\\s+", " ")
43+
// 불필요한 속성 제거
44+
.replaceAll("(class|id|style|onclick|onload)=\"[^\"]*\"", "")
45+
.trim();
2746

2847
return new CrawlerResult<>(
2948
CrawlerResult.CrawlerType.UNSPECIFIC,
Lines changed: 80 additions & 24 deletions
Original file line numberDiff line numberDiff line change
@@ -1,5 +1,6 @@
11
package org.tuna.zoopzoop.backend.domain.datasource.crawler.service;
22

3+
import org.jsoup.Connection;
34
import org.jsoup.Jsoup;
45
import org.jsoup.nodes.Document;
56
import org.jsoup.nodes.Element;
@@ -9,11 +10,14 @@
910
import org.springframework.stereotype.Component;
1011
import org.tuna.zoopzoop.backend.domain.datasource.crawler.dto.CrawlerResult;
1112
import org.tuna.zoopzoop.backend.domain.datasource.crawler.dto.SpecificSiteDto;
13+
import org.tuna.zoopzoop.backend.domain.datasource.exception.ServiceException;
1214

13-
import java.io.IOException;
1415
import java.time.LocalDate;
1516
import java.time.LocalDateTime;
1617
import java.time.format.DateTimeFormatter;
18+
import java.util.Optional;
19+
import java.util.regex.Matcher;
20+
import java.util.regex.Pattern;
1721

1822
@Component
1923
@Order(Ordered.HIGHEST_PRECEDENCE)
@@ -28,28 +32,40 @@ public boolean supports(String domain) {
2832
}
2933

3034
@Override
31-
public CrawlerResult<?> extract(Document doc) throws IOException {
35+
public CrawlerResult<?> extract(Document doc) {
3236
/*
3337
블로그 본문은 <iframe id="mainFrame"> 안에 로드되므로
3438
먼저 메인 페이지를 가져온 뒤 iframe의 src를 추출하여
3539
해당 URL로 다시 connect 해야 실제 본문 내용을 크롤링할 수 있다.
3640
*/
37-
Element iframe = doc.selectFirst("iframe#mainFrame");
38-
String iframeUrl = iframe.absUrl("src");
41+
String iframeUrl = Optional.ofNullable(doc.selectFirst("iframe#mainFrame"))
42+
.map(el -> el.absUrl("src"))
43+
.orElse("");
3944

40-
Document iframeDoc = Jsoup.connect(iframeUrl)
41-
.userAgent("Mozilla/5.0") // 크롤링 차단 방지를 위해 user-agent 설정 권장
42-
.timeout(10 * 1000) // 타임아웃 (10초)
43-
.get();
45+
Document iframeDoc;
46+
47+
try {
48+
Connection.Response response = Jsoup.connect(iframeUrl)
49+
.userAgent("Mozilla/5.0")
50+
.timeout(10000)
51+
.followRedirects(true)
52+
.execute();
53+
54+
iframeDoc = response.parse();
55+
} catch (Exception e) {
56+
throw new ServiceException("URL 접속에 실패했습니다.");
57+
}
4458

4559
// 제목
46-
Element titleSpans = iframeDoc.selectFirst("div.se-module.se-module-text.se-title-text");
47-
String title = titleSpans.text();
60+
String title = iframeDoc.select("meta[property=og:title]").attr("content");
61+
4862

4963
// 작성일자
50-
String publishedAt = iframeDoc.selectFirst("span.se_publishDate.pcol2").text();
51-
LocalDateTime rawDate = LocalDateTime.parse(publishedAt, DateTimeFormatter.ofPattern("yyyy. M. d. HH:mm"));
52-
LocalDate dataCreatedDate = rawDate.toLocalDate();
64+
String publishedAt = Optional.ofNullable(iframeDoc.selectFirst("span.se_publishDate.pcol2"))
65+
.map(Element::text)
66+
.orElse("");
67+
LocalDate dataCreatedDate = transLocalDate(publishedAt);
68+
5369

5470
// 내용
5571
Elements spans = iframeDoc.select(".se-main-container span");
@@ -59,18 +75,13 @@ public CrawlerResult<?> extract(Document doc) throws IOException {
5975
}
6076
String content = sb.toString();
6177

78+
6279
// 썸네일 이미지 URL
63-
Element img = iframeDoc.select("div.se-main-container img").first();
80+
String imageUrl = iframeDoc.select("meta[property=og:image]").attr("content");
6481

65-
String imageUrl = "";
66-
if (img != null) {
67-
if (!img.attr("data-lazy-src").isEmpty()) {
68-
imageUrl = img.attr("data-lazy-src");
69-
}
70-
}
7182

7283
// 출처
73-
String source = "네이버 블로그";
84+
String source = iframeDoc.select("meta[property=og:site_name]").attr("content");
7485

7586
return new CrawlerResult<>(
7687
CrawlerResult.CrawlerType.SPECIFIC,
@@ -79,8 +90,53 @@ public CrawlerResult<?> extract(Document doc) throws IOException {
7990
}
8091

8192
@Override
82-
public LocalDate transLocalDate(String rawDate) {
83-
LocalDateTime dateTime = LocalDateTime.parse(rawDate, NAVERBLOG_FORMATTER);
84-
return dateTime.toLocalDate(); // 시간 버리고 날짜만
93+
public LocalDate transLocalDate(String publishedAt) {
94+
if (publishedAt == null || publishedAt.isEmpty()) {
95+
return null;
96+
}
97+
98+
publishedAt = publishedAt.trim();
99+
100+
// "방금 전"
101+
if (publishedAt.equals("방금 전")) {
102+
return LocalDate.now();
103+
}
104+
105+
// "?분 전"
106+
Pattern minutePattern = Pattern.compile("(\\d+)분\\s*전");
107+
Matcher minuteMatcher = minutePattern.matcher(publishedAt);
108+
if (minuteMatcher.find()) {
109+
int minutes = Integer.parseInt(minuteMatcher.group(1));
110+
LocalDateTime pastTime = LocalDateTime.now().minusMinutes(minutes);
111+
return pastTime.toLocalDate();
112+
}
113+
114+
// "?시간 전"
115+
Pattern hourPattern = Pattern.compile("(\\d+)시간\\s*전");
116+
Matcher hourMatcher = hourPattern.matcher(publishedAt);
117+
if (hourMatcher.find()) {
118+
int hours = Integer.parseInt(hourMatcher.group(1));
119+
LocalDateTime pastTime = LocalDateTime.now().minusHours(hours);
120+
return pastTime.toLocalDate();
121+
}
122+
123+
// "yyyy. M. d. HH:mm" 또는 "yyyy. M. d. H:mm" 형식
124+
try {
125+
// 시간 부분 제거하고 날짜만 추출
126+
Pattern datePattern = Pattern.compile("(\\d{4})\\s*\\.\\s*(\\d{1,2})\\s*\\.\\s*(\\d{1,2})");
127+
Matcher dateMatcher = datePattern.matcher(publishedAt);
128+
129+
if (dateMatcher.find()) {
130+
int year = Integer.parseInt(dateMatcher.group(1));
131+
int month = Integer.parseInt(dateMatcher.group(2));
132+
int day = Integer.parseInt(dateMatcher.group(3));
133+
return LocalDate.of(year, month, day);
134+
}
135+
} catch (Exception e) {
136+
System.err.println("날짜 파싱 실패: " + publishedAt);
137+
}
138+
139+
// 파싱 실패 시 null 반환
140+
return null;
85141
}
86142
}

src/main/java/org/tuna/zoopzoop/backend/domain/datasource/crawler/service/NaverNewsCrawler.java

Lines changed: 21 additions & 8 deletions
Original file line numberDiff line numberDiff line change
@@ -1,6 +1,7 @@
11
package org.tuna.zoopzoop.backend.domain.datasource.crawler.service;
22

33
import org.jsoup.nodes.Document;
4+
import org.jsoup.nodes.Element;
45
import org.springframework.core.Ordered;
56
import org.springframework.core.annotation.Order;
67
import org.springframework.stereotype.Component;
@@ -9,6 +10,7 @@
910

1011
import java.time.LocalDate;
1112
import java.time.format.DateTimeFormatter;
13+
import java.util.Optional;
1214

1315
@Component
1416
@Order(Ordered.HIGHEST_PRECEDENCE)
@@ -25,22 +27,33 @@ public boolean supports(String domain) {
2527
@Override
2628
public CrawlerResult<?> extract(Document doc) {
2729
// 제목
28-
String title = doc.selectFirst("h2#title_area").text();
30+
String title = doc.select("meta[property=og:title]").attr("content");
31+
2932

3033
// 작성 날짜
31-
String publishedAt = doc.selectFirst(
32-
"span.media_end_head_info_datestamp_time._ARTICLE_DATE_TIME"
33-
).attr("data-date-time");
34-
LocalDate dataCreatedDate = transLocalDate(publishedAt);
34+
String publishedAt = Optional.ofNullable(
35+
doc.selectFirst("span.media_end_head_info_datestamp_time._ARTICLE_DATE_TIME")
36+
)
37+
.map(el -> el.attr("data-date-time"))
38+
.orElse(""); // 값 없으면 빈 문자열
39+
40+
LocalDate dataCreatedDate = publishedAt.isEmpty()
41+
? null
42+
: transLocalDate(publishedAt);
43+
3544

3645
// 내용(ai한테 줘야함)
37-
String content = doc.select("article").text();
46+
String content = Optional.ofNullable(doc.selectFirst("article"))
47+
.map(Element::text)
48+
.orElse("");
49+
3850

3951
// 썸네일 이미지 url
40-
String imageUrl = doc.selectFirst("img#img1._LAZY_LOADING._LAZY_LOADING_INIT_HIDE").attr("data-src");
52+
String imageUrl = doc.select("meta[property=og:image]").attr("content");
53+
4154

4255
// 출처
43-
String source = doc.selectFirst("span.media_end_head_top_logo_text").text();
56+
String source = doc.select("meta[name=twitter:creator]").attr("content");
4457

4558
return new CrawlerResult<>(
4659
CrawlerResult.CrawlerType.SPECIFIC,

0 commit comments

Comments (0)