
Commit 16e6bf7

feat/OPS-393: Create Tistory crawler (#145)
1 parent e874b2c commit 16e6bf7

File tree

16 files changed, +642 -173 lines changed


build.gradle

Lines changed: 4 additions & 0 deletions

@@ -128,6 +128,10 @@ dependencies {
 
     // Elastic Search
     implementation 'org.springframework.boot:spring-boot-starter-data-elasticsearch'
+
+    // retry (for AI retries)
+    implementation 'org.springframework.retry:spring-retry'
+    implementation 'org.springframework:spring-aspects'
 }
 
 dependencyManagement {
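Note that spring-retry's annotations only take effect when retry processing is enabled on a configuration class (which is why spring-aspects is also pulled in). A minimal sketch of such a configuration, assuming a class named RetryConfig that is not part of this diff excerpt:

import org.springframework.context.annotation.Configuration;
import org.springframework.retry.annotation.EnableRetry;

// Enables processing of @Retryable / @Recover annotations application-wide.
@Configuration
@EnableRetry
public class RetryConfig {
}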

src/main/java/org/tuna/zoopzoop/backend/domain/datasource/ai/prompt/AiPrompt.java

Lines changed: 1 addition & 1 deletion

@@ -41,7 +41,7 @@ public class AiPrompt {
 - 제공된 태그와 중복 가능하다.
 - 필요하면 새로운 태그를 만들어도 된다.
 4. 출력은 반드시 아래 JSON 형식으로 해라. Markdown 문법(```)은 쓰지 마라.
-- 해당 정보가 없으면 null말고 무조건 빈 문자열로 출력해줘라.
+- 해당정보가 없을 시 summary는 빈 문자열, category는 null, tags는 빈 리스트로 출력해줘라.
 
 [출력 JSON 형식]
 {

src/main/java/org/tuna/zoopzoop/backend/domain/datasource/ai/service/AiService.java

Lines changed: 37 additions & 2 deletions

@@ -1,23 +1,32 @@
 package org.tuna.zoopzoop.backend.domain.datasource.ai.service;
 
+import com.fasterxml.jackson.core.JsonParseException;
+import com.fasterxml.jackson.core.JsonProcessingException;
 import lombok.RequiredArgsConstructor;
 import org.springframework.ai.chat.client.ChatClient;
+import org.springframework.retry.annotation.Backoff;
+import org.springframework.retry.annotation.Recover;
+import org.springframework.retry.annotation.Retryable;
 import org.springframework.stereotype.Service;
 import org.tuna.zoopzoop.backend.domain.datasource.ai.dto.AiExtractorDto;
 import org.tuna.zoopzoop.backend.domain.datasource.ai.dto.AnalyzeContentDto;
 import org.tuna.zoopzoop.backend.domain.datasource.ai.prompt.AiPrompt;
 import org.tuna.zoopzoop.backend.domain.datasource.entity.Tag;
-import org.tuna.zoopzoop.backend.domain.datasource.repository.TagRepository;
 
+import java.util.ArrayList;
 import java.util.List;
 import java.util.stream.Collectors;
 
 @Service
 @RequiredArgsConstructor
 public class AiService {
     private final ChatClient chatClient;
-    private final TagRepository tagRepository;
 
+    @Retryable(
+            maxAttempts = 3,
+            backoff = @Backoff(delay = 500),
+            retryFor = {JsonParseException.class, JsonProcessingException.class}
+    )
     public AiExtractorDto extract(String rawHtml) {
         AiExtractorDto response = chatClient.prompt()
                 .user(AiPrompt.EXTRACTION.formatted(rawHtml))
@@ -27,6 +36,22 @@ public AiExtractorDto extract(String rawHtml) {
         return response;
     }
 
+    @Recover
+    public AiExtractorDto extractRecover(Exception e, String rawHtml) {
+        return new AiExtractorDto(
+                "",
+                null,
+                "",
+                "",
+                ""
+        );
+    }
+
+    @Retryable(
+            maxAttempts = 3,
+            backoff = @Backoff(delay = 500),
+            retryFor = {JsonParseException.class, JsonProcessingException.class}
+    )
    public AnalyzeContentDto analyzeContent(String content, List<Tag> tagList) {
        // Convert to a JSON array string
        String tags = tagList.stream()
@@ -41,4 +66,14 @@ public AnalyzeContentDto analyzeContent(String content, List<Tag> tagList) {
 
         return response;
     }
+
+    @Recover
+    public AnalyzeContentDto analyzeContentRecover(Exception e, String content, List<Tag> tagList) {
+        return new AnalyzeContentDto(
+                "",
+                null,
+                new ArrayList<>()
+        );
+    }
+
 }
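Spring Retry picks a @Recover fallback by matching the return type and the parameter list: the first parameter is the thrown exception (or a supertype) and the remaining parameters mirror the @Retryable method, which is exactly how extractRecover and analyzeContentRecover line up with their retried counterparts above. A minimal, self-contained sketch of the same pattern with purely illustrative names:

import org.springframework.retry.annotation.Backoff;
import org.springframework.retry.annotation.Recover;
import org.springframework.retry.annotation.Retryable;
import org.springframework.stereotype.Service;

@Service
public class FlakyService {

    // Retried up to 3 times with a 500 ms delay whenever IllegalStateException is thrown.
    @Retryable(maxAttempts = 3, backoff = @Backoff(delay = 500), retryFor = IllegalStateException.class)
    public String fetch(String key) {
        throw new IllegalStateException("upstream failure for " + key);
    }

    // Invoked after the final attempt fails; signature mirrors fetch(String) plus the exception.
    @Recover
    public String fetchFallback(IllegalStateException e, String key) {
        return ""; // degrade to an empty result instead of propagating the exception
    }
}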

src/main/java/org/tuna/zoopzoop/backend/domain/datasource/crawler/service/Crawler.java

Lines changed: 1 addition & 2 deletions

@@ -3,11 +3,10 @@
 import org.jsoup.nodes.Document;
 import org.tuna.zoopzoop.backend.domain.datasource.crawler.dto.CrawlerResult;
 
-import java.io.IOException;
 import java.time.LocalDate;
 
 public interface Crawler {
     boolean supports(String domain);
-    CrawlerResult<?> extract(Document doc) throws IOException;
+    CrawlerResult<?> extract(Document doc);
     LocalDate transLocalDate(String rawDate);
 }
src/main/java/org/tuna/zoopzoop/backend/domain/datasource/crawler/service/CrawlerManagerService.java

Lines changed: 2 additions & 8 deletions

@@ -1,30 +1,24 @@
 package org.tuna.zoopzoop.backend.domain.datasource.crawler.service;
 
 import lombok.RequiredArgsConstructor;
-import org.jsoup.Jsoup;
 import org.jsoup.nodes.Document;
 import org.springframework.stereotype.Service;
 import org.tuna.zoopzoop.backend.domain.datasource.crawler.dto.CrawlerResult;
 
-import java.io.IOException;
 import java.util.List;
 
 @Service
 @RequiredArgsConstructor
 public class CrawlerManagerService {
     private final List<Crawler> crawlers;
 
-    public CrawlerResult<?> extractContent(String url) throws IOException {
-        Document doc = Jsoup.connect(url)
-                .userAgent("Mozilla/5.0")
-                .timeout(10000)
-                .get();
-
+    public CrawlerResult<?> extractContent(String url, Document doc) {
         for (Crawler crawler : crawlers) {
             if (crawler.supports(url)) {
                 return crawler.extract(doc);
             }
         }
+
         return null;
     }
 }
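Since extractContent no longer opens the connection itself, the caller is now responsible for fetching the Document and passing it in. A minimal sketch of that caller side, reusing the same Jsoup settings the service used before; the class and method names here are illustrative, not part of the commit:

import org.jsoup.Jsoup;
import org.jsoup.nodes.Document;
import org.tuna.zoopzoop.backend.domain.datasource.crawler.dto.CrawlerResult;
import org.tuna.zoopzoop.backend.domain.datasource.crawler.service.CrawlerManagerService;

import java.io.IOException;

public class CrawlerClientExample {

    private final CrawlerManagerService crawlerManagerService;

    public CrawlerClientExample(CrawlerManagerService crawlerManagerService) {
        this.crawlerManagerService = crawlerManagerService;
    }

    public CrawlerResult<?> crawl(String url) throws IOException {
        // Fetching moved out of CrawlerManagerService: connect here, then hand the parsed Document over.
        Document doc = Jsoup.connect(url)
                .userAgent("Mozilla/5.0")
                .timeout(10_000)
                .get();

        return crawlerManagerService.extractContent(url, doc);
    }
}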

src/main/java/org/tuna/zoopzoop/backend/domain/datasource/crawler/service/GenericCrawler.java

Lines changed: 22 additions & 3 deletions

@@ -19,11 +19,30 @@ public boolean supports(String url) {
 
     @Override
     public CrawlerResult<?> extract(Document doc) {
-        // Remove unnecessary tags
-        doc.select("script, style, noscript, meta, link").remove();
+        // img tags
+        doc.select("img[src]").forEach(el ->
+                el.attr("src", el.absUrl("src"))
+        );
+
+        // meta tags (Open Graph, Twitter Card, etc.)
+        doc.select("meta[content]").forEach(meta -> {
+            String absUrl = meta.absUrl("content");
+            if (!absUrl.isEmpty() && !absUrl.equals(meta.attr("content"))) {
+                meta.attr("content", absUrl);
+            }
+        });
 
         // Extract only the body (HTML)
-        String cleanHtml = doc.body().html();
+        String cleanHtml = doc.body().html()
+                .replaceAll("<script[^>]*>.*?</script>", "")
+                .replaceAll("<style[^>]*>.*?</style>", "")
+                // Remove comments
+                .replaceAll("<!--.*?-->", "")
+                // Collapse consecutive whitespace
+                .replaceAll("\\s+", " ")
+                // Remove unnecessary attributes
+                .replaceAll("(class|id|style|onclick|onload)=\"[^\"]*\"", "")
+                .trim();
 
         return new CrawlerResult<>(
                 CrawlerResult.CrawlerType.UNSPECIFIC,
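One caveat worth noting about the regex cleanup: String.replaceAll compiles its pattern without DOTALL, so "." does not match line terminators and a script or style block that spans multiple lines in the body HTML would survive the patterns above. If that matters, the (?s) inline flag makes the dot match newlines as well; a small sketch, not part of this commit:

// (?s) turns on DOTALL so ".*?" can match across line breaks inside the tag body.
String cleaned = html
        .replaceAll("(?s)<script[^>]*>.*?</script>", "")
        .replaceAll("(?s)<style[^>]*>.*?</style>", "")
        .replaceAll("(?s)<!--.*?-->", "");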
src/main/java/org/tuna/zoopzoop/backend/domain/datasource/crawler/service/NaverBlogCrawler.java

Lines changed: 79 additions & 25 deletions

@@ -1,5 +1,6 @@
 package org.tuna.zoopzoop.backend.domain.datasource.crawler.service;
 
+import org.jsoup.Connection;
 import org.jsoup.Jsoup;
 import org.jsoup.nodes.Document;
 import org.jsoup.nodes.Element;
@@ -9,11 +10,14 @@
 import org.springframework.stereotype.Component;
 import org.tuna.zoopzoop.backend.domain.datasource.crawler.dto.CrawlerResult;
 import org.tuna.zoopzoop.backend.domain.datasource.crawler.dto.SpecificSiteDto;
+import org.tuna.zoopzoop.backend.domain.datasource.exception.ServiceException;
 
-import java.io.IOException;
 import java.time.LocalDate;
 import java.time.LocalDateTime;
 import java.time.format.DateTimeFormatter;
+import java.util.Optional;
+import java.util.regex.Matcher;
+import java.util.regex.Pattern;
 
 @Component
 @Order(Ordered.HIGHEST_PRECEDENCE)
@@ -28,28 +32,38 @@ public boolean supports(String domain) {
     }
 
     @Override
-    public CrawlerResult<?> extract(Document doc) throws IOException {
+    public CrawlerResult<?> extract(Document doc) {
         /*
            The blog body is loaded inside <iframe id="mainFrame">, so we first fetch
            the main page, extract the iframe's src, and connect to that URL again
            to crawl the actual body content.
         */
-        Element iframe = doc.selectFirst("iframe#mainFrame");
-        String iframeUrl = iframe.absUrl("src");
+        String iframeUrl = Optional.ofNullable(doc.selectFirst("iframe#mainFrame"))
+                .map(el -> el.absUrl("src"))
+                .orElse("");
 
-        Document iframeDoc = Jsoup.connect(iframeUrl)
-                .userAgent("Mozilla/5.0") // setting a user-agent is recommended to avoid crawler blocking
-                .timeout(10 * 1000) // timeout (10 seconds)
-                .get();
+        Document iframeDoc;
+
+        try {
+            Connection.Response response = Jsoup.connect(iframeUrl)
+                    .userAgent("Mozilla/5.0")
+                    .timeout(5000)
+                    .followRedirects(true)
+                    .execute();
+
+            iframeDoc = response.parse();
+        } catch (Exception e) {
+            throw new ServiceException("URL 접속에 실패했습니다.");
+        }
 
         // Title
-        Element titleSpans = iframeDoc.selectFirst("div.se-module.se-module-text.se-title-text");
-        String title = titleSpans.text();
+        String title = iframeDoc.select("meta[property=og:title]").attr("content");
 
         // Published date
-        String publishedAt = iframeDoc.selectFirst("span.se_publishDate.pcol2").text();
-        LocalDateTime rawDate = LocalDateTime.parse(publishedAt, DateTimeFormatter.ofPattern("yyyy. M. d. HH:mm"));
-        LocalDate dataCreatedDate = rawDate.toLocalDate();
+        String publishedAt = Optional.ofNullable(iframeDoc.selectFirst("span.se_publishDate.pcol2"))
+                .map(Element::text)
+                .orElse("");
+        LocalDate dataCreatedDate = transLocalDate(publishedAt);
 
         // Content
         Elements spans = iframeDoc.select(".se-main-container span");
@@ -60,17 +74,11 @@ public CrawlerResult<?> extract(Document doc) throws IOException {
         String content = sb.toString();
 
         // Thumbnail image URL
-        Element img = iframeDoc.select("div.se-main-container img").first();
-
-        String imageUrl = "";
-        if (img != null) {
-            if (!img.attr("data-lazy-src").isEmpty()) {
-                imageUrl = img.attr("data-lazy-src");
-            }
-        }
+        String imageUrl = iframeDoc.select("meta[property=og:image]").attr("content");
 
         // Source
-        String source = "네이버 블로그";
+        String source = iframeDoc.select("meta[property=og:site_name]").attr("content");
+
 
         return new CrawlerResult<>(
                 CrawlerResult.CrawlerType.SPECIFIC,
@@ -79,8 +87,54 @@ public CrawlerResult<?> extract(Document doc) throws IOException {
     }
 
     @Override
-    public LocalDate transLocalDate(String rawDate) {
-        LocalDateTime dateTime = LocalDateTime.parse(rawDate, NAVERBLOG_FORMATTER);
-        return dateTime.toLocalDate(); // drop the time, keep only the date
+    public LocalDate transLocalDate(String publishedAt) {
+
+        if (publishedAt == null || publishedAt.isEmpty()) {
+            return null;
+        }
+
+        publishedAt = publishedAt.trim();
+
+        // "방금 전" (just now)
+        if (publishedAt.equals("방금 전")) {
+            return LocalDate.now();
+        }
+
+        // "N분 전" (N minutes ago)
+        Pattern minutePattern = Pattern.compile("(\\d+)분\\s*전");
+        Matcher minuteMatcher = minutePattern.matcher(publishedAt);
+        if (minuteMatcher.find()) {
+            int minutes = Integer.parseInt(minuteMatcher.group(1));
+            LocalDateTime pastTime = LocalDateTime.now().minusMinutes(minutes);
+            return pastTime.toLocalDate();
+        }
+
+        // "N시간 전" (N hours ago)
+        Pattern hourPattern = Pattern.compile("(\\d+)시간\\s*전");
+        Matcher hourMatcher = hourPattern.matcher(publishedAt);
+        if (hourMatcher.find()) {
+            int hours = Integer.parseInt(hourMatcher.group(1));
+            LocalDateTime pastTime = LocalDateTime.now().minusHours(hours);
+            return pastTime.toLocalDate();
+        }
+
+        // "yyyy. M. d. HH:mm" or "yyyy. M. d. H:mm" format
+        try {
+            // Strip the time portion and extract only the date
+            Pattern datePattern = Pattern.compile("(\\d{4})\\s*\\.\\s*(\\d{1,2})\\s*\\.\\s*(\\d{1,2})");
+            Matcher dateMatcher = datePattern.matcher(publishedAt);
+
+            if (dateMatcher.find()) {
+                int year = Integer.parseInt(dateMatcher.group(1));
+                int month = Integer.parseInt(dateMatcher.group(2));
+                int day = Integer.parseInt(dateMatcher.group(3));
+                return LocalDate.of(year, month, day);
+            }
+        } catch (Exception e) {
+            System.err.println("날짜 파싱 실패: " + publishedAt);
+        }
+
+        // Return null if parsing fails
+        return null;
     }
 }
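A quick sketch of how the relative-date handling behaves for typical inputs, assuming the class is named NaverBlogCrawler and has a no-arg constructor, which this excerpt does not show:

public class TransLocalDateExample {
    public static void main(String[] args) {
        NaverBlogCrawler crawler = new NaverBlogCrawler(); // assumed no-arg constructor

        System.out.println(crawler.transLocalDate("방금 전"));          // today ("just now")
        System.out.println(crawler.transLocalDate("30분 전"));          // today, unless the 30 minutes cross midnight
        System.out.println(crawler.transLocalDate("5시간 전"));         // today or yesterday, depending on the current time
        System.out.println(crawler.transLocalDate("2024. 3. 7. 9:15")); // 2024-03-07
        System.out.println(crawler.transLocalDate("unknown format"));   // null (unparseable input)
    }
}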

src/main/java/org/tuna/zoopzoop/backend/domain/datasource/crawler/service/NaverNewsCrawler.java

Lines changed: 21 additions & 8 deletions

@@ -1,6 +1,7 @@
 package org.tuna.zoopzoop.backend.domain.datasource.crawler.service;
 
 import org.jsoup.nodes.Document;
+import org.jsoup.nodes.Element;
 import org.springframework.core.Ordered;
 import org.springframework.core.annotation.Order;
 import org.springframework.stereotype.Component;
@@ -9,6 +10,7 @@
 
 import java.time.LocalDate;
 import java.time.format.DateTimeFormatter;
+import java.util.Optional;
 
 @Component
 @Order(Ordered.HIGHEST_PRECEDENCE)
@@ -25,22 +27,33 @@ public boolean supports(String domain) {
     @Override
     public CrawlerResult<?> extract(Document doc) {
         // Title
-        String title = doc.selectFirst("h2#title_area").text();
+        String title = doc.select("meta[property=og:title]").attr("content");
+
 
         // Published date
-        String publishedAt = doc.selectFirst(
-                "span.media_end_head_info_datestamp_time._ARTICLE_DATE_TIME"
-        ).attr("data-date-time");
-        LocalDate dataCreatedDate = transLocalDate(publishedAt);
+        String publishedAt = Optional.ofNullable(
+                        doc.selectFirst("span.media_end_head_info_datestamp_time._ARTICLE_DATE_TIME")
+                )
+                .map(el -> el.attr("data-date-time"))
+                .orElse(""); // empty string if absent
+
+        LocalDate dataCreatedDate = publishedAt.isEmpty()
+                ? null
+                : transLocalDate(publishedAt);
+
 
         // Content (to be passed to the AI)
-        String content = doc.select("article").text();
+        String content = Optional.ofNullable(doc.selectFirst("article"))
+                .map(Element::text)
+                .orElse("");
+
 
         // Thumbnail image URL
-        String imageUrl = doc.selectFirst("img#img1._LAZY_LOADING._LAZY_LOADING_INIT_HIDE").attr("data-src");
+        String imageUrl = doc.select("meta[property=og:image]").attr("content");
+
 
         // Source
-        String source = doc.selectFirst("span.media_end_head_top_logo_text").text();
+        String source = doc.select("meta[name=twitter:creator]").attr("content");
 
         return new CrawlerResult<>(
                 CrawlerResult.CrawlerType.SPECIFIC,
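One detail the og:-based selectors rely on: Jsoup's Elements.attr(...) returns an empty string when the selection is empty, so the title, image, and source lookups above degrade to "" rather than throwing when a meta tag is missing, which is why only the date and article body need the Optional wrapping. A small self-contained illustration:

import org.jsoup.Jsoup;
import org.jsoup.nodes.Document;

public class MetaAttrExample {
    public static void main(String[] args) {
        Document doc = Jsoup.parse("<html><head></head><body></body></html>");

        // No og:title meta tag exists, yet attr() on an empty selection returns "" instead of null.
        String title = doc.select("meta[property=og:title]").attr("content");
        System.out.println(title.isEmpty()); // true
    }
}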

src/main/java/org/tuna/zoopzoop/backend/domain/datasource/crawler/service/SupportedDomain.java

Lines changed: 2 additions & 1 deletion

@@ -3,7 +3,8 @@
 public enum SupportedDomain {
     NAVERNEWS("n.news.naver.com"),
     NAVERBLOG("blog.naver.com"),
-    VELOG("velog.io");
+    VELOG("velog.io"),
+    TISTORY("tistory.com");
 
     private final String domain;
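The TistoryCrawler implementation itself is not among the diffs shown in this excerpt of the 16 changed files. As a rough illustration only, a Tistory crawler following the Crawler interface and the meta-tag approach used by the other crawlers might look like the sketch below; the class name, selectors, and date format are all assumptions, not the commit's actual implementation:

package org.tuna.zoopzoop.backend.domain.datasource.crawler.service;

import org.jsoup.nodes.Document;
import org.springframework.stereotype.Component;
import org.tuna.zoopzoop.backend.domain.datasource.crawler.dto.CrawlerResult;

import java.time.LocalDate;
import java.time.format.DateTimeParseException;

// Hypothetical sketch only; the actual TistoryCrawler in this commit may look quite different.
@Component
public class TistoryCrawler implements Crawler {

    @Override
    public boolean supports(String domain) {
        return domain.contains("tistory.com"); // or resolve via SupportedDomain.TISTORY
    }

    @Override
    public CrawlerResult<?> extract(Document doc) {
        // Open Graph meta tags degrade to "" when absent, as in the other crawlers.
        String title = doc.select("meta[property=og:title]").attr("content");
        String imageUrl = doc.select("meta[property=og:image]").attr("content");
        String source = doc.select("meta[property=og:site_name]").attr("content");
        String content = doc.select("div.entry-content, div.article_view").text(); // selector is a guess
        LocalDate dataCreatedDate = transLocalDate(
                doc.select("meta[property=article:published_time]").attr("content"));

        // CrawlerResult / SpecificSiteDto construction is omitted because their
        // constructors are not shown in this excerpt of the diff.
        return null;
    }

    @Override
    public LocalDate transLocalDate(String rawDate) {
        // article:published_time is usually ISO-8601, e.g. "2024-03-07T09:15:00+09:00" (assumption).
        if (rawDate == null || rawDate.length() < 10) {
            return null;
        }
        try {
            return LocalDate.parse(rawDate.substring(0, 10));
        } catch (DateTimeParseException e) {
            return null;
        }
    }
}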
