Skip to content

Commit 03e06b8

Browse files
committed
fix: 벨로그 크롤러 예외 처리 추가 (HTML 요소 없을 시 빈 문자열, 작성일 없을 시 기본값)
1 parent 5c59e57 commit 03e06b8

File tree

3 files changed

+38
-10
lines changed

3 files changed

+38
-10
lines changed

src/main/java/org/tuna/zoopzoop/backend/domain/datasource/crawler/service/Crawler.java

Lines changed: 6 additions & 0 deletions
Original file line number | Diff line number | Diff line change
@@ -7,6 +7,12 @@
77
import java.time.LocalDate;
88

99
public interface Crawler {
10+
11+
// 작성 일자가 null일 경우 기본값 설정
12+
// LocalDate.EPOCH(1970-01-01 - 시간이 없는 값 표현할 때 사용되는 관용적 기준점)
13+
// 이 값이 사용되면 작성 일자가 없는 것으로 간주
14+
LocalDate DEFAULT_DATE = LocalDate.EPOCH;
15+
1016
boolean supports(String domain);
1117
CrawlerResult<?> extract(Document doc) throws IOException;
1218
LocalDate transLocalDate(String rawDate);

src/main/java/org/tuna/zoopzoop/backend/domain/datasource/crawler/service/VelogCrawler.java

Lines changed: 13 additions & 10 deletions
Original file line number | Diff line number | Diff line change
@@ -1,6 +1,7 @@
11
package org.tuna.zoopzoop.backend.domain.datasource.crawler.service;
22

33
import org.jsoup.nodes.Document;
4+
import org.jsoup.nodes.Element;
45
import org.springframework.core.Ordered;
56
import org.springframework.core.annotation.Order;
67
import org.springframework.stereotype.Component;
@@ -22,25 +23,28 @@ public boolean supports(String domain) {
2223
}
2324

2425
@Override
25-
public CrawlerResult<?> extract(Document doc) {
26+
public CrawlerResult<SpecificSiteDto> extract(Document doc) {
2627
// 제목
27-
String title = doc.selectFirst("meta[property=og:title]").attr("content");
28+
Element titleElement = doc.selectFirst("meta[property=og:title]");
29+
String title = titleElement != null ? titleElement.attr("content") : "";
2830

2931
// 작성 날짜
30-
String publishedAt = doc.selectFirst(
31-
"div.information > span:not([class])"
32-
).text();
32+
Element publishedAtElement = doc.selectFirst("div.information > span:not([class])");
33+
String publishedAt = publishedAtElement != null ? publishedAtElement.text() : "";
3334

34-
LocalDate dataCreatedDate = transLocalDate(publishedAt);
35+
LocalDate dataCreatedDate = publishedAt.isBlank() ? DEFAULT_DATE : transLocalDate(publishedAt) ;
3536

3637
// 내용(ai한테 줘야함)
37-
String content = doc.selectFirst("div.atom-one").text();
38+
Element contentElement = doc.selectFirst("div.atom-one");
39+
String content = contentElement != null ? contentElement.text() : "";
3840

3941
// 썸네일 이미지 url
40-
String imageUrl = doc.selectFirst("meta[property=og:image]").attr("content");
42+
Element imageUrlElement = doc.selectFirst("meta[property=og:image]");
43+
String imageUrl = imageUrlElement != null ? imageUrlElement.attr("content") : "";
4144

4245
// 출처
43-
String source = doc.selectFirst("span.username > a").text();
46+
Element sourceElement = doc.selectFirst("span.username > a");
47+
String source = sourceElement != null ? sourceElement.text() : "";
4448

4549
return new CrawlerResult<>(
4650
CrawlerResult.CrawlerType.SPECIFIC,
@@ -60,7 +64,6 @@ public LocalDate transLocalDate(String rawDate) {
6064
return LocalDate.now().minusDays(1);
6165
}
6266

63-
6467
return LocalDate.parse(rawDate, VELOG_FORMATTER);
6568
}
6669
}

src/test/java/org/tuna/zoopzoop/backend/domain/datasource/crawler/service/VelogCrawlerTest.java

Lines changed: 19 additions & 0 deletions
Original file line number | Diff line number | Diff line change
@@ -2,10 +2,13 @@
22

33
import org.jsoup.Jsoup;
44
import org.jsoup.nodes.Document;
5+
import org.junit.jupiter.api.DisplayName;
56
import org.junit.jupiter.api.Test;
67
import org.tuna.zoopzoop.backend.domain.datasource.crawler.dto.CrawlerResult;
8+
import org.tuna.zoopzoop.backend.domain.datasource.crawler.dto.SpecificSiteDto;
79

810
import java.io.IOException;
11+
import java.time.LocalDate;
912

1013
import static org.assertj.core.api.AssertionsForClassTypes.assertThat;
1114

@@ -14,11 +17,27 @@ class VelogCrawlerTest {
1417
private final VelogCrawler velogCrawler = new VelogCrawler();
1518

1619
@Test
20+
@DisplayName("Velog 크롤러 작동 테스트")
1721
void testExtract() throws IOException {
1822
Document doc = Jsoup.connect("https://velog.io/@imcute0703123/%EC%BA%A0%ED%8D%BC%EC%8A%A4%EC%8B%9C%EA%B7%B8%EB%84%90-2025-%EB%B0%B1%EC%84%9D%EC%97%B0%ED%99%94-1%EB%93%B1-%EB%B6%80%EC%8A%A4-%EB%9F%AC%EB%B8%8C%EB%9D%BC%EC%9D%B8-%EB%A7%A4%EC%B9%AD-%EC%84%9C%EB%B9%84%EC%8A%A4-%ED%9A%8C%EA%B3%A0%EB%A1%9D").get();
1923
CrawlerResult<?> result = velogCrawler.extract(doc);
2024
assertThat(result).isNotNull();
2125

2226
System.out.println(result);
2327
}
28+
29+
@Test
30+
@DisplayName("Velog 크롤러 예외처리 테스트")
31+
void testExtractException() throws IOException {
32+
String html = "<html><head></head><body></body></html>";
33+
Document doc = Jsoup.parse(html);
34+
35+
SpecificSiteDto result = velogCrawler.extract(doc).data();
36+
37+
assertThat(result.title()).isEmpty();
38+
assertThat(result.content()).isEmpty();
39+
assertThat(result.imageUrl()).isEmpty();
40+
assertThat(result.source()).isEmpty();
41+
assertThat(result.dataCreatedDate()).isEqualTo(LocalDate.EPOCH);
42+
}
2443
}

0 commit comments

Comments
 (0)