Skip to content

Commit f4a3380

Browse files
authored
fix: 벨로그 크롤러 작성일자 파싱 이슈 해결 (#111)
* fix: 작성일자 파싱 이슈 해결 * refactor: velog 크롤러 우선순위 높게 설정 * fix: 벨로그 크롤러 예외 처리 추가 (HTML 요소 없을 시 빈 문자열, 작성일 없을 시 기본값)
1 parent e6e05f1 commit f4a3380

File tree

3 files changed

+61
-23
lines changed

3 files changed

+61
-23
lines changed

src/main/java/org/tuna/zoopzoop/backend/domain/datasource/crawler/service/Crawler.java

Lines changed: 6 additions & 0 deletions
Original file line number · Diff line number · Diff line change
@@ -6,6 +6,12 @@
66
import java.time.LocalDate;
77

88
public interface Crawler {
9+
10+
// 작성 일자가 null일 경우 기본값 설정
11+
// LocalDate.EPOCH(1970-01-01 - 시간이 없는 값 표현할 때 사용되는 관용적 기준점)
12+
// 이 값이 사용되면 작성 일자가 없는 것으로 간주
13+
LocalDate DEFAULT_DATE = LocalDate.EPOCH;
14+
915
boolean supports(String domain);
1016
CrawlerResult<?> extract(Document doc);
1117
LocalDate transLocalDate(String rawDate);

src/main/java/org/tuna/zoopzoop/backend/domain/datasource/crawler/service/VelogCrawler.java

Lines changed: 19 additions & 12 deletions
Original file line number · Diff line number · Diff line change
@@ -1,6 +1,9 @@
11
package org.tuna.zoopzoop.backend.domain.datasource.crawler.service;
22

33
import org.jsoup.nodes.Document;
4+
import org.jsoup.nodes.Element;
5+
import org.springframework.core.Ordered;
6+
import org.springframework.core.annotation.Order;
47
import org.springframework.stereotype.Component;
58
import org.tuna.zoopzoop.backend.domain.datasource.crawler.dto.CrawlerResult;
69
import org.tuna.zoopzoop.backend.domain.datasource.crawler.dto.SpecificSiteDto;
@@ -9,6 +12,7 @@
912
import java.time.format.DateTimeFormatter;
1013

1114
@Component
15+
@Order(Ordered.HIGHEST_PRECEDENCE)
1216
public class VelogCrawler implements Crawler{
1317
private static final SupportedDomain DOMAIN = SupportedDomain.VELOG;
1418
private static final DateTimeFormatter VELOG_FORMATTER = DateTimeFormatter.ofPattern("yyyy년 M월 d일");
@@ -19,25 +23,28 @@ public boolean supports(String domain) {
1923
}
2024

2125
@Override
22-
public CrawlerResult<?> extract(Document doc) {
26+
public CrawlerResult<SpecificSiteDto> extract(Document doc) {
2327
// 제목
24-
String title = doc.selectFirst("meta[property=og:title]").attr("content");
28+
Element titleElement = doc.selectFirst("meta[property=og:title]");
29+
String title = titleElement != null ? titleElement.attr("content") : "";
2530

2631
// 작성 날짜
27-
String publishedAt = doc.selectFirst(
28-
"div.information > span:not([class])"
29-
).text();
32+
Element publishedAtElement = doc.selectFirst("div.information > span:not([class])");
33+
String publishedAt = publishedAtElement != null ? publishedAtElement.text() : "";
3034

31-
LocalDate dataCreatedDate = transLocalDate(publishedAt);
35+
LocalDate dataCreatedDate = publishedAt.isBlank() ? DEFAULT_DATE : transLocalDate(publishedAt) ;
3236

3337
// 내용(ai한테 줘야함)
34-
String content = doc.selectFirst("div.atom-one").text();
38+
Element contentElement = doc.selectFirst("div.atom-one");
39+
String content = contentElement != null ? contentElement.text() : "";
3540

3641
// 썸네일 이미지 url
37-
String imageUrl = doc.selectFirst("meta[property=og:image]").attr("content");
42+
Element imageUrlElement = doc.selectFirst("meta[property=og:image]");
43+
String imageUrl = imageUrlElement != null ? imageUrlElement.attr("content") : "";
3844

3945
// 출처
40-
String source = doc.selectFirst("span.username > a").text();
46+
Element sourceElement = doc.selectFirst("span.username > a");
47+
String source = sourceElement != null ? sourceElement.text() : "";
4148

4249
return new CrawlerResult<>(
4350
CrawlerResult.CrawlerType.SPECIFIC,
@@ -51,10 +58,10 @@ public LocalDate transLocalDate(String rawDate) {
5158
if(rawDate.contains("일 전")){
5259
int daysAgo = Integer.parseInt(rawDate.split("일 전")[0].trim());
5360
return LocalDate.now().minusDays(daysAgo);
54-
}else if(rawDate.contains("방금 전")) {
55-
return LocalDate.now();
56-
}else if(rawDate.contains("시간 전")||rawDate.contains("분 전")){
61+
}else if(rawDate.contains("시간 전")||rawDate.contains("방금 전")||rawDate.contains("분 전")){
5762
return LocalDate.now();
63+
}else if (rawDate.contains("어제")){
64+
return LocalDate.now().minusDays(1);
5865
}
5966

6067
return LocalDate.parse(rawDate, VELOG_FORMATTER);
Lines changed: 36 additions & 11 deletions
Original file line number · Diff line number · Diff line change
@@ -1,18 +1,43 @@
11
package org.tuna.zoopzoop.backend.domain.datasource.crawler.service;
22

3+
import org.jsoup.Jsoup;
4+
import org.jsoup.nodes.Document;
5+
import org.junit.jupiter.api.DisplayName;
6+
import org.junit.jupiter.api.Test;
7+
import org.tuna.zoopzoop.backend.domain.datasource.crawler.dto.CrawlerResult;
8+
import org.tuna.zoopzoop.backend.domain.datasource.crawler.dto.SpecificSiteDto;
9+
10+
import java.io.IOException;
11+
import java.time.LocalDate;
12+
13+
import static org.assertj.core.api.AssertionsForClassTypes.assertThat;
14+
315
class VelogCrawlerTest {
416

517
private final VelogCrawler velogCrawler = new VelogCrawler();
618

7-
// 날짜 바뀐 velog 포스트에 대해 에러 처리 필요
8-
// Text '어제' could not be parsed at index 0
9-
// java.time.format.DateTimeParseException
10-
// @Test
11-
// void testExtract() throws IOException {
12-
// Document doc = Jsoup.connect("https://velog.io/@hyeonnnnn/VampireSurvivorsClone-04.-PoolManager").get();
13-
// CrawlerResult<?> result = velogCrawler.extract(doc);
14-
// assertThat(result).isNotNull();
15-
//
16-
// System.out.println(result);
17-
// }
19+
@Test
20+
@DisplayName("Velog 크롤러 작동 테스트")
21+
void testExtract() throws IOException {
22+
Document doc = Jsoup.connect("https://velog.io/@imcute0703123/%EC%BA%A0%ED%8D%BC%EC%8A%A4%EC%8B%9C%EA%B7%B8%EB%84%90-2025-%EB%B0%B1%EC%84%9D%EC%97%B0%ED%99%94-1%EB%93%B1-%EB%B6%80%EC%8A%A4-%EB%9F%AC%EB%B8%8C%EB%9D%BC%EC%9D%B8-%EB%A7%A4%EC%B9%AD-%EC%84%9C%EB%B9%84%EC%8A%A4-%ED%9A%8C%EA%B3%A0%EB%A1%9D").get();
23+
CrawlerResult<?> result = velogCrawler.extract(doc);
24+
assertThat(result).isNotNull();
25+
26+
System.out.println(result);
27+
}
28+
29+
@Test
30+
@DisplayName("Velog 크롤러 예외처리 테스트")
31+
void testExtractException() throws IOException {
32+
String html = "<html><head></head><body></body></html>";
33+
Document doc = Jsoup.parse(html);
34+
35+
SpecificSiteDto result = velogCrawler.extract(doc).data();
36+
37+
assertThat(result.title()).isEmpty();
38+
assertThat(result.content()).isEmpty();
39+
assertThat(result.imageUrl()).isEmpty();
40+
assertThat(result.source()).isEmpty();
41+
assertThat(result.dataCreatedDate()).isEqualTo(LocalDate.EPOCH);
42+
}
1843
}

0 commit comments

Comments (0)