diff --git a/src/main/java/org/tuna/zoopzoop/backend/domain/datasource/crawler/service/Crawler.java b/src/main/java/org/tuna/zoopzoop/backend/domain/datasource/crawler/service/Crawler.java index e8eb831a..d100093e 100644 --- a/src/main/java/org/tuna/zoopzoop/backend/domain/datasource/crawler/service/Crawler.java +++ b/src/main/java/org/tuna/zoopzoop/backend/domain/datasource/crawler/service/Crawler.java @@ -6,6 +6,12 @@ import java.time.LocalDate; public interface Crawler { + + // Default value to use when the publication date is null. + // LocalDate.EPOCH (1970-01-01) is a conventional reference point for representing a value that has no real date. + // When this value is used, the publication date is treated as absent. + LocalDate DEFAULT_DATE = LocalDate.EPOCH; + boolean supports(String domain); CrawlerResult extract(Document doc); LocalDate transLocalDate(String rawDate); diff --git a/src/main/java/org/tuna/zoopzoop/backend/domain/datasource/crawler/service/VelogCrawler.java b/src/main/java/org/tuna/zoopzoop/backend/domain/datasource/crawler/service/VelogCrawler.java index f7d6e178..13ce0f61 100644 --- a/src/main/java/org/tuna/zoopzoop/backend/domain/datasource/crawler/service/VelogCrawler.java +++ b/src/main/java/org/tuna/zoopzoop/backend/domain/datasource/crawler/service/VelogCrawler.java @@ -1,6 +1,9 @@ package org.tuna.zoopzoop.backend.domain.datasource.crawler.service; import org.jsoup.nodes.Document; +import org.jsoup.nodes.Element; +import org.springframework.core.Ordered; +import org.springframework.core.annotation.Order; import org.springframework.stereotype.Component; import org.tuna.zoopzoop.backend.domain.datasource.crawler.dto.CrawlerResult; import org.tuna.zoopzoop.backend.domain.datasource.crawler.dto.SpecificSiteDto; @@ -9,6 +12,7 @@ import java.time.format.DateTimeFormatter; @Component +@Order(Ordered.HIGHEST_PRECEDENCE) public class VelogCrawler implements Crawler{ private static final SupportedDomain DOMAIN = SupportedDomain.VELOG; private static final DateTimeFormatter VELOG_FORMATTER = DateTimeFormatter.ofPattern("yyyy년 M월 d일"); @@ -19,25 +23,28 @@ public boolean 
supports(String domain) { } @Override - public CrawlerResult extract(Document doc) { + public CrawlerResult extract(Document doc) { // 제목 - String title = doc.selectFirst("meta[property=og:title]").attr("content"); + Element titleElement = doc.selectFirst("meta[property=og:title]"); + String title = titleElement != null ? titleElement.attr("content") : ""; // 작성 날짜 - String publishedAt = doc.selectFirst( - "div.information > span:not([class])" - ).text(); + Element publishedAtElement = doc.selectFirst("div.information > span:not([class])"); + String publishedAt = publishedAtElement != null ? publishedAtElement.text() : ""; - LocalDate dataCreatedDate = transLocalDate(publishedAt); + LocalDate dataCreatedDate = publishedAt.isBlank() ? DEFAULT_DATE : transLocalDate(publishedAt) ; // 내용(ai한테 줘야함) - String content = doc.selectFirst("div.atom-one").text(); + Element contentElement = doc.selectFirst("div.atom-one"); + String content = contentElement != null ? contentElement.text() : ""; // 썸네일 이미지 url - String imageUrl = doc.selectFirst("meta[property=og:image]").attr("content"); + Element imageUrlElement = doc.selectFirst("meta[property=og:image]"); + String imageUrl = imageUrlElement != null ? imageUrlElement.attr("content") : ""; // 출처 - String source = doc.selectFirst("span.username > a").text(); + Element sourceElement = doc.selectFirst("span.username > a"); + String source = sourceElement != null ? 
sourceElement.text() : ""; return new CrawlerResult<>( CrawlerResult.CrawlerType.SPECIFIC, @@ -51,10 +58,10 @@ public LocalDate transLocalDate(String rawDate) { if(rawDate.contains("일 전")){ int daysAgo = Integer.parseInt(rawDate.split("일 전")[0].trim()); return LocalDate.now().minusDays(daysAgo); - }else if(rawDate.contains("방금 전")) { - return LocalDate.now(); - }else if(rawDate.contains("시간 전")||rawDate.contains("분 전")){ + }else if(rawDate.contains("시간 전")||rawDate.contains("방금 전")||rawDate.contains("분 전")){ return LocalDate.now(); + }else if (rawDate.contains("어제")){ + return LocalDate.now().minusDays(1); } return LocalDate.parse(rawDate, VELOG_FORMATTER); diff --git a/src/test/java/org/tuna/zoopzoop/backend/domain/datasource/crawler/service/VelogCrawlerTest.java b/src/test/java/org/tuna/zoopzoop/backend/domain/datasource/crawler/service/VelogCrawlerTest.java index 43db531f..e6029a05 100644 --- a/src/test/java/org/tuna/zoopzoop/backend/domain/datasource/crawler/service/VelogCrawlerTest.java +++ b/src/test/java/org/tuna/zoopzoop/backend/domain/datasource/crawler/service/VelogCrawlerTest.java @@ -1,18 +1,43 @@ package org.tuna.zoopzoop.backend.domain.datasource.crawler.service; +import org.jsoup.Jsoup; +import org.jsoup.nodes.Document; +import org.junit.jupiter.api.DisplayName; +import org.junit.jupiter.api.Test; +import org.tuna.zoopzoop.backend.domain.datasource.crawler.dto.CrawlerResult; +import org.tuna.zoopzoop.backend.domain.datasource.crawler.dto.SpecificSiteDto; + +import java.io.IOException; +import java.time.LocalDate; + +import static org.assertj.core.api.AssertionsForClassTypes.assertThat; + class VelogCrawlerTest { private final VelogCrawler velogCrawler = new VelogCrawler(); - // 날짜 바뀐 velog 포스트에 대해 에러 처리 필요 - // Text '어제' could not be parsed at index 0 -// java.time.format.DateTimeParseException -// @Test -// void testExtract() throws IOException { -// Document doc = 
Jsoup.connect("https://velog.io/@hyeonnnnn/VampireSurvivorsClone-04.-PoolManager").get(); -// CrawlerResult result = velogCrawler.extract(doc); -// assertThat(result).isNotNull(); -// -// System.out.println(result); -// } + @Test + @DisplayName("Velog 크롤러 작동 테스트") + void testExtract() throws IOException { + Document doc = Jsoup.connect("https://velog.io/@imcute0703123/%EC%BA%A0%ED%8D%BC%EC%8A%A4%EC%8B%9C%EA%B7%B8%EB%84%90-2025-%EB%B0%B1%EC%84%9D%EC%97%B0%ED%99%94-1%EB%93%B1-%EB%B6%80%EC%8A%A4-%EB%9F%AC%EB%B8%8C%EB%9D%BC%EC%9D%B8-%EB%A7%A4%EC%B9%AD-%EC%84%9C%EB%B9%84%EC%8A%A4-%ED%9A%8C%EA%B3%A0%EB%A1%9D").get(); + CrawlerResult result = velogCrawler.extract(doc); + assertThat(result).isNotNull(); + + System.out.println(result); + } + + @Test + @DisplayName("Velog 크롤러 예외처리 테스트") + void testExtractException() throws IOException { + String html = ""; + Document doc = Jsoup.parse(html); + + SpecificSiteDto result = velogCrawler.extract(doc).data(); + + assertThat(result.title()).isEmpty(); + assertThat(result.content()).isEmpty(); + assertThat(result.imageUrl()).isEmpty(); + assertThat(result.source()).isEmpty(); + assertThat(result.dataCreatedDate()).isEqualTo(LocalDate.EPOCH); + } } \ No newline at end of file