From 0648e41cd1f2bf2b2639e87cd4b73a0366545e07 Mon Sep 17 00:00:00 2001 From: taekyung Date: Wed, 1 Oct 2025 23:52:11 +0900 Subject: [PATCH 1/3] =?UTF-8?q?fix=20:=20=EC=9E=91=EC=84=B1=EC=9D=BC?= =?UTF-8?q?=EC=9E=90=20=ED=8C=8C=EC=8B=B1=20=EC=9D=B4=EC=8A=88=20=ED=95=B4?= =?UTF-8?q?=EA=B2=B0?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- .../crawler/service/VelogCrawler.java | 7 +++-- .../crawler/service/VelogCrawlerTest.java | 28 +++++++++++-------- 2 files changed, 21 insertions(+), 14 deletions(-) diff --git a/src/main/java/org/tuna/zoopzoop/backend/domain/datasource/crawler/service/VelogCrawler.java b/src/main/java/org/tuna/zoopzoop/backend/domain/datasource/crawler/service/VelogCrawler.java index f7d6e178..e8fd6cd1 100644 --- a/src/main/java/org/tuna/zoopzoop/backend/domain/datasource/crawler/service/VelogCrawler.java +++ b/src/main/java/org/tuna/zoopzoop/backend/domain/datasource/crawler/service/VelogCrawler.java @@ -51,12 +51,13 @@ public LocalDate transLocalDate(String rawDate) { if(rawDate.contains("일 전")){ int daysAgo = Integer.parseInt(rawDate.split("일 전")[0].trim()); return LocalDate.now().minusDays(daysAgo); - }else if(rawDate.contains("방금 전")) { - return LocalDate.now(); - }else if(rawDate.contains("시간 전")||rawDate.contains("분 전")){ + }else if(rawDate.contains("시간 전")||rawDate.contains("방금 전")||rawDate.contains("분 전")){ return LocalDate.now(); + }else if (rawDate.contains("어제")){ + return LocalDate.now().minusDays(1); } + return LocalDate.parse(rawDate, VELOG_FORMATTER); } } diff --git a/src/test/java/org/tuna/zoopzoop/backend/domain/datasource/crawler/service/VelogCrawlerTest.java b/src/test/java/org/tuna/zoopzoop/backend/domain/datasource/crawler/service/VelogCrawlerTest.java index 43db531f..d0e7bb2f 100644 --- a/src/test/java/org/tuna/zoopzoop/backend/domain/datasource/crawler/service/VelogCrawlerTest.java +++ b/src/test/java/org/tuna/zoopzoop/backend/domain/datasource/crawler/service/VelogCrawlerTest.java @@ -1,18 +1,24 @@ package org.tuna.zoopzoop.backend.domain.datasource.crawler.service; +import org.jsoup.Jsoup; +import org.jsoup.nodes.Document; +import org.junit.jupiter.api.Test; +import org.tuna.zoopzoop.backend.domain.datasource.crawler.dto.CrawlerResult; + +import java.io.IOException; + +import static org.assertj.core.api.AssertionsForClassTypes.assertThat; + class VelogCrawlerTest { private final VelogCrawler velogCrawler = new VelogCrawler(); - // 날짜 바뀐 velog 포스트에 대해 에러 처리 필요 - // Text '어제' could not be parsed at index 0 -// java.time.format.DateTimeParseException -// @Test -// void testExtract() throws IOException { -// Document doc = Jsoup.connect("https://velog.io/@hyeonnnnn/VampireSurvivorsClone-04.-PoolManager").get(); -// CrawlerResult result = velogCrawler.extract(doc); -// assertThat(result).isNotNull(); -// -// System.out.println(result); -// } + @Test + void testExtract() throws IOException { + Document doc = Jsoup.connect("https://velog.io/@imcute0703123/%EC%BA%A0%ED%8D%BC%EC%8A%A4%EC%8B%9C%EA%B7%B8%EB%84%90-2025-%EB%B0%B1%EC%84%9D%EC%97%B0%ED%99%94-1%EB%93%B1-%EB%B6%80%EC%8A%A4-%EB%9F%AC%EB%B8%8C%EB%9D%BC%EC%9D%B8-%EB%A7%A4%EC%B9%AD-%EC%84%9C%EB%B9%84%EC%8A%A4-%ED%9A%8C%EA%B3%A0%EB%A1%9D").get(); + CrawlerResult result = velogCrawler.extract(doc); + assertThat(result).isNotNull(); + + System.out.println(result); + } } \ No newline at end of file From bb9de31deed1355f30851207961a8ba8286f6e3e Mon Sep 17 00:00:00 2001 From: taekyung Date: Fri, 10 Oct 2025 14:51:56 +0900 Subject: [PATCH 2/3] =?UTF-8?q?refactor:=20velog=20=ED=81=AC=EB=A1=A4?= =?UTF-8?q?=EB=9F=AC=20=EC=9A=B0=EC=84=A0=EC=88=9C=EC=9C=84=20=EB=86=92?= =?UTF-8?q?=EA=B2=8C=20=EC=84=A4=EC=A0=95?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- .../domain/datasource/crawler/service/VelogCrawler.java | 3 +++ 1 file changed, 3 insertions(+) diff --git a/src/main/java/org/tuna/zoopzoop/backend/domain/datasource/crawler/service/VelogCrawler.java b/src/main/java/org/tuna/zoopzoop/backend/domain/datasource/crawler/service/VelogCrawler.java index e8fd6cd1..be3e2d30 100644 --- a/src/main/java/org/tuna/zoopzoop/backend/domain/datasource/crawler/service/VelogCrawler.java +++ b/src/main/java/org/tuna/zoopzoop/backend/domain/datasource/crawler/service/VelogCrawler.java @@ -1,6 +1,8 @@ package org.tuna.zoopzoop.backend.domain.datasource.crawler.service; import org.jsoup.nodes.Document; +import org.springframework.core.Ordered; +import org.springframework.core.annotation.Order; import org.springframework.stereotype.Component; import org.tuna.zoopzoop.backend.domain.datasource.crawler.dto.CrawlerResult; import org.tuna.zoopzoop.backend.domain.datasource.crawler.dto.SpecificSiteDto; @@ -9,6 +11,7 @@ import java.time.format.DateTimeFormatter; @Component +@Order(Ordered.HIGHEST_PRECEDENCE) public class VelogCrawler implements Crawler{ private static final SupportedDomain DOMAIN = SupportedDomain.VELOG; private static final DateTimeFormatter VELOG_FORMATTER = DateTimeFormatter.ofPattern("yyyy년 M월 d일"); From 47faabfa1ade073f4817a45a177fc896883abcda Mon Sep 17 00:00:00 2001 From: taekyung Date: Fri, 10 Oct 2025 16:17:00 +0900 Subject: [PATCH 3/3] =?UTF-8?q?fix:=20=EB=B2=A8=EB=A1=9C=EA=B7=B8=20?= =?UTF-8?q?=ED=81=AC=EB=A1=A4=EB=9F=AC=20=EC=98=88=EC=99=B8=20=EC=B2=98?= =?UTF-8?q?=EB=A6=AC=20=EC=B6=94=EA=B0=80=20(HTML=20=EC=9A=94=EC=86=8C=20?= =?UTF-8?q?=EC=97=86=EC=9D=84=20=EC=8B=9C=20=EB=B9=88=20=EB=AC=B8=EC=9E=90?= =?UTF-8?q?=EC=97=B4,=20=EC=9E=91=EC=84=B1=EC=9D=BC=20=EC=97=86=EC=9D=84?= =?UTF-8?q?=20=EC=8B=9C=20=EA=B8=B0=EB=B3=B8=EA=B0=92)?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- .../datasource/crawler/service/Crawler.java | 6 +++++ .../crawler/service/VelogCrawler.java | 23 +++++++++++-------- .../crawler/service/VelogCrawlerTest.java | 19 +++++++++++++++ 3 files changed, 38 insertions(+), 10 deletions(-) diff --git a/src/main/java/org/tuna/zoopzoop/backend/domain/datasource/crawler/service/Crawler.java b/src/main/java/org/tuna/zoopzoop/backend/domain/datasource/crawler/service/Crawler.java index e8eb831a..d100093e 100644 --- a/src/main/java/org/tuna/zoopzoop/backend/domain/datasource/crawler/service/Crawler.java +++ b/src/main/java/org/tuna/zoopzoop/backend/domain/datasource/crawler/service/Crawler.java @@ -6,6 +6,12 @@ import java.time.LocalDate; public interface Crawler { + + // 작성 일자가 null일 경우 기본값 설정 + // LocalDate.EPOCH(1970-01-01 - 시간이 없는 값 표현할 때 사용되는 관용적 기준점) + // 이 값이 사용되면 작성 일자가 없는 것으로 간주 + LocalDate DEFAULT_DATE = LocalDate.EPOCH; + boolean supports(String domain); CrawlerResult extract(Document doc); LocalDate transLocalDate(String rawDate); diff --git a/src/main/java/org/tuna/zoopzoop/backend/domain/datasource/crawler/service/VelogCrawler.java b/src/main/java/org/tuna/zoopzoop/backend/domain/datasource/crawler/service/VelogCrawler.java index be3e2d30..13ce0f61 100644 --- a/src/main/java/org/tuna/zoopzoop/backend/domain/datasource/crawler/service/VelogCrawler.java +++ b/src/main/java/org/tuna/zoopzoop/backend/domain/datasource/crawler/service/VelogCrawler.java @@ -1,6 +1,7 @@ package org.tuna.zoopzoop.backend.domain.datasource.crawler.service; import org.jsoup.nodes.Document; +import org.jsoup.nodes.Element; import org.springframework.core.Ordered; import org.springframework.core.annotation.Order; import org.springframework.stereotype.Component; @@ -22,25 +23,28 @@ public boolean supports(String domain) { } @Override - public CrawlerResult extract(Document doc) { + public CrawlerResult extract(Document doc) { // 제목 - String title = doc.selectFirst("meta[property=og:title]").attr("content"); + Element titleElement = doc.selectFirst("meta[property=og:title]"); + String title = titleElement != null ? titleElement.attr("content") : ""; // 작성 날짜 - String publishedAt = doc.selectFirst( - "div.information > span:not([class])" - ).text(); + Element publishedAtElement = doc.selectFirst("div.information > span:not([class])"); + String publishedAt = publishedAtElement != null ? publishedAtElement.text() : ""; - LocalDate dataCreatedDate = transLocalDate(publishedAt); + LocalDate dataCreatedDate = publishedAt.isBlank() ? DEFAULT_DATE : transLocalDate(publishedAt) ; // 내용(ai한테 줘야함) - String content = doc.selectFirst("div.atom-one").text(); + Element contentElement = doc.selectFirst("div.atom-one"); + String content = contentElement != null ? contentElement.text() : ""; // 썸네일 이미지 url - String imageUrl = doc.selectFirst("meta[property=og:image]").attr("content"); + Element imageUrlElement = doc.selectFirst("meta[property=og:image]"); + String imageUrl = imageUrlElement != null ? imageUrlElement.attr("content") : ""; // 출처 - String source = doc.selectFirst("span.username > a").text(); + Element sourceElement = doc.selectFirst("span.username > a"); + String source = sourceElement != null ? sourceElement.text() : ""; return new CrawlerResult<>( CrawlerResult.CrawlerType.SPECIFIC, @@ -60,7 +64,6 @@ public LocalDate transLocalDate(String rawDate) { return LocalDate.now().minusDays(1); } - return LocalDate.parse(rawDate, VELOG_FORMATTER); } } diff --git a/src/test/java/org/tuna/zoopzoop/backend/domain/datasource/crawler/service/VelogCrawlerTest.java b/src/test/java/org/tuna/zoopzoop/backend/domain/datasource/crawler/service/VelogCrawlerTest.java index d0e7bb2f..e6029a05 100644 --- a/src/test/java/org/tuna/zoopzoop/backend/domain/datasource/crawler/service/VelogCrawlerTest.java +++ b/src/test/java/org/tuna/zoopzoop/backend/domain/datasource/crawler/service/VelogCrawlerTest.java @@ -2,10 +2,13 @@ import org.jsoup.Jsoup; import org.jsoup.nodes.Document; +import org.junit.jupiter.api.DisplayName; import org.junit.jupiter.api.Test; import org.tuna.zoopzoop.backend.domain.datasource.crawler.dto.CrawlerResult; +import org.tuna.zoopzoop.backend.domain.datasource.crawler.dto.SpecificSiteDto; import java.io.IOException; +import java.time.LocalDate; import static org.assertj.core.api.AssertionsForClassTypes.assertThat; @@ -14,6 +17,7 @@ class VelogCrawlerTest { private final VelogCrawler velogCrawler = new VelogCrawler(); @Test + @DisplayName("Velog 크롤러 작동 테스트") void testExtract() throws IOException { Document doc = Jsoup.connect("https://velog.io/@imcute0703123/%EC%BA%A0%ED%8D%BC%EC%8A%A4%EC%8B%9C%EA%B7%B8%EB%84%90-2025-%EB%B0%B1%EC%84%9D%EC%97%B0%ED%99%94-1%EB%93%B1-%EB%B6%80%EC%8A%A4-%EB%9F%AC%EB%B8%8C%EB%9D%BC%EC%9D%B8-%EB%A7%A4%EC%B9%AD-%EC%84%9C%EB%B9%84%EC%8A%A4-%ED%9A%8C%EA%B3%A0%EB%A1%9D").get(); CrawlerResult result = velogCrawler.extract(doc); @@ -21,4 +25,19 @@ void testExtract() throws IOException { System.out.println(result); } + + @Test + @DisplayName("Velog 크롤러 예외처리 테스트") + void testExtractException() throws IOException { + String html = ""; + Document doc = Jsoup.parse(html); + + SpecificSiteDto result = velogCrawler.extract(doc).data(); + + assertThat(result.title()).isEmpty(); + assertThat(result.content()).isEmpty(); + assertThat(result.imageUrl()).isEmpty(); + assertThat(result.source()).isEmpty(); + assertThat(result.dataCreatedDate()).isEqualTo(LocalDate.EPOCH); + } } \ No newline at end of file