Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
Original file line number Diff line number Diff line change
Expand Up @@ -6,6 +6,12 @@
import java.time.LocalDate;

public interface Crawler {

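// Default value used when the published date is missing (null).
// LocalDate.EPOCH (1970-01-01) is a conventional reference point for representing "no date".
// A result carrying this value is treated as having no published date.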
LocalDate DEFAULT_DATE = LocalDate.EPOCH;

boolean supports(String domain);
CrawlerResult<?> extract(Document doc);
LocalDate transLocalDate(String rawDate);
Expand Down
Original file line number Diff line number Diff line change
@@ -1,6 +1,9 @@
package org.tuna.zoopzoop.backend.domain.datasource.crawler.service;

import org.jsoup.nodes.Document;
import org.jsoup.nodes.Element;
import org.springframework.core.Ordered;
import org.springframework.core.annotation.Order;
import org.springframework.stereotype.Component;
import org.tuna.zoopzoop.backend.domain.datasource.crawler.dto.CrawlerResult;
import org.tuna.zoopzoop.backend.domain.datasource.crawler.dto.SpecificSiteDto;
Expand All @@ -9,6 +12,7 @@
import java.time.format.DateTimeFormatter;

@Component
@Order(Ordered.HIGHEST_PRECEDENCE)
public class VelogCrawler implements Crawler{
private static final SupportedDomain DOMAIN = SupportedDomain.VELOG;
private static final DateTimeFormatter VELOG_FORMATTER = DateTimeFormatter.ofPattern("yyyy년 M월 d일");
Expand All @@ -19,25 +23,28 @@ public boolean supports(String domain) {
}

@Override
public CrawlerResult<?> extract(Document doc) {
public CrawlerResult<SpecificSiteDto> extract(Document doc) {
// 제목
String title = doc.selectFirst("meta[property=og:title]").attr("content");
Element titleElement = doc.selectFirst("meta[property=og:title]");
String title = titleElement != null ? titleElement.attr("content") : "";

// 작성 날짜
String publishedAt = doc.selectFirst(
"div.information > span:not([class])"
).text();
Element publishedAtElement = doc.selectFirst("div.information > span:not([class])");
String publishedAt = publishedAtElement != null ? publishedAtElement.text() : "";

LocalDate dataCreatedDate = transLocalDate(publishedAt);
LocalDate dataCreatedDate = publishedAt.isBlank() ? DEFAULT_DATE : transLocalDate(publishedAt) ;

// 내용(ai한테 줘야함)
String content = doc.selectFirst("div.atom-one").text();
Element contentElement = doc.selectFirst("div.atom-one");
String content = contentElement != null ? contentElement.text() : "";

// 썸네일 이미지 url
String imageUrl = doc.selectFirst("meta[property=og:image]").attr("content");
Element imageUrlElement = doc.selectFirst("meta[property=og:image]");
String imageUrl = imageUrlElement != null ? imageUrlElement.attr("content") : "";

// 출처
String source = doc.selectFirst("span.username > a").text();
Element sourceElement = doc.selectFirst("span.username > a");
String source = sourceElement != null ? sourceElement.text() : "";

return new CrawlerResult<>(
CrawlerResult.CrawlerType.SPECIFIC,
Expand All @@ -51,10 +58,10 @@ public LocalDate transLocalDate(String rawDate) {
if(rawDate.contains("일 전")){
int daysAgo = Integer.parseInt(rawDate.split("일 전")[0].trim());
return LocalDate.now().minusDays(daysAgo);
}else if(rawDate.contains("방금 전")) {
return LocalDate.now();
}else if(rawDate.contains("시간 전")||rawDate.contains("분 전")){
}else if(rawDate.contains("시간 전")||rawDate.contains("방금 전")||rawDate.contains("분 전")){
return LocalDate.now();
}else if (rawDate.contains("어제")){
return LocalDate.now().minusDays(1);
}

return LocalDate.parse(rawDate, VELOG_FORMATTER);
Expand Down
Original file line number Diff line number Diff line change
@@ -1,18 +1,43 @@
package org.tuna.zoopzoop.backend.domain.datasource.crawler.service;

import org.jsoup.Jsoup;
import org.jsoup.nodes.Document;
import org.junit.jupiter.api.DisplayName;
import org.junit.jupiter.api.Test;
import org.tuna.zoopzoop.backend.domain.datasource.crawler.dto.CrawlerResult;
import org.tuna.zoopzoop.backend.domain.datasource.crawler.dto.SpecificSiteDto;

import java.io.IOException;
import java.time.LocalDate;

import static org.assertj.core.api.AssertionsForClassTypes.assertThat;

/**
 * Tests for {@link VelogCrawler#extract(Document)}.
 *
 * <p>Contains one test that crawls a live Velog post and one that parses an empty
 * HTML document to check the fallback values used when no element is found.
 *
 * <p>History: an earlier version of the live test failed with
 * {@code DateTimeParseException} ("Text '어제' could not be parsed at index 0")
 * because the post showed a relative date instead of an absolute one.
 */
class VelogCrawlerTest {

    private final VelogCrawler velogCrawler = new VelogCrawler();

    /**
     * Fetches a Velog post over the network and checks that extraction returns a result.
     *
     * <p>NOTE(review): this test requires network access to velog.io and the referenced
     * post to remain available; consider a local HTML fixture if it proves flaky.
     *
     * @throws IOException if the page cannot be fetched
     */
    @Test
    @DisplayName("Velog 크롤러 작동 테스트")
    void testExtract() throws IOException {
        Document doc = Jsoup.connect("https://velog.io/@imcute0703123/%EC%BA%A0%ED%8D%BC%EC%8A%A4%EC%8B%9C%EA%B7%B8%EB%84%90-2025-%EB%B0%B1%EC%84%9D%EC%97%B0%ED%99%94-1%EB%93%B1-%EB%B6%80%EC%8A%A4-%EB%9F%AC%EB%B8%8C%EB%9D%BC%EC%9D%B8-%EB%A7%A4%EC%B9%AD-%EC%84%9C%EB%B9%84%EC%8A%A4-%ED%9A%8C%EA%B3%A0%EB%A1%9D").get();
        CrawlerResult<?> result = velogCrawler.extract(doc);
        assertThat(result).isNotNull();

        // Printed so the extracted fields can be inspected manually in the test output.
        System.out.println(result);
    }

    /**
     * Parses an HTML document with an empty head and body and checks that every
     * extracted text field is empty and the created date equals {@link LocalDate#EPOCH}.
     */
    @Test
    @DisplayName("Velog 크롤러 예외처리 테스트")
    void testExtractException() {
        // In-memory parse only: no network or file I/O, hence no checked exception to declare.
        String html = "<html><head></head><body></body></html>";
        Document doc = Jsoup.parse(html);

        SpecificSiteDto result = velogCrawler.extract(doc).data();

        assertThat(result.title()).isEmpty();
        assertThat(result.content()).isEmpty();
        assertThat(result.imageUrl()).isEmpty();
        assertThat(result.source()).isEmpty();
        assertThat(result.dataCreatedDate()).isEqualTo(LocalDate.EPOCH);
    }
}