Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
4 changes: 4 additions & 0 deletions build.gradle
Original file line number Diff line number Diff line change
Expand Up @@ -128,6 +128,10 @@ dependencies {

// Elastic Search
implementation 'org.springframework.boot:spring-boot-starter-data-elasticsearch'

// retry (ai retry용)
implementation 'org.springframework.retry:spring-retry'
implementation 'org.springframework:spring-aspects'
}

dependencyManagement {
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -41,7 +41,7 @@ public class AiPrompt {
- 제공된 태그와 중복 가능하다.
- 필요하면 새로운 태그를 만들어도 된다.
4. 출력은 반드시 아래 JSON 형식으로 해라. Markdown 문법(```)은 쓰지 마라.
- 해당 정보가 없으면 null말고 무조건 빈 문자열로 출력해줘라.
- 해당정보가 없을 시 summary는 빈 문자열, category는 null, tags는 빈 리스트로 출력해줘라.

[출력 JSON 형식]
{
Expand Down
Original file line number Diff line number Diff line change
@@ -1,23 +1,32 @@
package org.tuna.zoopzoop.backend.domain.datasource.ai.service;

import com.fasterxml.jackson.core.JsonParseException;
import com.fasterxml.jackson.core.JsonProcessingException;
import lombok.RequiredArgsConstructor;
import org.springframework.ai.chat.client.ChatClient;
import org.springframework.retry.annotation.Backoff;
import org.springframework.retry.annotation.Recover;
import org.springframework.retry.annotation.Retryable;
import org.springframework.stereotype.Service;
import org.tuna.zoopzoop.backend.domain.datasource.ai.dto.AiExtractorDto;
import org.tuna.zoopzoop.backend.domain.datasource.ai.dto.AnalyzeContentDto;
import org.tuna.zoopzoop.backend.domain.datasource.ai.prompt.AiPrompt;
import org.tuna.zoopzoop.backend.domain.datasource.entity.Tag;
import org.tuna.zoopzoop.backend.domain.datasource.repository.TagRepository;

import java.util.ArrayList;
import java.util.List;
import java.util.stream.Collectors;

@Service
@RequiredArgsConstructor
public class AiService {
private final ChatClient chatClient;
private final TagRepository tagRepository;

@Retryable(
maxAttempts = 3,
backoff = @Backoff(delay = 500),
retryFor = {JsonParseException.class, JsonProcessingException.class}
)
public AiExtractorDto extract(String rawHtml) {
AiExtractorDto response = chatClient.prompt()
.user(AiPrompt.EXTRACTION.formatted(rawHtml))
Expand All @@ -27,6 +36,22 @@ public AiExtractorDto extract(String rawHtml) {
return response;
}

/**
 * Fallback result for {@code extract(String)}.
 *
 * <p>Marked with Spring Retry's {@code @Recover}; its return type and trailing
 * {@code String} argument line up with the retryable {@code extract} method, so it is
 * presumably the handler chosen once that method's attempts fail (confirm against the
 * Spring Retry matching rules).
 *
 * @param e       the exception that ended the call; not inspected here
 * @param rawHtml the HTML originally passed to {@code extract}; not used here
 * @return a placeholder DTO whose second constructor argument is {@code null} and whose
 *         remaining four arguments are empty strings
 */
@Recover
public AiExtractorDto extractRecover(Exception e, String rawHtml) {
    final String blank = "";
    // Only the second slot is null; every other slot is an empty string.
    return new AiExtractorDto(blank, null, blank, blank, blank);
}

@Retryable(
maxAttempts = 3,
backoff = @Backoff(delay = 500),
retryFor = {JsonParseException.class, JsonProcessingException.class}
)
public AnalyzeContentDto analyzeContent(String content, List<Tag> tagList) {
// JSON 배열 문자열로 변환
String tags = tagList.stream()
Expand All @@ -41,4 +66,14 @@ public AnalyzeContentDto analyzeContent(String content, List<Tag> tagList) {

return response;
}

/**
 * Fallback result for {@code analyzeContent(String, List)}.
 *
 * <p>Marked with Spring Retry's {@code @Recover}; its return type and trailing arguments
 * line up with the retryable {@code analyzeContent} method, so it is presumably the
 * handler chosen once that method's attempts fail (confirm against the Spring Retry
 * matching rules).
 *
 * @param e       the exception that ended the call; not inspected here
 * @param content the text originally passed to {@code analyzeContent}; not used here
 * @param tagList the tags originally passed to {@code analyzeContent}; not used here
 * @return a placeholder DTO built from an empty string, {@code null}, and a new empty list
 */
@Recover
public AnalyzeContentDto analyzeContentRecover(Exception e, String content, List<Tag> tagList) {
    final String blank = "";
    // A fresh, mutable list is created on every call so callers never share state.
    return new AnalyzeContentDto(blank, null, new ArrayList<>());
}

}
Original file line number Diff line number Diff line change
Expand Up @@ -3,11 +3,10 @@
import org.jsoup.nodes.Document;
import org.tuna.zoopzoop.backend.domain.datasource.crawler.dto.CrawlerResult;

import java.io.IOException;
import java.time.LocalDate;

public interface Crawler {
boolean supports(String domain);
CrawlerResult<?> extract(Document doc) throws IOException;
CrawlerResult<?> extract(Document doc);
LocalDate transLocalDate(String rawDate);
}
Original file line number Diff line number Diff line change
@@ -1,30 +1,24 @@
package org.tuna.zoopzoop.backend.domain.datasource.crawler.service;

import lombok.RequiredArgsConstructor;
import org.jsoup.Jsoup;
import org.jsoup.nodes.Document;
import org.springframework.stereotype.Service;
import org.tuna.zoopzoop.backend.domain.datasource.crawler.dto.CrawlerResult;

import java.io.IOException;
import java.util.List;

@Service
@RequiredArgsConstructor
public class CrawlerManagerService {
private final List<Crawler> crawlers;

public CrawlerResult<?> extractContent(String url) throws IOException {
Document doc = Jsoup.connect(url)
.userAgent("Mozilla/5.0")
.timeout(10000)
.get();

public CrawlerResult<?> extractContent(String url, Document doc) {
for (Crawler crawler : crawlers) {
if (crawler.supports(url)) {
return crawler.extract(doc);
}
}

return null;
}
}
Original file line number Diff line number Diff line change
Expand Up @@ -19,11 +19,30 @@ public boolean supports(String url) {

@Override
public CrawlerResult<?> extract(Document doc) {
// 불필요한 태그 제거
doc.select("script, style, noscript, meta, link").remove();
// img 태그
doc.select("img[src]").forEach(el ->
el.attr("src", el.absUrl("src"))
);

// meta 태그 (Open Graph, Twitter Card 등)
doc.select("meta[content]").forEach(meta -> {
String absUrl = meta.absUrl("content");
if (!absUrl.isEmpty() && !absUrl.equals(meta.attr("content"))) {
meta.attr("content", absUrl);
}
});

// 본문만 가져오기 (HTML)
String cleanHtml = doc.body().html();
String cleanHtml = doc.body().html()
.replaceAll("<script[^>]*>.*?</script>", "")
.replaceAll("<style[^>]*>.*?</style>", "")
// 주석 제거
.replaceAll("<!--.*?-->", "")
// 연속된 공백 제거
.replaceAll("\\s+", " ")
// 불필요한 속성 제거
.replaceAll("(class|id|style|onclick|onload)=\"[^\"]*\"", "")
.trim();

return new CrawlerResult<>(
CrawlerResult.CrawlerType.UNSPECIFIC,
Expand Down
Original file line number Diff line number Diff line change
@@ -1,5 +1,6 @@
package org.tuna.zoopzoop.backend.domain.datasource.crawler.service;

import org.jsoup.Connection;
import org.jsoup.Jsoup;
import org.jsoup.nodes.Document;
import org.jsoup.nodes.Element;
Expand All @@ -9,11 +10,14 @@
import org.springframework.stereotype.Component;
import org.tuna.zoopzoop.backend.domain.datasource.crawler.dto.CrawlerResult;
import org.tuna.zoopzoop.backend.domain.datasource.crawler.dto.SpecificSiteDto;
import org.tuna.zoopzoop.backend.domain.datasource.exception.ServiceException;

import java.io.IOException;
import java.time.LocalDate;
import java.time.LocalDateTime;
import java.time.format.DateTimeFormatter;
import java.util.Optional;
import java.util.regex.Matcher;
import java.util.regex.Pattern;

@Component
@Order(Ordered.HIGHEST_PRECEDENCE)
Expand All @@ -28,28 +32,38 @@ public boolean supports(String domain) {
}

@Override
public CrawlerResult<?> extract(Document doc) throws IOException {
public CrawlerResult<?> extract(Document doc) {
/*
블로그 본문은 <iframe id="mainFrame"> 안에 로드되므로
먼저 메인 페이지를 가져온 뒤 iframe의 src를 추출하여
해당 URL로 다시 connect 해야 실제 본문 내용을 크롤링할 수 있다.
*/
Element iframe = doc.selectFirst("iframe#mainFrame");
String iframeUrl = iframe.absUrl("src");
String iframeUrl = Optional.ofNullable(doc.selectFirst("iframe#mainFrame"))
.map(el -> el.absUrl("src"))
.orElse("");

Document iframeDoc = Jsoup.connect(iframeUrl)
.userAgent("Mozilla/5.0") // 크롤링 차단 방지를 위해 user-agent 설정 권장
.timeout(10 * 1000) // 타임아웃 (10초)
.get();
Document iframeDoc;

try {
Connection.Response response = Jsoup.connect(iframeUrl)
.userAgent("Mozilla/5.0")
.timeout(5000)
.followRedirects(true)
.execute();

iframeDoc = response.parse();
} catch (Exception e) {
throw new ServiceException("URL 접속에 실패했습니다.");
}

// 제목
Element titleSpans = iframeDoc.selectFirst("div.se-module.se-module-text.se-title-text");
String title = titleSpans.text();
String title = iframeDoc.select("meta[property=og:title]").attr("content");

// 작성일자
String publishedAt = iframeDoc.selectFirst("span.se_publishDate.pcol2").text();
LocalDateTime rawDate = LocalDateTime.parse(publishedAt, DateTimeFormatter.ofPattern("yyyy. M. d. HH:mm"));
LocalDate dataCreatedDate = rawDate.toLocalDate();
String publishedAt = Optional.ofNullable(iframeDoc.selectFirst("span.se_publishDate.pcol2"))
.map(Element::text)
.orElse("");
LocalDate dataCreatedDate = transLocalDate(publishedAt);

// 내용
Elements spans = iframeDoc.select(".se-main-container span");
Expand All @@ -60,17 +74,11 @@ public CrawlerResult<?> extract(Document doc) throws IOException {
String content = sb.toString();

// 썸네일 이미지 URL
Element img = iframeDoc.select("div.se-main-container img").first();

String imageUrl = "";
if (img != null) {
if (!img.attr("data-lazy-src").isEmpty()) {
imageUrl = img.attr("data-lazy-src");
}
}
String imageUrl = iframeDoc.select("meta[property=og:image]").attr("content");

// 출처
String source = "네이버 블로그";
String source = iframeDoc.select("meta[property=og:site_name]").attr("content");


return new CrawlerResult<>(
CrawlerResult.CrawlerType.SPECIFIC,
Expand All @@ -79,8 +87,54 @@ public CrawlerResult<?> extract(Document doc) throws IOException {
}

@Override
public LocalDate transLocalDate(String rawDate) {
LocalDateTime dateTime = LocalDateTime.parse(rawDate, NAVERBLOG_FORMATTER);
return dateTime.toLocalDate(); // 시간 버리고 날짜만
/**
 * Converts a publish-time label into a calendar date.
 *
 * <p>Recognised inputs, checked in this order after trimming:
 * <ul>
 *   <li>{@code "방금 전"} ("just now"): today's date;</li>
 *   <li>{@code "N분 전"} ("N minutes ago"): the date N minutes before now;</li>
 *   <li>{@code "N시간 전"} ("N hours ago"): the date N hours before now;</li>
 *   <li>an absolute stamp such as {@code "yyyy. M. d. HH:mm"}: only the
 *       year/month/day part is read, the time part is ignored.</li>
 * </ul>
 * "Now" comes from {@link LocalDate#now()} / {@link LocalDateTime#now()}, i.e. the
 * system default clock and time zone.
 *
 * <p>Every parse failure yields {@code null}, including a relative label whose number is
 * too large for an {@code int} and an absolute stamp that is not a real date (month 13).
 *
 * @param publishedAt raw label text; may be {@code null} or empty
 * @return the resolved date, or {@code null} when the input is {@code null}, blank, or
 *         cannot be parsed
 */
public LocalDate transLocalDate(String publishedAt) {
    if (publishedAt == null) {
        return null;
    }

    String label = publishedAt.trim();
    if (label.isEmpty()) {
        return null;
    }

    // "방금 전" (just now)
    if (label.equals("방금 전")) {
        return LocalDate.now();
    }

    try {
        // "N분 전" (N minutes ago)
        Matcher minuteMatcher = Pattern.compile("(\\d+)분\\s*전").matcher(label);
        if (minuteMatcher.find()) {
            int minutes = Integer.parseInt(minuteMatcher.group(1));
            return LocalDateTime.now().minusMinutes(minutes).toLocalDate();
        }

        // "N시간 전" (N hours ago)
        Matcher hourMatcher = Pattern.compile("(\\d+)시간\\s*전").matcher(label);
        if (hourMatcher.find()) {
            int hours = Integer.parseInt(hourMatcher.group(1));
            return LocalDateTime.now().minusHours(hours).toLocalDate();
        }

        // "yyyy. M. d. HH:mm" or "yyyy. M. d. H:mm": keep the date, drop the time.
        Matcher dateMatcher = Pattern
                .compile("(\\d{4})\\s*\\.\\s*(\\d{1,2})\\s*\\.\\s*(\\d{1,2})")
                .matcher(label);
        if (dateMatcher.find()) {
            int year = Integer.parseInt(dateMatcher.group(1));
            int month = Integer.parseInt(dateMatcher.group(2));
            int day = Integer.parseInt(dateMatcher.group(3));
            return LocalDate.of(year, month, day);
        }
    } catch (NumberFormatException | java.time.DateTimeException e) {
        // NumberFormatException: digit run does not fit in an int.
        // DateTimeException: matched numbers do not form a valid calendar date.
        System.err.println("날짜 파싱 실패: " + label);
    }

    // Unrecognised or unparseable input.
    return null;
}
}
Original file line number Diff line number Diff line change
@@ -1,6 +1,7 @@
package org.tuna.zoopzoop.backend.domain.datasource.crawler.service;

import org.jsoup.nodes.Document;
import org.jsoup.nodes.Element;
import org.springframework.core.Ordered;
import org.springframework.core.annotation.Order;
import org.springframework.stereotype.Component;
Expand All @@ -9,6 +10,7 @@

import java.time.LocalDate;
import java.time.format.DateTimeFormatter;
import java.util.Optional;

@Component
@Order(Ordered.HIGHEST_PRECEDENCE)
Expand All @@ -25,22 +27,33 @@ public boolean supports(String domain) {
@Override
public CrawlerResult<?> extract(Document doc) {
// 제목
String title = doc.selectFirst("h2#title_area").text();
String title = doc.select("meta[property=og:title]").attr("content");


// 작성 날짜
String publishedAt = doc.selectFirst(
"span.media_end_head_info_datestamp_time._ARTICLE_DATE_TIME"
).attr("data-date-time");
LocalDate dataCreatedDate = transLocalDate(publishedAt);
String publishedAt = Optional.ofNullable(
doc.selectFirst("span.media_end_head_info_datestamp_time._ARTICLE_DATE_TIME")
)
.map(el -> el.attr("data-date-time"))
.orElse(""); // 값 없으면 빈 문자열

LocalDate dataCreatedDate = publishedAt.isEmpty()
? null
: transLocalDate(publishedAt);


// 내용(ai한테 줘야함)
String content = doc.select("article").text();
String content = Optional.ofNullable(doc.selectFirst("article"))
.map(Element::text)
.orElse("");


// 썸네일 이미지 url
String imageUrl = doc.selectFirst("img#img1._LAZY_LOADING._LAZY_LOADING_INIT_HIDE").attr("data-src");
String imageUrl = doc.select("meta[property=og:image]").attr("content");


// 출처
String source = doc.selectFirst("span.media_end_head_top_logo_text").text();
String source = doc.select("meta[name=twitter:creator]").attr("content");

return new CrawlerResult<>(
CrawlerResult.CrawlerType.SPECIFIC,
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -3,7 +3,8 @@
public enum SupportedDomain {
NAVERNEWS("n.news.naver.com"),
NAVERBLOG("blog.naver.com"),
VELOG("velog.io");
VELOG("velog.io"),
TISTORY("tistory.com");

private final String domain;

Expand Down
Loading