From 99410479343f3975b4fbf82c4469f2bf7940e273 Mon Sep 17 00:00:00 2001 From: ohsoohyuk Date: Fri, 10 Oct 2025 16:10:52 +0900 Subject: [PATCH] =?UTF-8?q?feat/OPS-393=20:=20=ED=8B=B0=EC=8A=A4=ED=86=A0?= =?UTF-8?q?=EB=A6=AC=20=ED=81=AC=EB=A1=A4=EB=9F=AC=20=EC=83=9D=EC=84=B1?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- build.gradle | 4 + .../domain/datasource/ai/prompt/AiPrompt.java | 2 +- .../datasource/ai/service/AiService.java | 39 ++- .../datasource/crawler/service/Crawler.java | 3 +- .../service/CrawlerManagerService.java | 10 +- .../crawler/service/GenericCrawler.java | 25 +- .../crawler/service/NaverBlogCrawler.java | 104 ++++-- .../crawler/service/NaverNewsCrawler.java | 29 +- .../crawler/service/SupportedDomain.java | 3 +- .../crawler/service/TistoryCralwer.java | 78 +++++ .../service/DataProcessorService.java | 52 ++- .../exception/DataSourceExceptionHandler.java | 22 ++ .../exception/ServiceException.java | 7 + .../backend/global/webMvc/RetryConfig.java | 12 + .../service/CrawlerManagerServiceTest.java | 304 +++++++++++------- .../service/DataProcessorServiceTest.java | 121 +++++++ 16 files changed, 642 insertions(+), 173 deletions(-) create mode 100644 src/main/java/org/tuna/zoopzoop/backend/domain/datasource/crawler/service/TistoryCralwer.java create mode 100644 src/main/java/org/tuna/zoopzoop/backend/domain/datasource/exception/DataSourceExceptionHandler.java create mode 100644 src/main/java/org/tuna/zoopzoop/backend/domain/datasource/exception/ServiceException.java create mode 100644 src/main/java/org/tuna/zoopzoop/backend/global/webMvc/RetryConfig.java create mode 100644 src/test/java/org/tuna/zoopzoop/backend/domain/datasource/service/DataProcessorServiceTest.java diff --git a/build.gradle b/build.gradle index 4508c3a0..cb6516a7 100644 --- a/build.gradle +++ b/build.gradle @@ -128,6 +128,10 @@ dependencies { // Elastic Search implementation 'org.springframework.boot:spring-boot-starter-data-elasticsearch' + + // retry (ai retry용) + implementation 'org.springframework.retry:spring-retry' + implementation 'org.springframework:spring-aspects' } dependencyManagement { diff --git a/src/main/java/org/tuna/zoopzoop/backend/domain/datasource/ai/prompt/AiPrompt.java b/src/main/java/org/tuna/zoopzoop/backend/domain/datasource/ai/prompt/AiPrompt.java index 4e564c07..5e9268b5 100644 --- a/src/main/java/org/tuna/zoopzoop/backend/domain/datasource/ai/prompt/AiPrompt.java +++ b/src/main/java/org/tuna/zoopzoop/backend/domain/datasource/ai/prompt/AiPrompt.java @@ -41,7 +41,7 @@ public class AiPrompt { - 제공된 태그와 중복 가능하다. - 필요하면 새로운 태그를 만들어도 된다. 4. 출력은 반드시 아래 JSON 형식으로 해라. Markdown 문법(```)은 쓰지 마라. - - 해당 정보가 없으면 null말고 무조건 빈 문자열로 출력해줘라. + - 해당정보가 없을 시 summary는 빈 문자열, category는 null, tags는 빈 리스트로 출력해줘라. [출력 JSON 형식] { diff --git a/src/main/java/org/tuna/zoopzoop/backend/domain/datasource/ai/service/AiService.java b/src/main/java/org/tuna/zoopzoop/backend/domain/datasource/ai/service/AiService.java index 1ff5f460..fa94e7aa 100644 --- a/src/main/java/org/tuna/zoopzoop/backend/domain/datasource/ai/service/AiService.java +++ b/src/main/java/org/tuna/zoopzoop/backend/domain/datasource/ai/service/AiService.java @@ -1,14 +1,19 @@ package org.tuna.zoopzoop.backend.domain.datasource.ai.service; +import com.fasterxml.jackson.core.JsonParseException; +import com.fasterxml.jackson.core.JsonProcessingException; import lombok.RequiredArgsConstructor; import org.springframework.ai.chat.client.ChatClient; +import org.springframework.retry.annotation.Backoff; +import org.springframework.retry.annotation.Recover; +import org.springframework.retry.annotation.Retryable; import org.springframework.stereotype.Service; import org.tuna.zoopzoop.backend.domain.datasource.ai.dto.AiExtractorDto; import org.tuna.zoopzoop.backend.domain.datasource.ai.dto.AnalyzeContentDto; import org.tuna.zoopzoop.backend.domain.datasource.ai.prompt.AiPrompt; import org.tuna.zoopzoop.backend.domain.datasource.entity.Tag; -import org.tuna.zoopzoop.backend.domain.datasource.repository.TagRepository; +import java.util.ArrayList; import java.util.List; import java.util.stream.Collectors; @@ -16,8 +21,12 @@ @RequiredArgsConstructor public class AiService { private final ChatClient chatClient; - private final TagRepository tagRepository; + @Retryable( + maxAttempts = 3, + backoff = @Backoff(delay = 500), + retryFor = {JsonParseException.class, JsonProcessingException.class} + ) public AiExtractorDto extract(String rawHtml) { AiExtractorDto response = chatClient.prompt() .user(AiPrompt.EXTRACTION.formatted(rawHtml)) @@ -27,6 +36,22 @@ public AiExtractorDto extract(String rawHtml) { return response; } + @Recover + public AiExtractorDto extractRecover(Exception e, String rawHtml) { + return new AiExtractorDto( + "", + null, + "", + "", + "" + ); + } + + @Retryable( + maxAttempts = 3, + backoff = @Backoff(delay = 500), + retryFor = {JsonParseException.class, JsonProcessingException.class} + ) public AnalyzeContentDto analyzeContent(String content, List tagList) { // JSON 배열 문자열로 변환 String tags = tagList.stream() @@ -41,4 +66,14 @@ public AnalyzeContentDto analyzeContent(String content, List tagList) { return response; } + + @Recover + public AnalyzeContentDto analyzeContentRecover(Exception e, String content, List tagList) { + return new AnalyzeContentDto( + "", + null, + new ArrayList<>() + ); + } + } \ No newline at end of file diff --git a/src/main/java/org/tuna/zoopzoop/backend/domain/datasource/crawler/service/Crawler.java b/src/main/java/org/tuna/zoopzoop/backend/domain/datasource/crawler/service/Crawler.java index 6ab8f443..e8eb831a 100644 --- a/src/main/java/org/tuna/zoopzoop/backend/domain/datasource/crawler/service/Crawler.java +++ b/src/main/java/org/tuna/zoopzoop/backend/domain/datasource/crawler/service/Crawler.java @@ -3,11 +3,10 @@ import org.jsoup.nodes.Document; import org.tuna.zoopzoop.backend.domain.datasource.crawler.dto.CrawlerResult; -import java.io.IOException; import java.time.LocalDate; public interface Crawler { boolean supports(String domain); - CrawlerResult extract(Document doc) throws IOException; + CrawlerResult extract(Document doc); LocalDate transLocalDate(String rawDate); } diff --git a/src/main/java/org/tuna/zoopzoop/backend/domain/datasource/crawler/service/CrawlerManagerService.java b/src/main/java/org/tuna/zoopzoop/backend/domain/datasource/crawler/service/CrawlerManagerService.java index 5a511c09..4b9a472f 100644 --- a/src/main/java/org/tuna/zoopzoop/backend/domain/datasource/crawler/service/CrawlerManagerService.java +++ b/src/main/java/org/tuna/zoopzoop/backend/domain/datasource/crawler/service/CrawlerManagerService.java @@ -1,12 +1,10 @@ package org.tuna.zoopzoop.backend.domain.datasource.crawler.service; import lombok.RequiredArgsConstructor; -import org.jsoup.Jsoup; import org.jsoup.nodes.Document; import org.springframework.stereotype.Service; import org.tuna.zoopzoop.backend.domain.datasource.crawler.dto.CrawlerResult; -import java.io.IOException; import java.util.List; @Service @@ -14,17 +12,13 @@ public class CrawlerManagerService { private final List crawlers; - public CrawlerResult extractContent(String url) throws IOException { - Document doc = Jsoup.connect(url) - .userAgent("Mozilla/5.0") - .timeout(10000) - .get(); - + public CrawlerResult extractContent(String url, Document doc) { for (Crawler crawler : crawlers) { if (crawler.supports(url)) { return crawler.extract(doc); } } + return null; } } diff --git a/src/main/java/org/tuna/zoopzoop/backend/domain/datasource/crawler/service/GenericCrawler.java b/src/main/java/org/tuna/zoopzoop/backend/domain/datasource/crawler/service/GenericCrawler.java index c36ede86..95d9cf67 100644 --- a/src/main/java/org/tuna/zoopzoop/backend/domain/datasource/crawler/service/GenericCrawler.java +++ b/src/main/java/org/tuna/zoopzoop/backend/domain/datasource/crawler/service/GenericCrawler.java @@ -19,11 +19,30 @@ public boolean supports(String url) { @Override public CrawlerResult extract(Document doc) { - // 불필요한 태그 제거 - doc.select("script, style, noscript, meta, link").remove(); + // img 태그 + doc.select("img[src]").forEach(el -> + el.attr("src", el.absUrl("src")) + ); + + // meta 태그 (Open Graph, Twitter Card 등) + doc.select("meta[content]").forEach(meta -> { + String absUrl = meta.absUrl("content"); + if (!absUrl.isEmpty() && !absUrl.equals(meta.attr("content"))) { + meta.attr("content", absUrl); + } + }); // 본문만 가져오기 (HTML) - String cleanHtml = doc.body().html(); + String cleanHtml = doc.body().html() + .replaceAll("]*>.*?", "") + .replaceAll("]*>.*?", "") + // 주석 제거 + .replaceAll("", "") + // 연속된 공백 제거 + .replaceAll("\\s+", " ") + // 불필요한 속성 제거 + .replaceAll("(class|id|style|onclick|onload)=\"[^\"]*\"", "") + .trim(); return new CrawlerResult<>( CrawlerResult.CrawlerType.UNSPECIFIC, diff --git a/src/main/java/org/tuna/zoopzoop/backend/domain/datasource/crawler/service/NaverBlogCrawler.java b/src/main/java/org/tuna/zoopzoop/backend/domain/datasource/crawler/service/NaverBlogCrawler.java index 640686a7..aa9f920d 100644 --- a/src/main/java/org/tuna/zoopzoop/backend/domain/datasource/crawler/service/NaverBlogCrawler.java +++ b/src/main/java/org/tuna/zoopzoop/backend/domain/datasource/crawler/service/NaverBlogCrawler.java @@ -1,5 +1,6 @@ package org.tuna.zoopzoop.backend.domain.datasource.crawler.service; +import org.jsoup.Connection; import org.jsoup.Jsoup; import org.jsoup.nodes.Document; import org.jsoup.nodes.Element; @@ -9,11 +10,14 @@ import org.springframework.stereotype.Component; import org.tuna.zoopzoop.backend.domain.datasource.crawler.dto.CrawlerResult; import org.tuna.zoopzoop.backend.domain.datasource.crawler.dto.SpecificSiteDto; +import org.tuna.zoopzoop.backend.domain.datasource.exception.ServiceException; -import java.io.IOException; import java.time.LocalDate; import java.time.LocalDateTime; import java.time.format.DateTimeFormatter; +import java.util.Optional; +import java.util.regex.Matcher; +import java.util.regex.Pattern; @Component @Order(Ordered.HIGHEST_PRECEDENCE) @@ -28,28 +32,38 @@ public boolean supports(String domain) { } @Override - public CrawlerResult extract(Document doc) throws IOException { + public CrawlerResult extract(Document doc) { /* 블로그 본문은