Skip to content

Commit 2fa5e5a

Browse files
authored
Merge pull request #3 from Gachon-Univ-Creative-Code-Innovation/fix/GUC-155-crawling-bug
fix: 크롤링 수정
2 parents a69d569 + 2cf0d4a commit 2fa5e5a

File tree

8 files changed

+404
-41
lines changed

8 files changed

+404
-41
lines changed

Dockerfile

Lines changed: 39 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,39 @@
# Use a Python 3.9 (or newer) slim base image
FROM python:3.9-slim

# Install Chromium, its driver and the shared libraries it needs at runtime
RUN apt-get update && apt-get install -y --no-install-recommends \
    chromium \
    chromium-driver \
    libgtk-3-0 \
    libglib2.0-0 \
    libnss3 \
    libxss1 \
    libx11-6 \
    wget \
    unzip \
    curl && \
    apt-get clean && rm -rf /var/lib/apt/lists/*

# Environment variables pointing at the browser binary and its driver
ENV CHROME_BIN=/usr/bin/chromium
# NOTE(review): Debian's chromium-driver package usually installs
# /usr/bin/chromedriver - confirm this path exists in the built image.
ENV CHROMEDRIVER_BIN=/usr/lib/chromium/chromedriver

# Working directory
WORKDIR /app

# Make /app importable
ENV PYTHONPATH=/app

# Install dependencies BEFORE copying the sources so that this layer stays
# cached until requirements.txt itself changes.
COPY requirements.txt .
RUN pip install --no-cache-dir --upgrade pip && \
    pip install --no-cache-dir -r requirements.txt

# Copy the application sources
COPY . .

# Run the FastAPI application
CMD ["uvicorn", "Main:app", "--host", "0.0.0.0", "--port", "8000"]

Main.py

Lines changed: 224 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,224 @@
1+
from fastapi import FastAPI
2+
from fastapi.responses import JSONResponse
3+
from fastapi.middleware.cors import CORSMiddleware
4+
from apscheduler.schedulers.background import BackgroundScheduler
5+
from apscheduler.triggers.cron import CronTrigger
6+
from src.Crawling.CrawlingToText import GetRoadmapDf
7+
from src.Utils.RepoToDB import UploadRoadmap
8+
from src.Utils.RepoToStorage import UploadSvgToStorage
9+
from pyppeteer import launch
10+
import datetime
11+
import asyncio
12+
import os
13+
14+
# FastAPI application object; every route in this module is registered on it.
app = FastAPI(title="Roadmap Service API")
15+
# Roadmap data loaded once at import time; the endpoints filter it by its
# "urlName" column and scheduled_job() replaces it with a fresh copy.
df = GetRoadmapDf()
16+
# Background scheduler; the reload job is added and started in start_scheduler().
scheduler = BackgroundScheduler()
17+
18+
19+
# Scheduler job that reloads the roadmap data
def scheduled_job():
    """Log the run time, then replace the module-level ``df`` with fresh data."""
    global df
    logLine = f"[{datetime.datetime.now()}] [스케줄러] 로드맵 재로드 실행"
    print(logLine)
    df = GetRoadmapDf()
24+
25+
26+
@app.on_event("startup")
def start_scheduler():
    """Schedule the reload job for 03:00 every day (server time) and start it."""
    scheduler.add_job(scheduled_job, CronTrigger(hour=3, minute=0))
    scheduler.start()
32+
33+
34+
# CORS configuration (if needed): every origin, method and header is allowed.
# NOTE(review): wildcard origins are very permissive - confirm this is intended
# outside of development.
35+
app.add_middleware(
36+
CORSMiddleware,
37+
allow_origins=["*"],
38+
allow_methods=["*"],
39+
allow_headers=["*"],
40+
)
41+
42+
43+
# Health check
@app.get("/api/roadmap/health-check")
async def healthCcheck():
    """Return a fixed payload showing that the server is responding."""
    payload = {"status": 200, "message": "서버 상태 확인", "data": "working"}
    return payload
47+
48+
49+
# Ignore favicon requests -> the roadmap service does not serve a favicon.
## favicon.ico is a file that browsers request automatically when a site is opened.
@app.get("/api/roadmap/favicon.ico")
def favicon():
    """Answer favicon requests with a fixed "No favicon" payload.

    NOTE(review): the body reports status 400, but no HTTP status code is set
    here, so the framework default applies - confirm that is intended.
    """
    noFavicon = {"status": 400, "message": "No favicon"}
    return noFavicon
54+
55+
56+
# Return the full roadmap list
@app.get("/api/roadmap")
def ReadAllRoadmaps():
    """Return every row of the module-level roadmap data as a list of records.

    On any failure while converting the data, a 500 JSON response carrying the
    error message is returned instead.
    """
    try:
        allRoadmaps = df.to_dict("records")
    except Exception as e:
        failure = {
            "status": "500",
            "message": f"로드맵 리스트 조회 실패: {str(e)}",
            "data": None,
        }
        return JSONResponse(status_code=500, content=failure)

    return {
        "status": "200",
        "message": "전체 로드맵 리스트를 성공적으로 불러왔습니다.",
        "data": allRoadmaps,
    }
78+
79+
80+
async def _FetchRoadmapSvg(url: str):
    """Render *url* in headless Chromium and return the roadmap SVG markup.

    Polls once per second, up to 60 times, for the first element matching
    ``#resource-svg-wrap svg`` and returns its ``outerHTML``. Returns ``None``
    when the element never appears. The browser is always closed, including
    when navigation or evaluation raises, so no Chromium process is leaked.
    """
    browser = await launch(
        headless=True,
        executablePath="/usr/bin/chromium",  # path of the apt-installed binary
        args=["--no-sandbox"],
    )
    try:
        page = await browser.newPage()
        await page.setViewport({"width": 1980, "height": 1080})
        await page.goto(url, {"waitUntil": "load", "timeout": 180000})

        for _ in range(60):
            svgElement = await page.querySelector("#resource-svg-wrap svg")
            if svgElement:
                return await page.evaluate(
                    "(element) => element.outerHTML", svgElement
                )
            await asyncio.sleep(1)
        return None
    finally:
        await browser.close()


# Endpoint for viewing the roadmap data ourselves
@app.get("/api/roadmap/")
async def GetRoadmapSvg(urlName: str):
    """Generate the SVG for a roadmap on demand and return it in the response.

    Args:
        urlName: roadmap.sh path segment; must match a value in the
            ``urlName`` column of the module-level roadmap data.

    Returns:
        A dict with the SVG markup in ``data`` on success, or a JSONResponse
        with status 404 (unknown roadmap / SVG element not found) or 500
        (crawling failed).
    """
    roadmapDf = df[df["urlName"] == urlName]
    if roadmapDf.empty:
        return JSONResponse(
            status_code=404,
            content={
                "status": "404",
                "message": f"로드맵을 찾을 수 없습니다: https://roadmap.sh/{urlName}",
                "data": None,
            },
        )

    url = f"https://roadmap.sh/{urlName}"

    try:
        svgHtml = await _FetchRoadmapSvg(url)
    except Exception as e:
        return JSONResponse(
            status_code=500,
            content={
                "status": "500",
                "message": f"로드맵 SVG 생성 실패: {e}",
                "data": None,
            },
        )

    if svgHtml is None:
        return JSONResponse(
            status_code=404,
            content={
                "status": "404",
                "message": "SVG 요소를 찾을 수 없습니다.",
                "data": None,
            },
        )

    return {
        "status": "200",
        "message": "로드맵 SVG가 생성되었습니다.",
        "data": svgHtml,
    }


@app.post("/api/roadmap/save/")
async def SaveRoadmapSvg(urlName: str):
    """Crawl a roadmap's SVG from roadmap.sh and store it in Supabase.

    The SVG is written to a temporary file under /tmp, uploaded through
    ``UploadSvgToStorage``, and its metadata is recorded through
    ``UploadRoadmap``. The temporary file is removed afterwards whether or not
    the uploads succeed.

    Args:
        urlName: roadmap.sh path segment; must match a value in the
            ``urlName`` column of the module-level roadmap data.

    Returns:
        A dict confirming the save on success, or a JSONResponse with status
        404 (unknown roadmap / SVG element not found) or 500 (any failure
        while crawling, writing or uploading).
    """
    # Look up the roadmap name
    roadmapDf = df[df["urlName"] == urlName]
    if roadmapDf.empty:
        return JSONResponse(
            status_code=404,
            content={
                "status": "404",
                "message": f"로드맵을 찾을 수 없습니다: https://roadmap.sh/{urlName}",
                "data": None,
            },
        )
    roadmapName = roadmapDf.iloc[0]["roadmapName"]
    url = f"https://roadmap.sh/{urlName}"
    fileName = f"{roadmapName}Roadmap.svg"
    localPath = f"/tmp/{fileName}"

    try:
        # Open the page with pyppeteer and extract the SVG
        svgHtml = await _FetchRoadmapSvg(url)
        if svgHtml is None:
            return JSONResponse(
                status_code=404,
                content={
                    "status": "404",
                    "message": "SVG 요소를 찾을 수 없습니다.",
                    "data": None,
                },
            )

        try:
            # Save the SVG to a local file
            with open(localPath, "w", encoding="utf-8") as f:
                f.write(svgHtml)

            # Upload the SVG to Supabase storage
            UploadSvgToStorage(filename=fileName, localPath=localPath)

            # Store the metadata in the Supabase DB
            roadmapData = {
                "urlName": urlName,
                "svgPath": fileName,
                "description": roadmapName,
            }
            UploadRoadmap(roadmapData)
        finally:
            # Delete the local file. The file lives at localPath (/tmp/...),
            # not at the bare fileName, which is relative to the working
            # directory and would raise FileNotFoundError.
            if os.path.exists(localPath):
                os.remove(localPath)

        return {
            "status": "200",
            "message": f"로드맵 SVG가 Supabase에 저장되었습니다: {fileName}.",
            "data": None,
        }
    except Exception as e:
        return JSONResponse(
            status_code=500,
            content={
                "status": "500",
                "message": f"로드맵 SVG 저장 실패: {e}",
                "data": None,
            },
        )

requirements.txt

Lines changed: 10 additions & 41 deletions
Original file line numberDiff line numberDiff line change
@@ -1,42 +1,11 @@
1-
annotated-types==0.7.0
2-
anyio==4.9.0
3-
appdirs==1.4.4
4-
attrs==25.3.0
5-
certifi==2025.1.31
6-
cffi==1.17.1
7-
cryptography==44.0.2
8-
dotenv==0.9.9
9-
exceptiongroup==1.2.2
10-
fastapi==0.115.12
11-
h11==0.14.0
12-
idna==3.10
13-
importlib_metadata==8.6.1
14-
jwt==1.3.1
15-
numpy==2.0.2
16-
outcome==1.3.0.post0
17-
pandas==2.2.3
18-
pycparser==2.22
19-
pydantic==2.11.3
20-
pydantic_core==2.33.1
21-
pyee==11.1.1
1+
fastapi
2+
uvicorn[standard]
3+
pandas
4+
selenium
5+
python-dotenv
6+
supabase
7+
pyjwt
8+
webdriver-manager
229
pyppeteer==2.0.0
23-
PySocks==1.7.1
24-
python-dateutil==2.9.0.post0
25-
python-dotenv==1.1.0
26-
pytz==2025.2
27-
selenium==4.31.0
28-
six==1.17.0
29-
sniffio==1.3.1
30-
sortedcontainers==2.4.0
31-
starlette==0.46.2
32-
tqdm==4.67.1
33-
trio==0.29.0
34-
trio-websocket==0.12.2
35-
typing-inspection==0.4.0
36-
typing_extensions==4.13.2
37-
tzdata==2025.2
38-
urllib3==1.26.20
39-
websocket-client==1.8.0
40-
websockets==10.4
41-
wsproto==1.2.0
42-
zipp==3.21.0
10+
httpx
11+
apscheduler

0 commit comments

Comments
 (0)