forked from levywang/avhub
-
Notifications
You must be signed in to change notification settings - Fork 0
Expand file tree
/
Copy pathmain.py
More file actions
251 lines (214 loc) · 9.51 KB
/
main.py
File metadata and controls
251 lines (214 loc) · 9.51 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
# -*- encoding: utf-8 -*-
import asyncio
import json
import os
import pathlib
import random
import re
import threading
import time
from concurrent.futures import ThreadPoolExecutor
from contextlib import asynccontextmanager
from typing import Union

import hydra
import requests
import schedule
import uvicorn
from bs4 import BeautifulSoup
from fastapi import FastAPI, HTTPException
from fastapi.middleware.cors import CORSMiddleware
from fastapi.responses import JSONResponse
from omegaconf import DictConfig

from utils.logger import setup_logger
from utils.spider import *
@hydra.main(config_path='data/', config_name='config', version_base=None)
def main(cfg: DictConfig):
    """Build and run the FastAPI application.

    Configures logging, registers the HTTP routes, starts a daemon thread
    that runs the daily HACG spider via `schedule`, and finally serves the
    app with uvicorn (blocking call).

    Args:
        cfg: Hydra configuration (CORS settings, file paths, spider options).
    """
    # Stored globally so nested handlers (and anything else in this module)
    # share one configured logger.
    global logger
    logger = setup_logger(cfg)

    @asynccontextmanager
    async def lifespan(app: FastAPI):
        # Work before startup.
        logger.info("Application startup")
        yield
        # Work at shutdown.
        logger.info("Application shutdown")

    app = FastAPI(lifespan=lifespan)
    app.add_middleware(
        CORSMiddleware,
        allow_origins=cfg.app.cors_origins,
        allow_credentials=cfg.app.cors_credentials,
        allow_methods=cfg.app.cors_methods,
        allow_headers=cfg.app.cors_headers,
    )

    # Thread pool that keeps blocking work (HTTP fetches, file reads,
    # HTML parsing) off the event loop.
    executor = ThreadPoolExecutor(max_workers=10)

    def _fetch_url(url: str) -> str:
        """Fetch *url* and return the body text; empty string on any failure."""
        try:
            response = requests.get(url, timeout=10)  # short timeout (10s)
            response.raise_for_status()
            return response.text
        except Exception as e:
            logger.error(f"Failed to fetch URL {url}: {str(e)}")
            return ""

    def _parse_html(html_content: str, image_dir_url: str) -> list:
        """Extract candidate image links from a directory-listing page.

        Prefers ``.webp`` links; falls back to every non-parent link when no
        ``.webp`` entry exists. Returns [] on parse failure.
        """
        try:
            soup = BeautifulSoup(html_content, 'html.parser')
            a_tags = soup.find_all('a', href=True)
            links = [image_dir_url + tag['href'] for tag in a_tags if tag['href'] != '../']
            return [link for link in links if link.endswith('.webp')] or links
        except Exception as e:
            logger.error(f"Failed to parse HTML: {str(e)}")
            return []

    async def get_image_url(video_url: str) -> str:
        """Pick a random preview-image URL for *video_url*; None on failure."""
        try:
            # Preview images are served from an image/ directory that sits
            # next to the playlist file.
            image_dir_url = video_url.replace('index.m3u8', 'image/')
            # get_running_loop() is the non-deprecated call inside a coroutine.
            loop = asyncio.get_running_loop()
            # Fetch with a 15-second ceiling.
            html_content = await asyncio.wait_for(
                loop.run_in_executor(executor, _fetch_url, image_dir_url),
                timeout=15
            )
            if not html_content:
                return None
            # Parsing gets its own 5-second ceiling.
            links = await asyncio.wait_for(
                loop.run_in_executor(executor, _parse_html, html_content, image_dir_url),
                timeout=5
            )
            if not links:
                logger.warning("No image links found.")
                return None
            return random.choice(links)
        except asyncio.TimeoutError:
            logger.error(f"Timeout while processing image URL for {video_url}")
            return None
        except Exception as e:
            logger.error(f"Failed to obtain the image URL: {str(e)}")
            return None

    async def read_random_line(file_path: str) -> tuple[str, str]:
        """Pick a random video URL from *file_path* and resolve its image URL.

        Returns:
            (video_url, image_url); image_url may be None when the image
            lookup times out or fails.

        Raises:
            HTTPException: 404 if the file is missing, 400 if it is empty,
                500 on unexpected errors.
        """
        if not os.path.isfile(file_path):
            logger.error("File not found")
            raise HTTPException(status_code=404, detail="File not found")
        # Pre-initialize so the TimeoutError handler below can never hit a
        # NameError when the timeout fires during the file read.
        lines = []
        try:
            loop = asyncio.get_running_loop()

            def _read_lines() -> list:
                # Context manager closes the handle (the original version
                # leaked it via open(...).readlines()).
                with open(file_path, 'r') as f:
                    return f.readlines()

            # File read gets a 2-second ceiling.
            lines = await asyncio.wait_for(
                loop.run_in_executor(executor, _read_lines),
                timeout=2
            )
            if not lines:
                logger.error("File is empty")
                raise HTTPException(status_code=400, detail="File is empty")
            random_line = random.choice(lines).strip()
            # Overall 20-second budget for resolving the image URL.
            img_url = await asyncio.wait_for(get_image_url(random_line), timeout=20)
            return random_line, img_url
        except asyncio.TimeoutError:
            logger.error("Timeout while reading random line or fetching image URL")
            # Best effort: still return a video URL (without image) when only
            # the image lookup timed out.
            return random.choice(lines).strip() if lines else None, None
        except HTTPException:
            # Preserve the intended 400/404 instead of masking it as a 500.
            raise
        except Exception as e:
            logger.error(f"Error in read_random_line: {str(e)}")
            raise HTTPException(status_code=500, detail=str(e))

    @app.get("/v1/hacg")
    async def read_hacg():
        """Return the cached HACG JSON dataset."""
        try:
            with open(cfg.files.hacg_json_path, 'r', encoding='utf-8') as file:
                data = json.load(file)
            logger.info("HACG data fetched successfully")
            return JSONResponse({"data": data}, headers={'content-type': 'application/json;charset=utf-8'})
        except Exception as e:
            logger.error(f"Failed to fetch HACG data: {str(e)}")
            raise HTTPException(status_code=500, detail="Internal Server Error")

    @app.get("/v1/avcode/{code_str}")
    async def crawl_av(code_str: str):
        """Look up magnet links for an AV code, with an optional file cache."""
        # Normalize the code: keep only letters/digits, lower-case.
        code_str = re.sub(r'[^a-zA-Z0-9]', '', code_str).lower()
        # When caching is enabled, make sure the cache dir exists and try a
        # cache read first.
        if cfg.av_spider.use_cache:
            pathlib.Path(cfg.av_spider.cache_dir).mkdir(parents=True, exist_ok=True)
            cache_path = os.path.join(cfg.av_spider.cache_dir, f"{code_str}.json")
            try:
                if os.path.exists(cache_path):
                    with open(cache_path, 'r', encoding='utf-8') as f:
                        cached_data = json.load(f)
                    logger.info(f"Cache hit for AV code: {code_str}")
                    return {"status": "succeed", "data": cached_data}
            except Exception as e:
                # Cache problems are non-fatal; fall through to the network.
                logger.error(f"Error reading cache file: {str(e)}")
        # No cache hit (or cache read failed): fetch from the network.
        crawler = AVSpider(av_code=code_str,
                           source_url=cfg.av_spider.source_url,
                           proxy_url=cfg.av_spider.proxy_url,
                           use_proxy=cfg.av_spider.use_proxy,
                           cfg=cfg)
        try:
            magnet_links = await crawler.process_av_code()
            if not magnet_links:
                logger.error(f"No magnet links found for AV code: {code_str}")
                raise HTTPException(status_code=404, detail="No magnet links found")
            magnet_data = [str(item) for item in magnet_links]
            # Persist to the cache (data only), when caching is enabled.
            if cfg.av_spider.use_cache:
                try:
                    with open(cache_path, 'w', encoding='utf-8') as f:
                        json.dump(magnet_data, f, ensure_ascii=False, indent=4)
                    logger.info(f"Cache written for AV code: {code_str}")
                except Exception as e:
                    logger.error(f"Error writing cache file: {str(e)}")
            logger.info(f"Magnet links found for AV code: {code_str}")
            return {"status": "succeed", "data": magnet_data}
        except HTTPException:
            # Preserve the intended 404 instead of masking it as a 500.
            raise
        except Exception as e:
            logger.error(f"Error processing AV code {code_str}: {str(e)}")
            raise HTTPException(status_code=500, detail=str(e))
        finally:
            del crawler  # release crawler resources promptly

    @app.get("/v1/get_video")
    async def get_random_video_url():
        """Returns a random video URL and its corresponding image URL."""
        try:
            file_path = cfg.files.video_urls_txt_path
            # Whole-operation ceiling of 25 seconds.
            video_url, img_url = await asyncio.wait_for(
                read_random_line(file_path),
                timeout=25
            )
            if not video_url:
                raise HTTPException(status_code=500, detail="Failed to get video URL")
            logger.info("Random video URL and image URL fetched successfully")
            return {
                "url": video_url,
                "img_url": img_url or ""
            }
        except asyncio.TimeoutError:
            logger.error("Global timeout in get_random_video_url")
            raise HTTPException(status_code=504, detail="Request timeout")
        except HTTPException:
            # Let deliberate HTTP errors (including 404s from the helper)
            # propagate with their original status.
            raise
        except Exception as e:
            logger.error(f"Failed to fetch random video URL: {str(e)}")
            raise HTTPException(status_code=500, detail=str(e))

    def run_hacg_spider():
        """Refresh the HACG JSON file (runs on the daily schedule)."""
        hacg_spider = HacgSpider(url=cfg.hacg_spider.source_url, filepath=cfg.files.hacg_json_path, cfg=cfg)
        hacg_spider.update_json_file()
        logger.info("HacgSpider task completed.")

    # Schedule the HacgSpider task to run daily at 1 AM.
    schedule.every().day.at("01:00").do(run_hacg_spider)

    def run_scheduler():
        """Poll the `schedule` registry once a minute, forever."""
        while True:
            schedule.run_pending()
            time.sleep(60)  # Check every minute

    # Daemon thread so the scheduler never blocks interpreter shutdown.
    scheduler_thread = threading.Thread(target=run_scheduler, daemon=True)
    scheduler_thread.start()

    uvicorn.run(app, host="0.0.0.0", port=8000)


if __name__ == "__main__":
    main()