diff --git a/v2.0/bookmark.py b/v2.0/bookmark.py index 5e9a215..991e7ab 100644 --- a/v2.0/bookmark.py +++ b/v2.0/bookmark.py @@ -7,8 +7,9 @@ # import math import json import time +import random -from config import BOOKMARK_HIDE_ENABLE,SKIP_ISEXISTS_ILLUST,BOOKMARK_PATH +from config import BOOKMARK_HIDE_ENABLE,SKIP_ISEXISTS_ILLUST,SKIP_DB_EXISTS_ILLUST,BOOKMARK_PATH from downer import Downloader from log_record import logger from message import TEMP_MSG @@ -16,6 +17,7 @@ from ptimer import Timer # TODO from tag import TAG_FLAG_BOOKMARK +from checker import run_startup_check class Bookmark(object): @@ -136,6 +138,16 @@ def thread_by_illust(self, *args): pid = args[0] info = None + # 基于数据库的提前跳过,避免网络与文件系统检测 + if hasattr(self.db, "pool") and SKIP_DB_EXISTS_ILLUST: + try: + exists, _ = self.db.check_illust(pid, table="bookmark") + if exists: + logger.info(f"SKIP_DB_EXISTS_ILLUST - {pid}") + return info + except Exception: + pass + # 跳过已下载插画的请求 if SKIP_ISEXISTS_ILLUST and self.file_manager.search_isExistsPid( BOOKMARK_PATH,"b",*(pid,)): @@ -202,6 +214,12 @@ def thread_by_illust(self, *args): @logger.catch def run(self): + # 启动一致性检查(最近 20 条) + try: + run_startup_check(self.Downloader) + except Exception as e: + logger.warning(f"启动一致性检查失败: {e}") + # TDOD TAG COUNT开始工作 TAG_FLAG_BOOKMARK = False logger.info(TEMP_MSG["BEGIN_INFO"].format(self.class_name)) @@ -249,8 +267,7 @@ def run(self): pool.put(self.thread_by_illust,(pid,),callback) offset += self.bookmark_page_offset - # 固定休眠 - time.sleep(1) + # 取消固定/随机等待,统一交由 Downloader 自适应限速(仅对线上接口生效) except Exception as e: logger.warning("Exception {}".format(e)) finally: diff --git a/v2.0/checker.py b/v2.0/checker.py new file mode 100644 index 0000000..eb16c06 --- /dev/null +++ b/v2.0/checker.py @@ -0,0 +1,149 @@ +# coding=utf8 +""" +启动一致性检查: +- 对 pixiv / bookmark 表各取最近 20 条记录,核对本地文件夹图片数量与 pageCount 是否一致; +- 使用 IsValidImage 校验图片完整性; +- 若不一致或存在损坏图片,则触发自动重下;重下后再次校验,仍失败则告警。 +""" + +import os +from typing import List, Tuple, Optional + +from downer import Downloader +from folder import file_manager +from image_check import IsValidImage +from log_record import logger + +TABLES = ["pixiv", "bookmark"] +LATEST_LIMIT = 20 + + +def list_local_images(dir_path: str, pid: int) -> List[str]: + if not dir_path or not os.path.isdir(dir_path): + return [] + files = [] + try: + for name in os.listdir(dir_path): + if not name.lower().startswith(str(pid)): + continue + # 常见图片扩展 + lower = name.lower() + if lower.endswith((".jpg", ".jpeg", ".png", ".gif", ".webp", ".bmp")): + files.append(os.path.join(dir_path, name)) + except Exception: + return [] + return files + + +def verify_folder(pid: int, path: str, page_count: int, illust_type: int) -> Tuple[bool, List[str]]: + """返回 (是否通过, 需重下的文件列表)。 + - ugoira(type=2):zip 或 gif 任一存在且 gif 可读视为通过 + - 单图/多图:图片数量需达到目标且每张校验通过 + """ + if not path or not os.path.isdir(path): + return False, [] + + # 动图:gif 或 zip 任一存在即可(gif 优先验证完整性) + if illust_type == 2: + gif = os.path.join(path, f"{pid}.gif") + zipf = os.path.join(path, f"{pid}.zip") + if os.path.exists(gif): + ok = IsValidImage(gif) + return (ok, [] if ok else [gif]) + if os.path.exists(zipf): + return True, [] + return False, [] + + images = list_local_images(path, pid) + if page_count <= 1: + if not images: + return False, [] + # 任取第一张作为单图 + img = images[0] + ok = IsValidImage(img) + return (ok, [] if ok else [img]) + + # 多图:至少 page_count 张,且全部有效 + if len(images) < page_count: + return False, images + invalid = [p for p in images if not IsValidImage(p)] + return (len(invalid) == 0, invalid) + + +def redownload_one(d: Downloader, table: str, rec: dict) -> bool: + """按表与记录重新下载该 PID,下载完成后再次校验。返回最终是否通过。""" + pid = int(rec.get("pid")) + illust_type = int(rec.get("illustType", 0)) + page_count = int(rec.get("pageCount", 1)) + + # 推导保存目录(不写 DB) + save_path = rec.get("path") + if not save_path or save_path == "None": + if table == "bookmark": + user_dir = file_manager.bk_path + else: + uid = int(rec.get("uid", 0)) + user_name = rec.get("userName", "") + user_dir = file_manager.select_user_path(uid, user_name) + save_path = file_manager.mkdir_illusts(user_dir, pid) + + # 走 Downloader 获取信息与下载 + info = d.get_illust_info(pid, extra=("bookmark" if table == "bookmark" else "pixiv")) + if not info or isinstance(info, str): + logger.warning(f"[checker] 获取作品信息失败或不可访问: {pid} -> {info}") + return False + + # 再次校验 + ok, invalid = verify_folder(pid, save_path, page_count, illust_type) + if not ok: + logger.warning(f"[checker] 重新下载后仍不一致/损坏: {pid} | invalid={len(invalid)}") + return ok + + +def run_startup_check(d: Optional[Downloader] = None): + d = d or Downloader() + for table in TABLES: + logger.info(f"[checker] 启动一致性检查 | 表: {table} | 最近 N: {LATEST_LIMIT}") + try: + latest = d.db.select_latest_records(table=table, limit=LATEST_LIMIT) + except Exception: + latest = [] + if not latest: + logger.info(f"[checker] 表 {table} 无记录可检") + continue + + total = 0 + ok_cnt = 0 + fix_cnt = 0 + fail_cnt = 0 + + for rec in latest: + pid = int(rec.get("pid")) + path = rec.get("path") + page_count = int(rec.get("pageCount", 1)) + illust_type = int(rec.get("illustType", 0)) + total += 1 + + ok, invalid = verify_folder(pid, path, page_count, illust_type) + if ok: + ok_cnt += 1 + continue + + logger.warning(f"[checker] 不一致/疑似损坏,准备重下: {table} pid={pid} path={path}") + ok2 = redownload_one(d, table, rec) + if not ok2: + fail_cnt += 1 + logger.warning(f"[checker] 重下失败:{table} pid={pid}") + else: + fix_cnt += 1 + logger.success(f"[checker] 重下完成 | 表:{table} PID:{pid} 再次校验通过") + + logger.info( + f"[checker] 完成 | 表:{table} 总数:{total} 通过:{ok_cnt} 修复:{fix_cnt} 失败:{fail_cnt}" + ) + + +if __name__ == "__main__": + run_startup_check() + + diff --git a/v2.0/config.py b/v2.0/config.py index 8ec0a00..4624ce6 100644 --- a/v2.0/config.py +++ b/v2.0/config.py @@ -55,6 +55,11 @@ # 2.同时也无法更新对应pid在数据库中的记录 SKIP_ISEXISTS_ILLUST = True +# 基于数据库提前跳过: +# 1. 若数据库中已存在该 pid 的记录,则不再请求作品信息(不触碰网络/文件系统) +# 2. 适用于你信任数据库记录已经代表“已下载/已处理”的场景 +SKIP_DB_EXISTS_ILLUST = True + # 关注画师爬虫控制开关 PIXIV_CRAWLER_ENABLED = False @@ -117,4 +122,14 @@ # ===============DEBUG=============== # TODO DEBUG = False -# ===============DEBUG=============== \ No newline at end of file +# ===============DEBUG=============== + + +# =============== 自适应限速(仅信息接口) =============== +# 启用后,仅针对 /ajax/ 或 /touch/ajax/ 的信息请求进行自适应轻量等待; +# 下载文件不受影响。未命中限流时延迟会快速衰减为 0。 +ADAPTIVE_LIMIT_ENABLED = True +ADAPTIVE_DELAY_MAX = 1.0 # 最大延迟(秒) +ADAPTIVE_DELAY_INCREASE = 0.2 # 命中限流后的递增步长(秒) +ADAPTIVE_DELAY_DECAY_RATIO = 0.5 # 成功请求后的衰减系数(0-1) +# ======================================================= \ No newline at end of file diff --git a/v2.0/crawler.py b/v2.0/crawler.py index 4da106e..2844f43 100644 --- a/v2.0/crawler.py +++ b/v2.0/crawler.py @@ -6,15 +6,17 @@ """ import json import time +import random import re -from config import SKIP_ISEXISTS_ILLUST,ROOT_PATH +from config import SKIP_ISEXISTS_ILLUST, SKIP_DB_EXISTS_ILLUST, ROOT_PATH from downer import Downloader from log_record import logger from message import TEMP_MSG from thread_pool import ThreadPool,callback from tag import TAG_FLAG_USER from ptimer import Timer +from checker import run_startup_check class Crawler(object): @@ -177,6 +179,16 @@ def thread_by_illust(self, *args): uid = args[1] info = None + # 基于数据库的提前跳过,避免网络与文件系统检测 + if hasattr(self.db, "pool") and SKIP_DB_EXISTS_ILLUST: + try: + exists, _ = self.db.check_illust(pid) + if exists: + logger.info(f"SKIP_DB_EXISTS_ILLUST - {uid} - {pid}") + return info + except Exception: + pass + # 跳过已下载插画的请求 if SKIP_ISEXISTS_ILLUST and self.file_manager.search_isExistsPid( ROOT_PATH,"c",*(uid,pid,)): @@ -243,6 +255,12 @@ def thread_by_illust(self, *args): @logger.catch def run(self): + # 启动一致性检查(最近 20 条) + try: + run_startup_check(self.Downloader) + except Exception as e: + logger.warning(f"启动一致性检查失败: {e}") + # 开始工作 TAG_FLAG_USER = False logger.info(TEMP_MSG["BEGIN_INFO"].format(self.class_name)) @@ -288,8 +306,7 @@ def run(self): for pid in all_illust: pool.put(self.thread_by_illust,(pid,u["uid"],),callback) - # 固定休眠 - time.sleep(5) + # 取消固定/随机等待,统一交由 Downloader 自适应限速(仅对线上接口生效) # 无作品更新 else: logger.info(TEMP_MSG["NOW_USER_INFO"].format(self.class_name,position,u["userName"],u["uid"],len(all_illust))) diff --git a/v2.0/db.py b/v2.0/db.py index c36fa2e..9cf24ee 100644 --- a/v2.0/db.py +++ b/v2.0/db.py @@ -289,6 +289,23 @@ def select_illust(self, pid, table="pixiv"): cur.close() conn.close() + def select_latest_records(self, table="pixiv", limit=20): + """ + 获取指定表按自增 id 倒序的最近 N 条记录,包含 pid/path/pageCount/illustType/uid/userName。 + """ + conn,cur = self.get_conn() + sql = f"""SELECT pid,path,pageCount,illustType,uid,userName FROM {table} ORDER BY id DESC LIMIT %s""" + try: + cur.execute(sql, (int(limit),)) + r = cur.fetchall() + return r or [] + except Exception as e: + logger.warning(f" - {e}") + return [] + finally: + cur.close() + conn.close() + def select_user(self, start_id, limit=100, table="pxusers"): """ 获取关注列表用户的数据库信息 diff --git a/v2.0/downer.py b/v2.0/downer.py index baf63dc..53fbd88 100644 --- a/v2.0/downer.py +++ b/v2.0/downer.py @@ -12,16 +12,26 @@ import imageio import zipfile import requests +from urllib3.exceptions import InsecureRequestWarning +import urllib3 # 强制取消警告 -from requests.packages.urllib3.exceptions import InsecureRequestWarning -requests.packages.urllib3.disable_warnings(InsecureRequestWarning) - -from config import USERS_LIMIT,BOOKMARK_LIMIT +urllib3.disable_warnings(InsecureRequestWarning) + +from config import ( + USERS_LIMIT, + BOOKMARK_LIMIT, + ADAPTIVE_LIMIT_ENABLED, + ADAPTIVE_DELAY_MAX, + ADAPTIVE_DELAY_INCREASE, + ADAPTIVE_DELAY_DECAY_RATIO, + DOWNLOAD_POST_DELAY_SECONDS, +) from db import db_client from folder import file_manager from log_record import logger from login import client from message import TEMP_MSG +from image_check import IsValidImage # class Down(object): @@ -35,7 +45,6 @@ def __init__(self): self.db = db_client() self.headers = { # "Connection": "keep-alive", - "Host": "www.pixiv.net", "referer": "https://www.pixiv.net/", "origin": "https://accounts.pixiv.net", "accept-language": "zh-CN,zh;q=0.9", # 返回translation,中文翻译的标签组 @@ -61,6 +70,18 @@ def __init__(self): # print("user_id",self.client.user_id) + # 自适应限速:仅对信息类接口(ajax/touch)生效;下载不受限 + self._adaptive_delay = 0.0 # 当前等待(秒) + self._adaptive_delay_max = ADAPTIVE_DELAY_MAX + self._adaptive_increase = ADAPTIVE_DELAY_INCREASE + self._adaptive_decay_ratio = ADAPTIVE_DELAY_DECAY_RATIO + + def calculate_host(self, url): + try: + return url.split("//")[-1].split("/")[0] + except Exception: + return "" + def baseRequest(self, options, data=None, params=None, retry_num=5): ''' :params options: 请求参数,暂时只用到headers和url @@ -76,9 +97,18 @@ def baseRequest(self, options, data=None, params=None, retry_num=5): options ={"url":"origin_url","headers":demo_headers} baseRequest(options=options) ''' - base_headers = [options["headers"] if "headers" in options.keys() else self.headers][0] + base_headers = dict(options["headers"]) if "headers" in options.keys() else dict(self.headers) + # 动态设置 Host,避免 421 + host = self.calculate_host(options.get("url", "")) + if host: + base_headers["Host"] = host try: + # 自适应限速:仅对信息类接口生效 + _url = options.get("url", "") + if ADAPTIVE_LIMIT_ENABLED and any(key in _url for key in ["/ajax/", "/touch/ajax/"]): + if self._adaptive_delay > 0: + time.sleep(self._adaptive_delay) # if options["method"].lower() == "get": # 网络请求函数get、post请求,暂时不判断method字段,待后续更新 # logger.debug("cookie_list {}".format(len(self.cookie_list))) @@ -91,6 +121,19 @@ def baseRequest(self, options, data=None, params=None, retry_num=5): verify = False, timeout = 10, ) + # 非 200 情况处理 + if getattr(response, "status_code", 0) != 200: + # 若是信息接口 404,多半为作品被删除/私密,返回可解析的错误 JSON,避免误判为网络失败 + if response.status_code == 404 and any(key in _url for key in ["/ajax/illust/", "/touch/ajax/illust/details"]): + class _FakeResponse: + pass + fake = _FakeResponse() + fake.status_code = 404 + fake.text = json.dumps({"error": True, "message": TEMP_MSG["PID_DELETED_TEXT"]}) + return fake + # 其他情况按失败处理 + logger.warning(f"请求失败[{response.status_code}] - {options['url']}") + return None return response except Exception as e: if retry_num > 0: @@ -125,6 +168,8 @@ def get_illust_info(self, pid, extra="pixiv"): resp = json.loads(r.text) except json.decoder.JSONDecodeError as e: logger.warning(TEMP_MSG["JSON_DECODE_ERR"].format(r.text)) + # 解析失败视为异常,轻微抬升延迟 + self._adaptive_delay = min(self._adaptive_delay_max, self._adaptive_delay + self._adaptive_increase / 2) return None # 未登录 @@ -152,9 +197,16 @@ def get_illust_info(self, pid, extra="pixiv"): return TEMP_MSG["PID_UNAUTH_ACCESS_2"] elif resp["message"] == TEMP_MSG["LIMIT_TEXT"]: + # 命中限流:抬升自适应延迟 + self._adaptive_delay = min(self._adaptive_delay_max, self._adaptive_delay + self._adaptive_increase) return TEMP_MSG["LIMIT_TEXT"] + # 兜底:常见 404/错误页的“无法找到您所请求的页面” + elif TEMP_MSG["PID_ERROR_TEXT"] in str(resp.get("message", "")): + return TEMP_MSG["PID_ERROR_TEXT"] # 作品数据 + # 请求成功:衰减自适应延迟 + self._adaptive_delay *= self._adaptive_decay_ratio info = resp["body"] # uid uid = int(info["author_details"]["user_id"]) @@ -293,7 +345,7 @@ def illustSingle(self, data): name = "{}.{}".format(data["pid"],original.split(".")[-1]) illustPath = os.path.join(path_,name) - if os.path.exists(illustPath) == True and os.path.getsize(illustPath) > 1000: + if os.path.exists(illustPath) == True and os.path.getsize(illustPath) > 1000 and IsValidImage(illustPath): # 作品存在且大于1000字节,为了避免58字节错误页面和其他错误页面 # logger.info("{}已存在".format(name)) pass @@ -303,7 +355,8 @@ def illustSingle(self, data): return None size = self.downSomething(illustPath,c.content) logger.success(TEMP_MSG["DM_DOWNLOAD_SUCCESS_INFO"].format(self.class_name,name,self.size2Mb(size))) - time.sleep(1) + if DOWNLOAD_POST_DELAY_SECONDS > 0: + time.sleep(DOWNLOAD_POST_DELAY_SECONDS) def illustMulti(self, data): """ @@ -327,7 +380,7 @@ def illustMulti(self, data): # 78997178-0.png name = new_original.split("/")[-1].replace("_p","-") illustPath = os.path.join(path_,name) - if os.path.exists(illustPath) == True and os.path.getsize(illustPath) > 1000: + if os.path.exists(illustPath) == True and os.path.getsize(illustPath) > 1000 and IsValidImage(illustPath): # logger.debug("{}已存在".format(name)) pass else: @@ -336,7 +389,8 @@ def illustMulti(self, data): return None size = self.downSomething(illustPath,c.content) logger.success(TEMP_MSG["DM_DOWNLOAD_SUCCESS_INFO"].format(self.class_name,name,self.size2Mb(size))) - time.sleep(1) + if DOWNLOAD_POST_DELAY_SECONDS > 0: + time.sleep(DOWNLOAD_POST_DELAY_SECONDS) def illustGif(self, data): """ diff --git a/v2.0/folder.py b/v2.0/folder.py index dabb600..556a37e 100644 --- a/v2.0/folder.py +++ b/v2.0/folder.py @@ -18,13 +18,39 @@ def __init__(self): isExists = os.path.exists(self.bk_path) if not isExists:os.makedirs(self.bk_path) + # uid 到画师目录的内存索引,避免频繁全盘扫描 + self.uid_to_user_path = {} + self._build_user_index() + + def _build_user_index(self): + """ + 扫描一次根目录,建立 uid -> user_path 的索引。 + 目录命名规则为:"{uid}--{userName}",取 "--" 之前的部分作为 uid。 + """ + try: + for folder in os.listdir(self.path): + # 只索引形如 uid--name 的目录 + if "--" in folder: + uid_prefix = folder.split('--')[0] + if uid_prefix.isdigit(): + self.uid_to_user_path[uid_prefix] = os.path.join(self.path, folder) + except FileNotFoundError: + # 根目录尚未创建时忽略 + pass + def select_user_path(self, uid, userName): - for folder in os.listdir(self.path): - if str(uid) == folder.split('--')[0]: - user_path = os.path.join(self.path,folder) - return user_path - else: - return self.mkdir_painter({"uid":uid,"userName":userName}) + uid_str = str(uid) + # 优先走内存索引 + user_path = self.uid_to_user_path.get(uid_str) + if user_path and os.path.exists(user_path): + return user_path + # 索引缺失或目录不存在时,重建一次索引 + self._build_user_index() + user_path = self.uid_to_user_path.get(uid_str) + if user_path and os.path.exists(user_path): + return user_path + # 未找到则创建 + return self.mkdir_painter({"uid":uid,"userName":userName}) def mkdir_painter(self, info): ''' @@ -43,10 +69,14 @@ def mkdir_painter(self, info): for folder in os.listdir(self.path): if str(uid) == folder.split('--')[0]: user_path = os.path.join(self.path,folder) + # 回填索引 + self.uid_to_user_path[str(uid)] = user_path return user_path user_path = os.path.join(self.path,painter_name) os.makedirs(user_path) + # 新建后回填索引 + self.uid_to_user_path[str(uid)] = user_path return user_path def mkdir_illusts(self, user_path,pid): @@ -74,15 +104,30 @@ def search_isExistsPid(self,root,extra="c",*args): :return True or False 已存在/不存在 ''' if extra == "c": - uid = args[0] - pid = args[1] - flag = glob.glob(os.path.join(root, f"{uid}**/{pid}/{pid}**.**")) + uid = str(args[0]) + pid = str(args[1]) + # 通过索引快速定位画师目录,避免全盘 glob + user_path = self.uid_to_user_path.get(uid) + if not user_path or not os.path.isdir(user_path): + # 尝试重建索引后再取一次 + self._build_user_index() + user_path = self.uid_to_user_path.get(uid) + if not user_path or not os.path.isdir(user_path): + return False + illust_dir = os.path.join(user_path, pid) + if not os.path.isdir(illust_dir): + return False + # 仅在该插画目录内局部匹配,提高效率 + flag = glob.glob(os.path.join(illust_dir, f"{pid}*.*")) + return bool(flag) elif extra == "b": - pid = args[0] - flag = glob.glob(os.path.join(root, f"{pid}/{pid}**.**")) + pid = str(args[0]) + illust_dir = os.path.join(root, pid) + if not os.path.isdir(illust_dir): + return False + flag = glob.glob(os.path.join(illust_dir, f"{pid}*.*")) + return bool(flag) - if flag: - return True return False diff --git a/v2.0/image_check.py b/v2.0/image_check.py new file mode 100644 index 0000000..aa4559f --- /dev/null +++ b/v2.0/image_check.py @@ -0,0 +1,107 @@ +from os import PathLike +import os +from io import BytesIO +import argparse +from typing import Iterable, Tuple +from PIL import Image, ImageSequence, UnidentifiedImageError + +# 判断文件是否为有效(完整且可解码)的图片 +# 输入参数为文件路径,或二进制文件对象 +def IsValidImage(file) -> bool: + try: + # 路径输入:优先走不读全文件的校验路径 + if isinstance(file, (str, PathLike)): + file_path = str(file) + if not os.path.exists(file_path) or os.path.getsize(file_path) == 0: + return False + # 结构校验 + with Image.open(file_path) as im: + im.verify() + # 解码校验(重新打开;动图需逐帧解码) + with Image.open(file_path) as im: + if getattr(im, "is_animated", False): + for frame in ImageSequence.Iterator(im): + frame.load() + else: + im.load() + return True + + # 文件对象输入:复制到内存后双重校验 + try: + file.seek(0) + except Exception: + pass + buf = file.read() + if not buf: + return False + with Image.open(BytesIO(buf)) as im: + im.verify() + with Image.open(BytesIO(buf)) as im: + if getattr(im, "is_animated", False): + for frame in ImageSequence.Iterator(im): + frame.load() + else: + im.load() + return True + except (UnidentifiedImageError, OSError, SyntaxError): + return False + + +def _iter_image_files(root_dir: str, extensions: Tuple[str, ...]) -> Iterable[str]: + for root, _, files in os.walk(root_dir): + image_files = [fn for fn in files if fn.lower().endswith(extensions)] + if not image_files: + continue + for filename in image_files: + yield os.path.join(root, filename) + + +def main(): + parser = argparse.ArgumentParser(description="校验目录下图片文件是否损坏(Pillow verify+load 双重校验)") + parser.add_argument( + "path", + nargs="?", + default=os.environ.get("PIXIC_CHECK_PATH", r"E:\\BaiduNetdiskDownload\\pixic"), + help="需要检测的根目录(默认读取环境变量 PIXIC_CHECK_PATH,若无则为示例路径)", + ) + parser.add_argument( + "--ext", + nargs="*", + default=[".png", ".jpg", ".jpeg", ".webp", ".gif", ".bmp"], + help="需要检测的图片扩展名列表(不区分大小写)", + ) + parser.add_argument( + "--output", + default="./corrupted_images.txt", + help="输出损坏文件路径的文本文件", + ) + parser.add_argument( + "--append", + action="store_true", + help="是否以追加模式写入输出文件(默认覆盖)", + ) + args = parser.parse_args() + + extensions = tuple(ext.lower() if ext.startswith(".") else f".{ext.lower()}" for ext in args.ext) + + if not args.append: + # 覆盖清空旧结果 + with open(args.output, "w", encoding="utf-8") as f: + pass + + total = 0 + broken = 0 + + for img_file in _iter_image_files(args.path, extensions): + total += 1 + if not IsValidImage(img_file): + broken += 1 + print(img_file) + with open(args.output, "a", encoding="utf-8") as output_file: + output_file.write(img_file + "\n") + + print(f"checked: {total}, corrupted: {broken}, ok: {total - broken}") + + +if __name__ == "__main__": + main() \ No newline at end of file diff --git a/v2.0/log_record.py b/v2.0/log_record.py index ce7689b..268dc3b 100644 --- a/v2.0/log_record.py +++ b/v2.0/log_record.py @@ -12,6 +12,9 @@ import sys from loguru import logger from config import DEBUG +import threading +import re +import atexit if DEBUG: level = "DEBUG" @@ -19,15 +22,160 @@ level = "INFO" log_path = os.path.join(os.path.dirname(os.path.abspath(__file__)), "log") + # remove default handler logger.remove() -# 控制台输出 -logger.add( - sys.stderr, - level=level + +# ============== 控制台单行汇聚日志(多前缀) ============== +_agg_state = { + "active": False, # 是否处于单行显示状态 + "width": 0, # 当前行宽,用于清除 + "count": 0, # 当前类别累计计数 + "category": None, # 当前类别标识 +} +_agg_lock = threading.Lock() +_ansi_re = re.compile(r"\x1b\[[0-9;]*m") + +def _visible_len(text: str) -> int: + try: + return len(_ansi_re.sub("", text)) + except Exception: + return len(text) +def _erase_line(): + """清除整行内容并将光标回到行首。""" + try: + # ANSI: 清除整行并回到行首,兼容支持虚拟终端的 Windows 终端/PowerShell 7+ + sys.stderr.write("\r\x1b[2K\r") + # 兼容降级:若终端不支持 ANSI,则用空格覆盖当前记录宽度 + if _agg_state.get("width", 0) > 0: + sys.stderr.write(" " * _agg_state["width"]) + sys.stderr.write("\r") + sys.stderr.flush() + except Exception: + pass + + +def _is_skip_message(record): + try: + msg = record.get("message", "") + return isinstance(msg, str) and msg.startswith("SKIP_ISEXISTS_ILLUST") + except Exception: + return False + + +def _is_bookmark_now_message(record): + try: + msg = record.get("message", "") + return isinstance(msg, str) and msg.startswith("Pixiv收藏作品第") + except Exception: + return False + + +def _is_skip_db_message(record): + try: + msg = record.get("message", "") + return isinstance(msg, str) and msg.startswith("SKIP_DB_EXISTS_ILLUST") + except Exception: + return False + + +def _clear_agg_line(): + if _agg_state["active"]: + _erase_line() + _agg_state["active"] = False + _agg_state["width"] = 0 + _agg_state["count"] = 0 + _agg_state["category"] = None + + +def _general_console_sink(message: str): + # 在输出其他日志前,若存在单行聚合状态,则先清除该行 + with _agg_lock: + _clear_agg_line() + sys.stderr.write(message) + + +def _agg_write(category: str, message: str): + # 单行动态刷新聚合输出 + with _agg_lock: + # 切换类别时清空行并重置计数 + if _agg_state["category"] != category: + _clear_agg_line() + _agg_state["category"] = category + _agg_state["count"] += 1 + text = f"{message.strip()} (x{_agg_state['count']})" + try: + # 使用 ANSI 清行,避免多字节/颜色转义导致的残留 + _erase_line() + sys.stderr.write("\r" + text) + sys.stderr.flush() + _agg_state["active"] = True + _agg_state["width"] = _visible_len(text) + except Exception: + sys.stderr.write(message + "\n") + + +def _skip_console_sink(message: str): + _agg_write("skip_exists", message) + + +def _bookmark_now_console_sink(message: str): + _agg_write("bookmark_now", message) + + +# 控制台输出(常规日志,不包括单行聚合的行) +logger.add( + _general_console_sink, + level=level, + filter=lambda record: not ( + _is_skip_message(record) or _is_bookmark_now_message(record) or _is_skip_db_message(record) + ), + colorize=True +) + +# 控制台输出(仅处理以 SKIP_ISEXISTS_ILLUST 开头的行,单行动态刷新) +logger.add( + _skip_console_sink, + level=level, + filter=lambda record: _is_skip_message(record), + format="{message}", + colorize=True +) + +# 控制台输出(仅处理 BOOKMARK_NOW_INFO 行,单行动态刷新) +logger.add( + _bookmark_now_console_sink, + level=level, + filter=lambda record: _is_bookmark_now_message(record), + format="{message}", + colorize=True +) + +# 控制台输出(仅处理 SKIP_DB_EXISTS_ILLUST 行,单行动态刷新) +def _skip_db_console_sink(message: str): + _agg_write("skip_db_exists", message) + + +logger.add( + _skip_db_console_sink, + level=level, + filter=lambda record: _is_skip_db_message(record), + format="{message}", + colorize=True ) -# 日志写入 -logger.add( + +# 程序结束时,确保清除聚合行,避免残留 +def _agg_finalize(): + try: + with _agg_lock: + _clear_agg_line() + except Exception: + pass + +atexit.register(_agg_finalize) + +# 日志写入文件(保持原状,包含所有日志) +logger.add( os.path.join(log_path, "{time}.log"), encoding="utf-8", rotation="00:00", diff --git a/v2.0/login.py b/v2.0/login.py index 29873ba..b14f3e5 100644 --- a/v2.0/login.py +++ b/v2.0/login.py @@ -107,7 +107,8 @@ def get_cookie(self): chrome_options.add_argument('user-data-dir='+PRO_DIR) try: - driver = webdriver.Chrome(chrome_options=chrome_options) + # 兼容新版 Selenium:使用 options 参数 + driver = webdriver.Chrome(options=chrome_options) # selenium.common.exceptions.WebDriverException: # Message: unknown error: cannot create default profile directory # PRO_DIR错误 diff --git a/v2.0/requirements.txt b/v2.0/requirements.txt index ab30903..2cacb01 100644 --- a/v2.0/requirements.txt +++ b/v2.0/requirements.txt @@ -5,4 +5,5 @@ lxml PyMySQL==0.9.3 requests==2.23.0 selenium==3.141.0 -loguru==0.5.3 \ No newline at end of file +loguru==0.5.3 +Pillow \ No newline at end of file