Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
23 changes: 20 additions & 3 deletions v2.0/bookmark.py
Original file line number Diff line number Diff line change
Expand Up @@ -7,15 +7,17 @@
# import math
import json
import time
import random

from config import BOOKMARK_HIDE_ENABLE,SKIP_ISEXISTS_ILLUST,BOOKMARK_PATH
from config import BOOKMARK_HIDE_ENABLE,SKIP_ISEXISTS_ILLUST,SKIP_DB_EXISTS_ILLUST,BOOKMARK_PATH
from downer import Downloader
from log_record import logger
from message import TEMP_MSG
from thread_pool import ThreadPool,callback
from ptimer import Timer
# TODO
from tag import TAG_FLAG_BOOKMARK
from checker import run_startup_check


class Bookmark(object):
Expand Down Expand Up @@ -136,6 +138,16 @@ def thread_by_illust(self, *args):
pid = args[0]
info = None

# 基于数据库的提前跳过,避免网络与文件系统检测
if hasattr(self.db, "pool") and SKIP_DB_EXISTS_ILLUST:
try:
exists, _ = self.db.check_illust(pid, table="bookmark")
if exists:
logger.info(f"SKIP_DB_EXISTS_ILLUST - {pid}")
return info
except Exception:
pass

# 跳过已下载插画的请求
if SKIP_ISEXISTS_ILLUST and self.file_manager.search_isExistsPid(
BOOKMARK_PATH,"b",*(pid,)):
Expand Down Expand Up @@ -202,6 +214,12 @@ def thread_by_illust(self, *args):

@logger.catch
def run(self):
# 启动一致性检查(最近 20 条)
try:
run_startup_check(self.Downloader)
except Exception as e:
logger.warning(f"启动一致性检查失败: {e}")

# TDOD TAG COUNT开始工作
TAG_FLAG_BOOKMARK = False
logger.info(TEMP_MSG["BEGIN_INFO"].format(self.class_name))
Expand Down Expand Up @@ -249,8 +267,7 @@ def run(self):
pool.put(self.thread_by_illust,(pid,),callback)

offset += self.bookmark_page_offset
# 固定休眠
time.sleep(1)
# 取消固定/随机等待,统一交由 Downloader 自适应限速(仅对线上接口生效)
except Exception as e:
logger.warning("Exception {}".format(e))
finally:
Expand Down
149 changes: 149 additions & 0 deletions v2.0/checker.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,149 @@
# coding=utf8
"""
启动一致性检查:
- 对 pixiv / bookmark 表各取最近 20 条记录,核对本地文件夹图片数量与 pageCount 是否一致;
- 使用 IsValidImage 校验图片完整性;
- 若不一致或存在损坏图片,则触发自动重下;重下后再次校验,仍失败则告警。
"""

import os
from typing import List, Tuple, Optional

from downer import Downloader
from folder import file_manager
from image_check import IsValidImage
from log_record import logger

TABLES = ["pixiv", "bookmark"]
LATEST_LIMIT = 20


def list_local_images(dir_path: str, pid: int) -> List[str]:
    """Return paths of image files in *dir_path* belonging to illust *pid*.

    A file belongs to *pid* only when its name starts with the pid followed
    by a delimiter: ``{pid}_...`` (multi-page suffixes such as ``_p0``) or
    ``{pid}.ext`` (single file). A bare prefix test would wrongly match
    other works (pid 123 matching ``1234_p0.jpg``).

    Returns [] for an empty/missing directory or on any listing error
    (best-effort: an unreadable directory is treated as empty).
    """
    if not dir_path or not os.path.isdir(dir_path):
        return []
    prefix = str(pid)
    # 常见图片扩展 (common image extensions)
    image_exts = (".jpg", ".jpeg", ".png", ".gif", ".webp", ".bmp")
    files = []
    try:
        for name in os.listdir(dir_path):
            lower = name.lower()
            # pid must be a complete token, not just a digit prefix
            if not (lower.startswith(prefix + "_") or lower.startswith(prefix + ".")):
                continue
            if lower.endswith(image_exts):
                files.append(os.path.join(dir_path, name))
    except Exception:
        return []
    return files


def verify_folder(pid: int, path: str, page_count: int, illust_type: int) -> Tuple[bool, List[str]]:
    """Check the local folder for *pid* against its DB metadata.

    Returns ``(passed, files_to_redownload)``:
    - ugoira (illust_type == 2): a valid gif OR an existing zip passes
    - single page: the first matching image must pass ``IsValidImage``
    - multi page: at least *page_count* images, every one of them valid
    """
    if not (path and os.path.isdir(path)):
        return False, []

    if illust_type == 2:
        # Animated work: verify the gif when present; a zip alone also counts.
        gif_path = os.path.join(path, f"{pid}.gif")
        if os.path.exists(gif_path):
            if IsValidImage(gif_path):
                return True, []
            return False, [gif_path]
        zip_path = os.path.join(path, f"{pid}.zip")
        return (True, []) if os.path.exists(zip_path) else (False, [])

    local = list_local_images(path, pid)

    if page_count <= 1:
        # Single-page work: whichever matching file comes first stands in.
        if not local:
            return False, []
        first = local[0]
        return (True, []) if IsValidImage(first) else (False, [first])

    # Multi-page work: require the full count, then validate each file.
    if len(local) < page_count:
        return False, local
    broken = [img for img in local if not IsValidImage(img)]
    return not broken, broken


def redownload_one(d: Downloader, table: str, rec: dict) -> bool:
    """Re-fetch one DB record's illust, then re-verify the local files.

    Returns True only when the folder passes ``verify_folder`` afterwards.
    NOTE(review): assumes ``get_illust_info`` also performs the actual
    download as a side effect — confirm against Downloader.
    """
    pid = int(rec.get("pid"))
    work_type = int(rec.get("illustType", 0))
    pages = int(rec.get("pageCount", 1))

    # Derive the save directory without writing anything back to the DB.
    target_dir = rec.get("path")
    if not target_dir or target_dir == "None":
        if table == "bookmark":
            base_dir = file_manager.bk_path
        else:
            base_dir = file_manager.select_user_path(
                int(rec.get("uid", 0)), rec.get("userName", "")
            )
        target_dir = file_manager.mkdir_illusts(base_dir, pid)

    # Go through the shared Downloader for info + download.
    source = "bookmark" if table == "bookmark" else "pixiv"
    info = d.get_illust_info(pid, extra=source)
    if not info or isinstance(info, str):
        logger.warning(f"[checker] 获取作品信息失败或不可访问: {pid} -> {info}")
        return False

    # Verify again after the retry.
    passed, bad_files = verify_folder(pid, target_dir, pages, work_type)
    if not passed:
        logger.warning(f"[checker] 重新下载后仍不一致/损坏: {pid} | invalid={len(bad_files)}")
    return passed


def run_startup_check(d: Optional[Downloader] = None):
    """Run the startup consistency check over the most recent records.

    For every table in TABLES, fetch the latest LATEST_LIMIT rows, verify
    each local folder against the DB metadata, and trigger a re-download
    for any record that fails. Per-record errors are logged and counted
    as failures so a single malformed row (e.g. pid=None) cannot abort
    the remaining checks — which a crash previously did.
    """
    d = d or Downloader()
    for table in TABLES:
        logger.info(f"[checker] 启动一致性检查 | 表: {table} | 最近 N: {LATEST_LIMIT}")
        try:
            latest = d.db.select_latest_records(table=table, limit=LATEST_LIMIT)
        except Exception as e:
            # Surface DB errors instead of silently treating them as "no records".
            logger.warning(f"[checker] 读取最近记录失败: {table} | {e}")
            latest = []
        if not latest:
            logger.info(f"[checker] 表 {table} 无记录可检")
            continue

        total = 0
        ok_cnt = 0
        fix_cnt = 0
        fail_cnt = 0

        for rec in latest:
            total += 1
            try:
                pid = int(rec.get("pid"))
                path = rec.get("path")
                page_count = int(rec.get("pageCount", 1))
                illust_type = int(rec.get("illustType", 0))

                ok, invalid = verify_folder(pid, path, page_count, illust_type)
                if ok:
                    ok_cnt += 1
                    continue

                logger.warning(f"[checker] 不一致/疑似损坏,准备重下: {table} pid={pid} path={path}")
                if redownload_one(d, table, rec):
                    fix_cnt += 1
                    logger.success(f"[checker] 重下完成 | 表:{table} PID:{pid} 再次校验通过")
                else:
                    fail_cnt += 1
                    logger.warning(f"[checker] 重下失败:{table} pid={pid}")
            except Exception as e:
                # A malformed record must not abort the rest of the sweep.
                fail_cnt += 1
                logger.warning(f"[checker] 记录检查异常: {table} rec={rec} | {e}")

        logger.info(
            f"[checker] 完成 | 表:{table} 总数:{total} 通过:{ok_cnt} 修复:{fix_cnt} 失败:{fail_cnt}"
        )


if __name__ == "__main__":
    # Allow running the consistency check standalone: python checker.py
    run_startup_check()


17 changes: 16 additions & 1 deletion v2.0/config.py
Original file line number Diff line number Diff line change
Expand Up @@ -55,6 +55,11 @@
# 2.同时也无法更新对应pid在数据库中的记录
SKIP_ISEXISTS_ILLUST = True

# 基于数据库提前跳过:
# 1. 若数据库中已存在该 pid 的记录,则不再请求作品信息(不触碰网络/文件系统)
# 2. 适用于你信任数据库记录已经代表“已下载/已处理”的场景
SKIP_DB_EXISTS_ILLUST = True

# 关注画师爬虫控制开关
PIXIV_CRAWLER_ENABLED = False

Expand Down Expand Up @@ -117,4 +122,14 @@
# ===============DEBUG===============
# TODO
DEBUG = False
# ===============DEBUG===============
# ===============DEBUG===============


# =============== 自适应限速(仅信息接口) ===============
# 启用后,仅针对 /ajax/ 或 /touch/ajax/ 的信息请求进行自适应轻量等待;
# 下载文件不受影响。未命中限流时延迟会快速衰减为 0。
ADAPTIVE_LIMIT_ENABLED = True
ADAPTIVE_DELAY_MAX = 1.0 # 最大延迟(秒)
ADAPTIVE_DELAY_INCREASE = 0.2 # 命中限流后的递增步长(秒)
ADAPTIVE_DELAY_DECAY_RATIO = 0.5 # 成功请求后的衰减系数(0-1)
# =======================================================
23 changes: 20 additions & 3 deletions v2.0/crawler.py
Original file line number Diff line number Diff line change
Expand Up @@ -6,15 +6,17 @@
"""
import json
import time
import random
import re

from config import SKIP_ISEXISTS_ILLUST,ROOT_PATH
from config import SKIP_ISEXISTS_ILLUST, SKIP_DB_EXISTS_ILLUST, ROOT_PATH
from downer import Downloader
from log_record import logger
from message import TEMP_MSG
from thread_pool import ThreadPool,callback
from tag import TAG_FLAG_USER
from ptimer import Timer
from checker import run_startup_check


class Crawler(object):
Expand Down Expand Up @@ -177,6 +179,16 @@ def thread_by_illust(self, *args):
uid = args[1]
info = None

# 基于数据库的提前跳过,避免网络与文件系统检测
if hasattr(self.db, "pool") and SKIP_DB_EXISTS_ILLUST:
try:
exists, _ = self.db.check_illust(pid)
if exists:
logger.info(f"SKIP_DB_EXISTS_ILLUST - {uid} - {pid}")
return info
except Exception:
pass

# 跳过已下载插画的请求
if SKIP_ISEXISTS_ILLUST and self.file_manager.search_isExistsPid(
ROOT_PATH,"c",*(uid,pid,)):
Expand Down Expand Up @@ -243,6 +255,12 @@ def thread_by_illust(self, *args):

@logger.catch
def run(self):
# 启动一致性检查(最近 20 条)
try:
run_startup_check(self.Downloader)
except Exception as e:
logger.warning(f"启动一致性检查失败: {e}")

# 开始工作
TAG_FLAG_USER = False
logger.info(TEMP_MSG["BEGIN_INFO"].format(self.class_name))
Expand Down Expand Up @@ -288,8 +306,7 @@ def run(self):
for pid in all_illust:
pool.put(self.thread_by_illust,(pid,u["uid"],),callback)

# 固定休眠
time.sleep(5)
# 取消固定/随机等待,统一交由 Downloader 自适应限速(仅对线上接口生效)
# 无作品更新
else:
logger.info(TEMP_MSG["NOW_USER_INFO"].format(self.class_name,position,u["userName"],u["uid"],len(all_illust)))
Expand Down
17 changes: 17 additions & 0 deletions v2.0/db.py
Original file line number Diff line number Diff line change
Expand Up @@ -289,6 +289,23 @@ def select_illust(self, pid, table="pixiv"):
cur.close()
conn.close()

def select_latest_records(self, table="pixiv", limit=20):
    """Return the most recent *limit* rows of *table*, newest first.

    Rows are ordered by the auto-increment ``id`` descending and contain
    pid/path/pageCount/illustType/uid/userName. Returns [] on any error.

    A table name cannot be bound as a query parameter, so it is validated
    as a plain Python identifier before being interpolated into the SQL,
    ruling out injection through the *table* argument. Legitimate table
    names ("pixiv", "bookmark", ...) are unaffected.
    """
    if not str(table).isidentifier():
        logger.warning(f"<Exception> - invalid table name: {table!r}")
        return []
    conn,cur = self.get_conn()
    sql = f"""SELECT pid,path,pageCount,illustType,uid,userName FROM {table} ORDER BY id DESC LIMIT %s"""
    try:
        # limit is bound as a parameter (and coerced) — only the validated
        # table identifier is interpolated.
        cur.execute(sql, (int(limit),))
        r = cur.fetchall()
        return r or []
    except Exception as e:
        logger.warning(f"<Exception> - {e}")
        return []
    finally:
        cur.close()
        conn.close()

def select_user(self, start_id, limit=100, table="pxusers"):
"""
获取关注列表用户的数据库信息
Expand Down
Loading