import base64
import json
import sys
from datetime import datetime
from urllib.parse import quote_plus

import py_common.log as log
import requests
from lxml import html
from py_common.config import get_config
from py_common.types import ScrapedScene, SceneSearchResult
from py_common.util import scraper_args

BASE_URL = "https://www.avbase.net"

# DO NOT EDIT THIS FILE
# run the scraper once and edit the config.ini file instead
config = get_config(
    default="""# flaresolverr endpoint, change this if flaresolverr is not running on the same server as Stash
flaresolverr_url = http://localhost:8191/v1
"""
)


def fetch_page_html(url: str) -> str:
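    """Fetch a page's HTML through FlareSolverr, which solves Cloudflare's challenge for us."""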
    headers = {"Content-Type": "application/json"}
    data = {
        "cmd": "request.get",
        "url": url,
        "maxTimeout": 60000,
    }
    # give requests a little headroom over FlareSolverr's own 60s maxTimeout
    response = requests.post(config.flaresolverr_url, headers=headers, json=data, timeout=70)
    return response.json()["solution"]["response"]


def scene_search(url: str) -> list[SceneSearchResult]:
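    """Scrape a search results page, mapping each result card to a SceneSearchResult."""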
    scraped = fetch_page_html(url)
    tree = html.fromstring(scraped)
    scenes = tree.xpath('//div[@class="relative"]')

    return list(map(map_scene, scenes))


def process_date(date_string: str) -> str:
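    """Convert avbase's YYYY/MM/DD dates to the ISO YYYY-MM-DD format Stash expects."""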
    return datetime.strptime(date_string, '%Y/%m/%d').date().isoformat()


def map_scene(node) -> SceneSearchResult:
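    """Map a single search result card to a SceneSearchResult."""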
    title = node.xpath('.//div[@class="grow"]/a[contains(@href,"/works")]')[0].text.strip()
    performers = list(map(map_by_name, node.xpath('.//a[contains(@href,"/talents")]/span')))
    maker = node.xpath('.//a[contains(@href,"label")]')[0].text
    scraped_image = node.xpath('.//img[@loading]/@src')[0]
    image_bytes = fetch_as_base64(scraped_image)
    image_base64 = f"data:image/jpeg;base64,{image_bytes}"
    url = BASE_URL + node.xpath('.//div[@class="grow"]/a[contains(@href,"/works")]/@href')[0]
    raw_date = node.xpath('.//a[contains(@href,"/works/date")]')[0].text
    date = process_date(raw_date)
    scene: SceneSearchResult = {
        "title": title,
        "urls": [url],
        "image": image_base64,
        "studio": {"name": maker},
        "date": date,
        "performers": performers,
    }
    return scene


def map_by_name(node):
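    """Wrap an element's text in the {"name": ...} shape used for performers and tags."""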
    return {"name": node.text}


def scene_by_name(name: str) -> list[SceneSearchResult]:
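    """Search avbase for a scene by name and scrape the results page."""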
    # quote_plus so spaces and non-ASCII characters in the name survive the query string
    avbase_url = f"{BASE_URL}/works?q={quote_plus(name.strip())}"
    return scene_search(avbase_url)


def scene_by_url(url: str) -> ScrapedScene:
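    """Scrape a single avbase work page into a full ScrapedScene."""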
    scraped = fetch_page_html(url)
    tree = html.fromstring(scraped)
    raw_date = tree.xpath("//a[contains(@href,'/works/date/')]")[0].text
    date = process_date(raw_date)
    maker = tree.xpath("//a[contains(@href,'/makers')]")[0].text
    director = tree.xpath("//a[contains(@href,'/works?q=')]")
    title = tree.xpath("//h1[contains(@class, 'text-lg')]")[0].text
    performers = list(map(map_by_name, tree.xpath("//a[contains(@class, 'chip')]/span[1]")))
    image_url = tree.xpath("//a[contains(@class,'md:grow')]/div/img/@src")[0]
    image_bytes = fetch_as_base64(image_url)
    image_base64 = f"data:image/jpeg;base64,{image_bytes}"
    # url = tree.xpath("html/head//link[@rel='canonical']/@href")[0]
    tags = list(map(map_by_name, tree.xpath("//a[contains(@href,'/tags/')]")))
    # 名寄せID ("merged ID") row: grab the hyphenated product code, e.g. ABC-123
    code = tree.xpath("//span[contains(text(), '名寄せID: ')]/following-sibling::div/span[contains(text(),'-')]")[0].text
    # 紹介文 ("description") section; not every work page has one
    description = tree.xpath("//div[contains(@class, 'text-xs') and contains(text(), '紹介文')]/following-sibling::div")

    scene: ScrapedScene = {
        "title": title,
        "urls": [url],
        "image": image_base64,
        "studio": {"name": maker},
        "date": date,
        "director": director[0].text if director else None,
        # text_content() keeps text spread across child elements, unlike .text
        "details": description[0].text_content().strip() if description else None,
        "performers": performers,
        "code": code,
        "tags": tags,
    }
    return scene


def fetch_as_base64(url: str) -> str:
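    """Download an image and return its contents as a base64-encoded string."""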
    return base64.b64encode(requests.get(url, timeout=30).content).decode('utf-8')


if __name__ == "__main__":
    op, args = scraper_args()
    result = None
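    # Dispatch on the operation Stash asked for (parsed from argv/stdin by scraper_args)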
    match op, args:
        case "scene-by-name", {"name": name} if name:
            result = scene_by_name(name)
        case "scene-by-query-fragment", {"url": url} if url:
            result = scene_by_url(url)
        case "scene-by-url", {"url": url} if url:
            result = scene_by_url(url)
        case _:
            log.error(f"Invalid operation: {op}, arguments: {json.dumps(args)}")
            sys.exit(1)
    print(json.dumps(result))