import base64
import json
import sys
from datetime import datetime
from urllib.parse import quote_plus

import py_common.log as log
import requests
from lxml import html
from py_common.config import get_config
from py_common.types import ScrapedScene, SceneSearchResult
from py_common.util import scraper_args

BASE_URL = "https://www.avbase.net"

# DO NOT EDIT THIS FILE
# run the scraper once and edit the config.ini file instead
config = get_config(
    default="""# flaresolverr endpoint, change this if flaresolverr is not running on the same server as Stash
flaresolverr_url = http://localhost:8191/v1
"""
)


def fetch_page_html(url: str) -> str:
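    """Fetch a page's HTML through FlareSolverr, which solves Cloudflare's challenge for us."""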
    headers = {"Content-Type": "application/json"}
    data = {
        "cmd": "request.get",
        "url": url,
        "maxTimeout": 60000,
    }
    # give requests a little headroom over FlareSolverr's own 60s maxTimeout
    response = requests.post(config.flaresolverr_url, headers=headers, json=data, timeout=70)
    return response.json()["solution"]["response"]


def scene_search(url: str) -> list[SceneSearchResult]:
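    """Scrape a search results page, mapping each result card to a SceneSearchResult."""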
    scraped = fetch_page_html(url)
    tree = html.fromstring(scraped)
    scenes = tree.xpath('//div[@class="relative"]')

    return list(map(map_scene, scenes))


def process_date(date_string: str) -> str:
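    """Convert avbase's YYYY/MM/DD dates to the ISO YYYY-MM-DD format Stash expects."""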
    return datetime.strptime(date_string, '%Y/%m/%d').date().isoformat()


def map_scene(node) -> SceneSearchResult:
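    """Map a single search result card to a SceneSearchResult."""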
    title = node.xpath('.//div[@class="grow"]/a[contains(@href,"/works")]')[0].text.strip()
    performers = list(map(map_by_name, node.xpath('.//a[contains(@href,"/talents")]/span')))
    maker = node.xpath('.//a[contains(@href,"label")]')[0].text
    scraped_image = node.xpath('.//img[@loading]/@src')[0]
    image_bytes = fetch_as_base64(scraped_image)
    image_base64 = f"data:image/jpeg;base64,{image_bytes}"
    url = BASE_URL + node.xpath('.//div[@class="grow"]/a[contains(@href,"/works")]/@href')[0]
    raw_date = node.xpath('.//a[contains(@href,"/works/date")]')[0].text
    date = process_date(raw_date)
    scene: SceneSearchResult = {
        "title": title,
        "urls": [url],
        "image": image_base64,
        "studio": {"name": maker},
        "date": date,
        "performers": performers,
    }
    return scene


def map_by_name(node):
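    """Wrap an element's text in the {"name": ...} shape used for performers and tags."""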
    return {"name": node.text}


def scene_by_name(name: str) -> list[SceneSearchResult]:
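    """Search avbase for a scene by name and scrape the results page."""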
    # quote_plus so spaces and non-ASCII characters in the name survive the query string
    avbase_url = f"{BASE_URL}/works?q={quote_plus(name.strip())}"
    return scene_search(avbase_url)


def scene_by_url(url: str) -> ScrapedScene:
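    """Scrape a single avbase work page into a full ScrapedScene."""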
    scraped = fetch_page_html(url)
    tree = html.fromstring(scraped)
    raw_date = tree.xpath("//a[contains(@href,'/works/date/')]")[0].text
    date = process_date(raw_date)
    maker = tree.xpath("//a[contains(@href,'/makers')]")[0].text
    director = tree.xpath("//a[contains(@href,'/works?q=')]")
    title = tree.xpath("//h1[contains(@class, 'text-lg')]")[0].text
    performers = list(map(map_by_name, tree.xpath("//a[contains(@class, 'chip')]/span[1]")))
    image_url = tree.xpath("//a[contains(@class,'md:grow')]/div/img/@src")[0]
    image_bytes = fetch_as_base64(image_url)
    image_base64 = f"data:image/jpeg;base64,{image_bytes}"
    # url = tree.xpath("html/head//link[@rel='canonical']/@href")[0]
    tags = list(map(map_by_name, tree.xpath("//a[contains(@href,'/tags/')]")))
    # 名寄せID ("merged ID") row: grab the hyphenated product code, e.g. ABC-123
    code = tree.xpath("//span[contains(text(), '名寄せID: ')]/following-sibling::div/span[contains(text(),'-')]")[0].text
    # 紹介文 ("description") section; not every work page has one
    description = tree.xpath("//div[contains(@class, 'text-xs') and contains(text(), '紹介文')]/following-sibling::div")

    scene: ScrapedScene = {
        "title": title,
        "urls": [url],
        "image": image_base64,
        "studio": {"name": maker},
        "date": date,
        "director": director[0].text if director else None,
        # text_content() keeps text spread across child elements, unlike .text
        "details": description[0].text_content().strip() if description else None,
        "performers": performers,
        "code": code,
        "tags": tags,
    }
    return scene


def fetch_as_base64(url: str) -> str:
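    """Download an image and return its contents as a base64-encoded string."""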
    return base64.b64encode(requests.get(url, timeout=30).content).decode('utf-8')


if __name__ == "__main__":
    op, args = scraper_args()
    result = None
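    # Dispatch on the operation Stash asked for (parsed from argv/stdin by scraper_args)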
    match op, args:
        case "scene-by-name", {"name": name} if name:
            result = scene_by_name(name)
        case "scene-by-query-fragment", {"url": url} if url:
            result = scene_by_url(url)
        case "scene-by-url", {"url": url} if url:
            result = scene_by_url(url)
        case _:
            log.error(f"Invalid operation: {op}, arguments: {json.dumps(args)}")
            sys.exit(1)
    print(json.dumps(result))