Skip to content

Commit 86b4937

Browse files
authored
Merge pull request #2626 from javadmirer/master
Add avbase python scraper
2 parents 88b089a + 0128208 commit 86b4937

File tree

2 files changed

+154
-0
lines changed

2 files changed

+154
-0
lines changed
Lines changed: 131 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,131 @@
1+
import base64
2+
import json
3+
import sys
4+
from datetime import datetime
5+
6+
import py_common.log as log
7+
import requests
8+
from lxml import html
9+
from py_common.config import get_config
10+
from py_common.types import ScrapedScene, SceneSearchResult
11+
from py_common.util import scraper_args
12+
13+
# Site root; search URLs and scraped relative links are built on top of it.
BASE_URL = "https://www.avbase.net"

# DO NOT EDIT THIS FILE
# run the scraper once and edit the config.ini file instead
config = get_config(
    # Default config text, one line per literal (same text as a single
    # multi-line string).
    default=(
        "# flaresolverr endpoint, change this if flaresolverr is not running on the same server as Stash\n"
        "flaresolverr_url = http://localhost:8191/v1\n"
    )
)
22+
23+
def fetch_page_html(url):
    """Fetch a page through FlareSolverr and return its HTML.

    Posts a ``request.get`` command for *url* to the endpoint configured as
    ``flaresolverr_url`` and extracts the page body from the JSON reply.

    Args:
        url: Address of the page to retrieve.

    Returns:
        The ``["solution"]["response"]`` value of the FlareSolverr reply.

    Raises:
        requests.exceptions.RequestException: If the endpoint cannot be
            reached or does not answer before the timeout.
        KeyError: If the reply carries no ``solution``/``response`` entry.
    """
    max_timeout_ms = 60000
    payload = {
        "cmd": "request.get",
        "url": url,
        "maxTimeout": max_timeout_ms,
    }
    # requests waits forever unless told otherwise. Allow slightly longer than
    # the maxTimeout handed to FlareSolverr so that it gets the chance to
    # report its own timeout before this client gives up.
    response = requests.post(
        config.flaresolverr_url,
        headers={"Content-Type": "application/json"},
        json=payload,
        timeout=max_timeout_ms / 1000 + 10,
    )
    return response.json()["solution"]["response"]
33+
34+
35+
def scene_search(url) -> list[ScrapedScene]:
    """Scrape a listing page and map each result card to a scene.

    Args:
        url: Address of the listing page to fetch.

    Returns:
        One ``map_scene`` result per ``<div class="relative">`` element found
        in the page, in document order.
    """
    document = html.fromstring(fetch_page_html(url))
    return [map_scene(card) for card in document.xpath('//div[@class="relative"]')]
41+
42+
43+
def process_date(date_string):
    """Convert a ``YYYY/MM/DD`` date string to ISO format (``YYYY-MM-DD``).

    Raises:
        ValueError: If *date_string* does not match ``%Y/%m/%d``.
    """
    parsed = datetime.strptime(date_string, '%Y/%m/%d')
    return parsed.date().isoformat()
45+
46+
47+
def map_scene(node) -> SceneSearchResult:
    """Build a search result from one result card of a listing page.

    Reads title, performers, label, cover image, link and release date from
    *node*; the cover is downloaded and embedded as a base64 data URI.

    Args:
        node: Element for a single result card.

    Returns:
        A dict with ``title``, ``urls``, ``image``, ``studio``, ``date`` and
        ``performers``.

    Raises:
        IndexError: If any of the expected elements is missing from the card.
        ValueError: If the release date is not in ``YYYY/MM/DD`` form.
    """

    def first(expression):
        # First match of an XPath evaluated relative to this card.
        return node.xpath(expression)[0]

    title = first('.//div[@class="grow"]/a[contains(@href,"/works")]').text.strip()
    performers = [
        map_by_name(span)
        for span in node.xpath('.//a[contains(@href,"/talents")]/span')
    ]
    studio_name = first('.//a[contains(@href,"label")]').text
    cover_b64 = fetch_as_base64(first('.//img[@loading]/@src'))
    # Hrefs on the card are site-relative, so prefix the site root.
    work_url = BASE_URL + first('.//div[@class="grow"]/a[contains(@href,"/works")]/@href')
    release_date = process_date(first('.//a[contains(@href,"/works/date")]').text)

    return {
        "title": title,
        "urls": [work_url],
        "image": f"data:image/jpeg;base64,{cover_b64}",
        "studio": {"name": studio_name},
        "date": release_date,
        "performers": performers,
    }
68+
69+
70+
def map_by_name(node):
    """Wrap the text of *node* in a ``{"name": ...}`` dict."""
    name = node.text
    return {"name": name}
74+
75+
76+
def scene_by_name(name) -> list[ScrapedScene]:
    """Search AVBase for works matching *name*.

    Args:
        name: Free-text search term; surrounding whitespace is ignored.

    Returns:
        The scenes scraped from the ``/works?q=...`` results page.
    """
    # Imported here so the function does not depend on the module's import block.
    from urllib.parse import quote

    # Percent-encode the term: characters such as "&", "#", "+", "%" or spaces
    # would otherwise alter or truncate the query string.
    query = quote(name.strip(), safe="")
    return scene_search(f"{BASE_URL}/works?q={query}")
79+
80+
81+
def scene_by_url(url) -> ScrapedScene:
    """Scrape a single work page into a scene.

    Args:
        url: Address of the work page; it is also reported back as the
            scene's only URL.

    Returns:
        A dict with ``title``, ``urls``, ``image`` (base64 data URI),
        ``studio``, ``date``, ``director``, ``details``, ``performers``,
        ``code`` and ``tags``. ``director`` and ``details`` are ``None`` when
        the page has no matching element.

    Raises:
        IndexError: If a required element (date, maker, title, cover image or
            code) is missing from the page.
        ValueError: If the release date is not in ``YYYY/MM/DD`` form.
    """
    tree = html.fromstring(fetch_page_html(url))

    raw_date = tree.xpath("//a[contains(@href,'/works/date/')]")[0].text
    date = process_date(raw_date)
    maker = tree.xpath("//a[contains(@href,'/makers')]")[0].text
    # Optional: the first search link on the page is taken as the director.
    director = tree.xpath("//a[contains(@href,'/works?q=')]")
    title = tree.xpath("//h1[contains(@class, 'text-lg')]")[0].text
    performers = [
        map_by_name(chip)
        for chip in tree.xpath("//a[contains(@class, 'chip')]/span[1]")
    ]
    image_url = tree.xpath("//a[contains(@class,'md:grow')]/div/img/@src")[0]
    image_bytes = fetch_as_base64(image_url)
    # "image/jpeg" is the registered media type ("image/jpg" is not) and is
    # what map_scene already emits for search results.
    # NOTE(review): assumes covers are always JPEG - confirm.
    image_base64 = f"data:image/jpeg;base64,{image_bytes}"
    tags = [map_by_name(tag) for tag in tree.xpath("//a[contains(@href,'/tags/')]")]
    code = tree.xpath("//span[contains(text(), '名寄せID: ')]/following-sibling::div/span[contains(text(),'-')]")[0].text
    # Optional: the block that follows the "紹介文" heading.
    description = tree.xpath("//div[contains(@class, 'text-xs') and contains(text(), '紹介文')]/following-sibling::div")

    scene: ScrapedScene = {
        "title": title,
        "urls": [url],
        "image": image_base64,
        "studio": {"name": maker},
        "date": date,
        "director": director[0].text if director else None,
        "details": description[0].text if description else None,
        "performers": performers,
        "code": code,
        "tags": tags,
    }
    return scene
113+
114+
115+
def fetch_as_base64(url: str) -> str | None:
    """Download *url* and return the response body as base64 text.

    Args:
        url: Address of the resource (the callers pass image URLs).

    Returns:
        The body, base64-encoded and decoded to a ``str``. The body is encoded
        whatever the HTTP status code is.

    Raises:
        requests.exceptions.RequestException: If the request fails or the
            server does not respond before the timeout.
    """
    # Without an explicit timeout requests would wait indefinitely on a
    # stalled server and hang the whole scrape.
    response = requests.get(url, timeout=30)
    return base64.b64encode(response.content).decode('utf-8')
117+
118+
if __name__ == "__main__":
119+
op, args = scraper_args()
120+
result = None
121+
match op, args:
122+
case "scene-by-name", {"name": name} if name:
123+
result = scene_by_name(name)
124+
case "scene-by-query-fragment", {"url": url} if url:
125+
result = scene_by_url(url)
126+
case "scene-by-url", {"url": url} if url:
127+
result = scene_by_url(url)
128+
case _:
129+
log.error(f"Operation: {op}, arguments: {json.dumps(args)}")
130+
sys.exit(1)
131+
print(json.dumps(result))
Lines changed: 23 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,23 @@
1+
# Scraper definition: every action runs avbase-python.py through python, with
# the operation name passed as the script's argument.
name: avbase-python

sceneByName:
  action: script
  script: [python, avbase-python.py, scene-by-name]

sceneByQueryFragment:
  action: script
  script: [python, avbase-python.py, scene-by-query-fragment]

sceneByURL:
  - action: script
    script: [python, avbase-python.py, scene-by-url]
    # URL pattern this action is registered for.
    url: [avbase.net/works/]

0 commit comments

Comments
 (0)