Skip to content

Commit a1316db

Browse files
vhbui02 and eight04 authored
Add: support colamanga, fix imagehandler doesn't work on temp files (#401)
* feat: remove `www.colamanga.com` from `oh.py` module, refs #398 * fix: `imagehandler` is ignored even when existed in module * feat: add 'colamanga.com' support, closes #398 * fix: better way to handle is ignored * Change: use only one vm --------- Co-authored-by: eight04 <eight04@gmail.com>
1 parent 68cd735 commit a1316db

File tree

3 files changed

+239
-4
lines changed

3 files changed

+239
-4
lines changed

comiccrawler/crawler.py

Lines changed: 7 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -133,8 +133,13 @@ def handle_image(self):
133133
if hasattr(self.mod, "imagehandler"):
134134
if not self.image_bin and self.tempfile_complete:
135135
self.image_bin = content_read(self.tempfile, raw=True)
136-
self.image_ext, self.image_bin = self.mod.imagehandler(
137-
self.image_ext, self.image_bin)
136+
137+
new_image_ext, new_image_bin = self.mod.imagehandler(self.image_ext, self.image_bin)
138+
if new_image_ext != self.image_ext or new_image_bin != self.image_bin:
139+
self.image_ext = new_image_ext
140+
self.image_bin = new_image_bin
141+
if self.tempfile:
142+
content_write(self.tempfile, content=self.image_bin)
138143

139144
def save_image(self):
140145
"""Write image to save path"""

comiccrawler/mods/colamanga.py

Lines changed: 231 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,231 @@
1+
#! python3
2+
# fmt: off
3+
4+
"""colamanga
5+
6+
E.g. Zombie World
7+
https://www.colamanga.com/manga-mr87910/
8+
"""
9+
10+
import re
11+
from html import unescape
12+
13+
# import time
14+
from urllib.parse import urljoin
15+
16+
from Cryptodome.Cipher import AES
17+
from deno_vm import VM
18+
19+
from ..core import Episode, grabhtml
20+
from ..session_manager import session_manager
21+
22+
# Hostnames this module handles; comiccrawler dispatches URLs on these.
domain = ["www.colamanga.com"]
# Display name of the module.
name = "colamanga"
25+
26+
def load_config():
    """Prepare the shared session used to download images.

    The image CDN checks the Origin header, so the session for
    img.colamanga.com is primed with browser-like request headers.
    """
    extra_headers = {
        "accept": "*/*",
        "accept-language": "en-US,en;q=0.9",
        "origin": "https://www.colamanga.com",
    }
    session = session_manager.get("https://img.colamanga.com")
    session.headers.update(extra_headers)
35+
36+
37+
def get_title(html, url):
    """Extract the comic title from the og:comic:book_name meta tag."""
    match = re.search('meta property="og:comic:book_name" content="([^"]+)', html)
    raw_title = match.group(1)
    # The attribute value may contain HTML entities and stray whitespace.
    return unescape(raw_title).strip()
42+
43+
44+
def get_episodes(html, url):
    """Collect episode links from the comic page, returned oldest-first.

    The page lists chapters newest-first, so the collected order is
    reversed before returning.
    """
    episodes = [
        Episode(unescape(ep_title), urljoin(url, ep_href))
        for ep_title, ep_href in (
            m.groups() for m in re.finditer(r'title="([^"]+)" href="([^"]+)', html)
        )
    ]
    episodes.reverse()
    return episodes
50+
51+
52+
class ScriptCache:
    """Download-once cache for remote JS files referenced by a page."""

    def __init__(self):
        # Maps a script filename pattern to its downloaded source text.
        self.cache = {}

    def fetch(self, html, url, scripts):
        """Download every script in ``scripts`` that is not cached yet.

        Each entry is a regex fragment matching the script filename; the
        full URL is located in ``html`` via its ``src`` attribute.
        """
        pending = (s for s in scripts if s not in self.cache)
        for script in pending:
            js_url = re.search('src="([^"]+{})'.format(script), html).group(1)
            self.cache[script] = grabhtml(urljoin(url, js_url))

    def __getitem__(self, script):
        return self.cache[script]

    def __str__(self):
        # All cached sources joined in insertion order, ready to be
        # embedded into a VM payload.
        return "\n".join(self.cache.values())
69+
70+
71+
# Shared cache of the site's JS files, reused across get_images() calls.
scripts = ScriptCache()
# Module-level state handed from get_images() to imagehandler()
# (holds the AES decryption key). NOTE: mutating a global dictionary
# does not require the `global` keyword.
data = {}
74+
75+
76+
def get_images(html, url):
    """Return the image URLs of an episode page.

    The page's obfuscated scripts are executed inside a deno_vm sandbox
    together with a fake browser environment (Proxy-based DOM stubs);
    the sandbox yields the per-page image URL list and, when available,
    the AES key used to encrypt the images. The key is stored in the
    module-level ``data`` dict for :func:`imagehandler`.

    NOTE(review): this source was recovered from a diff view that lost
    indentation; the indentation inside the JS payload string is
    reconstructed (insignificant to JS execution) — confirm against the
    repository before relying on exact string bytes.
    """
    # Encrypted chapter metadata embedded in the page.
    cdata = re.search("var C_DATA=('[^']+')", html).group(1)

    scripts.fetch(
        html, url, scripts=[r"\/l\.js", r"common\.js", r"custom\.js", r"manga\.read\.js"]
    )

    # JS payload: browser-environment shims + site scripts + extraction code.
    code = """

const _log = console.log;

// Strip newlines from Function.toString so the site's anti-tamper
// source checks keep matching.
Function.prototype.toString = (function(_toString) {
    return function() {
        return _toString.apply(this, arguments).replace(/\\r?\\n/g, '');
    }
})(Function.prototype.toString);

// Disable timers (anti-devtools loops).
self.setInterval = function() {};

// Log eval'd code for debuggability.
self.eval = function(_eval) {
    return function() {
        _log('eval', arguments[0]);
        return _eval.apply(this, arguments);
    };
}(self.eval);

// Clear globals so the site scripts define their own versions.
self.convertWordArrayToUint8Array =
self.convertUint8ArrayToWordArray =
self.__b_a =
self.__cad =
self.__js =
    undefined;

(function() {

let _cookies = "";

// Recursive Proxy stub standing in for the DOM / jQuery / storage.
// It records cookie writes so __cad.cookie() can read them back.
function noop(path = "") {
    if (path === "document.cookie") return _cookies;
    if (path === "$.inArray") return (v, a) => a.indexOf(v);

    return new Proxy(() => {}, {
        apply: () => noop(`${path}.called`),
        get: (target, prop) => {
            const propPath = typeof prop == "symbol" ? `${path}.${String(prop)}` : `${path}.${prop}`;
            if (propPath == "document.domain") return "www.colamanga.com";
            _log("get", propPath);
            return noop(propPath);
        },
        set: (target, prop, value) => {
            const propPath = `${path}.${prop}`;
            if (propPath == "document.cookie") {
                _cookies += value.split(";")[0] + "; ";
            }
            _log(propPath, value);
            return value;
        }
    });
}

self.window = self;
self.location = {
    protocol: "http://",
    href: '""" + url + """'
}
self.navigator = {
    userAgent: ""
};
self.document = noop("document")
self.$ = noop("$");
self.devtools = noop("devtools");
self.localStorage = noop("localStorage");

self.C_DATA = """ + cdata + "\n" + str(scripts) + """

window.use_domain = {
},
window.lines = {
    [mh_info.pageid]: {
        use_line: mh_info.domain
    }
};
window.chapter_id = mh_info.pageid;

// const imgs = [];
// let dirty = false;
// class Image {
//     set src(val) {
//         imgs.push(val);
//         dirty = true;
//     }
// }
// let i = mh_info.startimg;
// do {
//     dirty = false;
//     __cr.preLoadImg(i++)
// } while (dirty);

__cad.setCookieValue();
// const _tka_pageid = __cad.cookie(__cad.getCookieValue()[0] + mh_info.pageid.toString());
// const _tkb_pageid = __cad.cookie(__cad.getCookieValue()[1] + mh_info.pageid.toString());
const len = __cad.cookie(__cad.getCookieValue()[1] + mh_info.pageid.toString());

const imgs = [];
for (let i = mh_info.startimg; i <= len; ++i) {
    imgs.push(__cr.getPicUrl(i));
}
const keyTypes = image_info.keyType
let key;

if (keyTypes == 0) {
    __cr.isfromMangaRead = 1;
    key = __js.getDataParse();
}

// Exposed so Python can ask the VM to derive a key by calling one of
// the obfuscated key functions by name.
self.getKey = (fn, index) => {
    const hexKey = eval(`${fn}(${index})`);
    return CryptoJS.enc.Utf8.parse(hexKey);
}

return [imgs, key, keyTypes]
}).call(self);
"""

    # A single VM instance is reused for the fallback key lookup below.
    with VM() as vm:
        imgs, key, key_types = vm.run(code)

        if not key:
            # keyType != 0: the key comes from an obfuscated dispatch in
            # manga.read.js: `if(_0x...==<keyType>)return _0x...(<index>)`.
            key_pairs = re.findall(r"if\(_0x[0-9a-f]{6}==(0x[0-9a-f]{4,})\)return _0x[0-9a-f]{6}\((0x[0-9a-f]+)\)", scripts[r"manga\.read\.js"])

            # Find the pair whose first element matches this page's keyType.
            pair = next((t for t in key_pairs if hex(int(key_types)) in t), None)
            if pair is None:
                raise ValueError("Key pair not found!")
            index = pair[1]

            # Name of the obfuscated two-argument key function.
            fn_name = re.search(r"function (_0x[0-9a-f]{4})\(_0x[0-9a-f]{6},_0x[0-9a-f]{6}\)", scripts[r"manga\.read\.js"]).group(1)

            key = vm.call("getKey", fn_name, index)

    # Stash the key (a CryptoJS WordArray) for imagehandler().
    data["key"] = key

    return [urljoin(url, i) for i in imgs]
218+
219+
220+
def imagehandler(ext, bin):
    """Decrypt a downloaded image when necessary.

    colamanga serves ``.webp`` images AES-CBC encrypted; the key is
    recovered by ``get_images()`` and stashed in ``data["key"]`` as a
    CryptoJS WordArray (dict with big-endian signed 32-bit ``words``).
    Any other extension passes through unchanged.

    :param ext: file extension including the dot, e.g. ``".webp"``.
    :param bin: raw image bytes.
    :return: ``(ext, bin)`` with ``bin`` decrypted for webp images.
    :raises ValueError: if no key was recovered for a webp image.
    """
    if ext == ".webp":
        # Use .get() so a missing key reaches the ValueError below;
        # data["key"] would raise an opaque KeyError first (e.g. when
        # imagehandler runs before get_images populated the key).
        key = data.get("key")
        if not key:
            raise ValueError("Key not found!")

        # CryptoJS words are signed 32-bit ints; mask to unsigned so
        # negative words convert to their two's-complement bytes instead
        # of raising OverflowError.
        key_bytes = b"".join((i & 0xFFFFFFFF).to_bytes(4, "big") for i in key["words"])
        cipher = AES.new(key_bytes, mode=AES.MODE_CBC, iv=b"0000000000000000")
        bin = cipher.decrypt(bytes(bin))
    return ext, bin
231+

comiccrawler/mods/oh.py

Lines changed: 1 addition & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -13,7 +13,7 @@
1313

1414
from ..core import Episode, grabhtml
1515

16-
domain = ["www.ohmanhua.com", "www.cocomanhua.com", "www.colamanga.com"]
16+
domain = ["www.ohmanhua.com", "www.cocomanhua.com"]
1717
name = "OH漫畫"
1818

1919
def get_title(html, url):
@@ -94,7 +94,6 @@ def get_images(html, url):
9494
apply: () => noop(`${path}.called`),
9595
get: (target, prop) => {
9696
const propPath = typeof prop == "symbol" ? `${path}.${String(prop)}` : `${path}.${prop}`;
97-
if (propPath == "document.domain") return "www.colamanga.com";
9897
_log("get", propPath);
9998
return noop(propPath);
10099
},

0 commit comments

Comments
 (0)