ebookRen/runtime_metadata.py at master · Piotr-Grechuta/ebookRen · GitHub

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
from __future__ import annotations

import os
import re
import tempfile
import xml.etree.ElementTree as ET
import zipfile
from pathlib import Path
from typing import Callable

# XML namespace URIs used when reading and rewriting EPUB package data.
# OCF container namespace: qualifies the <rootfile> lookup in META-INF/container.xml.
CONTAINER_NS = "urn:oasis:names:tc:opendocument:xmlns:container"
# OPF package namespace: used for the <metadata>/<meta> elements and opf:* attributes.
OPF_NS = "http://www.idpf.org/2007/opf"
# Dublin Core namespace: used for the dc:title, dc:creator and dc:subject elements.
DC_NS = "http://purl.org/dc/elements/1.1/"

# Register the prefixes ElementTree should use when serializing these namespaces.
# The ElementTree prefix registry is global, so these calls run once at import
# time and apply to every serialization done in the process.
# NOTE(review): register_namespace() removes any existing mapping for the same
# URI, so the "opf" registration below supersedes the "" (default namespace)
# one made for OPF_NS two lines earlier -- confirm which prefix is intended.
ET.register_namespace("", OPF_NS)
ET.register_namespace("dc", DC_NS)
ET.register_namespace("opf", OPF_NS)


def _xml_tag(namespace: str, local_name: str) -> str:
    """Build the ElementTree qualified name ``{namespace}local_name``."""
    return "{{{}}}{}".format(namespace, local_name)


def _local_name(tag: str) -> str:
    """Return *tag* without its leading ``{namespace}`` qualifier.

    A tag that does not start with ``{`` is returned unchanged. Only the first
    ``}`` terminates the qualifier; everything after it is the local name.
    """
    if not tag.startswith("{"):
        return tag
    pieces = tag.split("}", 1)
    return pieces[1]


def _find_metadata_element(root: ET.Element) -> ET.Element:
    """Return the ``metadata`` child of *root*, creating it when absent.

    Only direct children are inspected and they are matched on their local
    name, so the namespace of an existing element does not matter. When no
    child matches, a new OPF-namespaced ``metadata`` element is inserted as
    the first child of *root* and returned.
    """
    existing = next(
        (node for node in root if _local_name(node.tag) == "metadata"),
        None,
    )
    # Compare against None explicitly: a childless Element is falsy.
    if existing is not None:
        return existing
    created = ET.Element(_xml_tag(OPF_NS, "metadata"))
    root.insert(0, created)
    return created


def _resolve_epub_package_path(path: Path) -> str:
    """Return the archive-internal path of the package document of an EPUB.

    The location is read from the ``full-path`` attribute of the first
    ``rootfile`` element found in ``META-INF/container.xml``.

    Raises:
        ValueError: if the container has no ``rootfile`` element, or its
            ``full-path`` attribute is missing or blank.
    """
    with zipfile.ZipFile(path, "r") as epub:
        raw_container = epub.read("META-INF/container.xml")

    rootfile_query = ".//{" + CONTAINER_NS + "}rootfile"
    rootfile_node = ET.fromstring(raw_container).find(rootfile_query)
    if rootfile_node is None:
        raise ValueError("Brak wpisu rootfile w META-INF/container.xml")

    declared = rootfile_node.get("full-path")
    location = declared.strip() if declared else ""
    if location:
        return location
    raise ValueError("Brak atrybutu full-path w rootfile EPUB")


def _rewrite_epub_entry(path: Path, *, entry_name: str, payload: bytes) -> None:
    """Replace the content of one archive member of the ZIP file at *path*.

    The archive is copied member by member into a temporary file created next
    to *path*; every member named *entry_name* receives *payload*, all other
    members keep their data. Member order, compression type and the per-entry
    header fields copied below are carried over, as is the archive comment.
    The temporary file then replaces *path* via ``os.replace``.

    Args:
        path: Archive to rewrite in place.
        entry_name: Name of the member whose data is replaced.
        payload: New raw bytes for that member.

    Raises:
        Any error raised while reading, writing or replacing the archive is
        propagated after the temporary file has been removed.
    """
    temp_path: Path | None = None
    try:
        with tempfile.NamedTemporaryFile(delete=False, suffix=path.suffix, dir=path.parent) as handle:
            temp_path = Path(handle.name)
        with zipfile.ZipFile(path, "r") as source_archive, zipfile.ZipFile(temp_path, "w") as target_archive:
            target_archive.comment = source_archive.comment
            for info in source_archive.infolist():
                # Read through the ZipInfo itself so that members sharing a
                # name each keep their own data instead of the last one's.
                data = payload if info.filename == entry_name else source_archive.read(info)
                cloned = zipfile.ZipInfo(info.filename, date_time=info.date_time)
                cloned.compress_type = info.compress_type
                cloned.comment = info.comment
                cloned.extra = info.extra
                cloned.create_system = info.create_system
                cloned.create_version = info.create_version
                cloned.extract_version = info.extract_version
                cloned.flag_bits = info.flag_bits
                cloned.volume = info.volume
                cloned.internal_attr = info.internal_attr
                cloned.external_attr = info.external_attr
                target_archive.writestr(cloned, data, compress_type=info.compress_type)
        # The temporary file is created with owner-only permissions; without
        # this step the rewritten book would silently lose its original mode.
        try:
            temp_path.chmod(path.stat().st_mode & 0o7777)
        except OSError:
            # Best effort only: a filesystem that rejects chmod must not stop
            # a rewrite that is otherwise complete.
            pass
        os.replace(temp_path, path)
    except BaseException:
        # BaseException so the temporary file is also removed on
        # KeyboardInterrupt / SystemExit, not only on ordinary errors.
        if temp_path is not None and temp_path.exists():
            temp_path.unlink()
        raise


def _remove_metadata_nodes(metadata: ET.Element, predicate: Callable[[ET.Element], bool]) -> None:
    """Detach every direct child of *metadata* for which *predicate* is true.

    The children are snapshotted first so that removing nodes does not
    disturb the iteration.
    """
    snapshot = tuple(metadata)
    for doomed in filter(predicate, snapshot):
        metadata.remove(doomed)


def _append_dc_text(metadata: ET.Element, local_name: str, text: str) -> ET.Element:
    """Append a Dublin Core element holding *text* to *metadata*.

    The new element is named *local_name* in the DC namespace, is added as
    the last child of *metadata*, and is returned so callers can set
    attributes on it.
    """
    node = ET.Element(_xml_tag(DC_NS, local_name))
    node.text = text
    metadata.append(node)
    return node


def _format_series_index(volume: tuple[int, str] | None) -> str:
    """Render a ``(major, minor)`` volume pair as ``"major.minor"``.

    The minor part is left-padded with zeros to at least two characters.
    An empty string is returned when *volume* is ``None`` or when it is the
    all-zero volume (major ``0`` with a minor part that pads to ``"00"``).
    """
    if volume is None:
        return ""
    whole, fraction = volume
    padded = str(fraction).zfill(2)
    is_unset = whole == 0 and padded == "00"
    return "" if is_unset else "{}.{}".format(whole, padded)


def write_epub_metadata(
    path: Path,
    *,
    title: str,
    creators: list[str],
    creator_sort_keys: list[str] | None = None,
    series: str,
    volume: tuple[int, str] | None,
    genre: str,
    extra_subjects: list[str] | None = None,
    clean: Callable[[str | None], str],
    clean_series: Callable[[str | None], str],
    normalize_match_text: Callable[[str | None], str],
) -> None:
    """Rewrite title, creator, subject and series metadata of an EPUB in place.

    Paths whose suffix is not ``.epub`` (case-insensitive) are ignored.
    Otherwise the package document is parsed, the existing ``dc:title`` and
    ``dc:creator`` elements and the series-related ``meta`` elements are
    removed, fresh ones are appended, and the package document is written
    back into the archive.

    Args:
        path: EPUB file to update.
        title: New title; ``"Bez tytulu"`` is used when it cleans to empty.
        creators: Author names; entries that clean to empty are skipped and
            ``"Nieznany Autor"`` is used when none remain.
        creator_sort_keys: Optional ``file-as`` values, parallel to
            *creators* by position. A missing or empty key falls back to the
            creator name.
        series: Series name; no series metadata is written when it cleans to
            empty or matches ``"Standalone"`` after ``normalize_match_text``.
        volume: ``(major, minor)`` position in the series, or ``None``.
        genre: Subject added unless an equivalent subject already exists.
        extra_subjects: Further subjects, deduplicated the same way.
        clean: Normalizer applied to title, creators, sort keys and subjects.
        clean_series: Normalizer applied to the series name.
        normalize_match_text: Produces the comparison key used to deduplicate
            subjects and to recognise the ``"Standalone"`` marker.

    Raises:
        ValueError: propagated from the package-path lookup when the EPUB
            container does not declare a package document.
        Other errors from reading the archive or parsing its XML propagate
        unchanged.
    """
    if path.suffix.lower() != ".epub":
        return

    package_path = _resolve_epub_package_path(path)
    with zipfile.ZipFile(path, "r") as archive:
        package_xml = archive.read(package_path)

    package_root = ET.fromstring(package_xml)
    metadata = _find_metadata_element(package_root)

    normalized_title = clean(title) or "Bez tytulu"
    cleaned_creators = [clean(item) for item in creators]
    cleaned_sort_keys = [clean(item) for item in (creator_sort_keys or [])]

    # Pair each creator with the sort key at the SAME original position before
    # dropping blank creators; filtering first would shift every later sort
    # key onto the wrong author.
    creator_entries: list[tuple[str, str]] = []
    for index, creator in enumerate(cleaned_creators):
        if not creator:
            continue
        sort_key = cleaned_sort_keys[index] if index < len(cleaned_sort_keys) else ""
        creator_entries.append((creator, sort_key or creator))
    if not creator_entries:
        # No usable creator: keep the placeholder author, still honouring a
        # leading sort key if the caller supplied one.
        fallback_key = cleaned_sort_keys[0] if cleaned_sort_keys else ""
        creator_entries.append(("Nieznany Autor", fallback_key or "Nieznany Autor"))

    normalized_series = clean_series(series)
    normalized_genre = clean(genre)
    normalized_extra_subjects = [subject for subject in map(clean, extra_subjects or []) if subject]
    series_index = _format_series_index(volume)

    title_tag = _xml_tag(DC_NS, "title")
    creator_tag = _xml_tag(DC_NS, "creator")
    subject_tag = _xml_tag(DC_NS, "subject")
    meta_tag = _xml_tag(OPF_NS, "meta")
    series_meta_names = {"calibre:series", "calibre:series_index"}
    series_meta_properties = {"belongs-to-collection", "collection-type", "group-position"}

    def is_series_meta(child: ET.Element) -> bool:
        """Tell whether *child* is a calibre or collection series ``meta``."""
        if _local_name(child.tag) != "meta":
            return False
        name = (child.get("name") or "").strip().lower()
        prop = (child.get("property") or "").strip().lower()
        return name in series_meta_names or prop in series_meta_properties

    # Drop everything this function is about to regenerate.
    _remove_metadata_nodes(metadata, lambda child: child.tag in (title_tag, creator_tag))
    _remove_metadata_nodes(metadata, is_series_meta)

    _append_dc_text(metadata, "title", normalized_title)
    for creator, sort_key in creator_entries:
        creator_element = _append_dc_text(metadata, "creator", creator)
        creator_element.set(_xml_tag(OPF_NS, "role"), "aut")
        creator_element.set(_xml_tag(OPF_NS, "file-as"), sort_key)

    # Existing subjects are kept; new ones are added only when no subject with
    # the same normalized key is present yet.
    subject_keys = set()
    for child in metadata:
        if child.tag != subject_tag:
            continue
        existing_key = normalize_match_text(child.text or "")
        if existing_key:
            subject_keys.add(existing_key)
    for subject in [normalized_genre, *normalized_extra_subjects]:
        subject_key = normalize_match_text(subject)
        if not subject or not subject_key or subject_key in subject_keys:
            continue
        subject_keys.add(subject_key)
        _append_dc_text(metadata, "subject", subject)

    if normalized_series and normalize_match_text(normalized_series) != normalize_match_text("Standalone"):
        # calibre-style name/content pair.
        calibre_series = ET.SubElement(metadata, meta_tag)
        calibre_series.set("name", "calibre:series")
        calibre_series.set("content", normalized_series)
        if series_index:
            calibre_series_index = ET.SubElement(metadata, meta_tag)
            calibre_series_index.set("name", "calibre:series_index")
            calibre_series_index.set("content", series_index)

        # property-based collection entry plus the metas that refine it.
        collection_meta = ET.SubElement(metadata, meta_tag)
        collection_meta.set("property", "belongs-to-collection")
        collection_meta.set("id", "series-collection")
        collection_meta.text = normalized_series

        collection_type = ET.SubElement(metadata, meta_tag)
        collection_type.set("refines", "#series-collection")
        collection_type.set("property", "collection-type")
        collection_type.text = "series"

        if series_index:
            group_position = ET.SubElement(metadata, meta_tag)
            group_position.set("refines", "#series-collection")
            group_position.set("property", "group-position")
            group_position.text = series_index

    updated_package_xml = ET.tostring(package_root, encoding="utf-8", xml_declaration=True)
    _rewrite_epub_entry(path, entry_name=package_path, payload=updated_package_xml)


def read_book_metadata(
    path: Path,
    *,
    metadata_type,
    strip_source_artifacts: Callable[[str | None], str],
    clean: Callable[[str | None], str],
    clean_series: Callable[[str | None], str],
    parse_volume_parts: Callable[[str | None], tuple[int, str] | None],
    epub_module,
):
    """Build a metadata record for *path* from its file name and EPUB content.

    The file stem is split on ``--`` separators into segments; each segment
    is passed through *strip_source_artifacts* and blank results are dropped.
    The record is created as
    ``metadata_type(path=..., stem=..., segments=..., core=...)`` where
    ``core`` is the first segment (or the whole stem when no segment is left).

    For ``.epub`` files the title, creators, identifiers, subjects, series
    and volume are additionally read through *epub_module*. Failures while
    importing or reading are not raised; they are appended to ``meta.errors``
    as ``"epub-read: <error>"`` and the record built so far is returned.

    Args:
        path: Book file to describe.
        metadata_type: Factory for the record; the result must expose the
            attributes assigned below (``title``, ``creators``,
            ``identifiers``, ``subjects``, ``meta_series``, ``meta_volume``,
            ``errors``).
        strip_source_artifacts: Cleaner applied to each file-name segment.
        clean: Normalizer for title, creators, identifiers and subjects.
        clean_series: Normalizer for the series name.
        parse_volume_parts: Parser turning a series index into a volume pair.
        epub_module: Object providing ``read_epub``; when ``None``,
            ``ebooklib.epub`` is imported on demand.

    Returns:
        The record created by *metadata_type*.
    """
    stem = path.stem
    stripped_parts = (strip_source_artifacts(part) for part in re.split(r"\s*--\s*", stem))
    segments = [part for part in stripped_parts if part]
    core = segments[0] if segments else stem
    meta = metadata_type(path=path, stem=stem, segments=segments, core=core)

    if path.suffix.lower() != ".epub":
        return meta

    if epub_module is None:
        try:
            from ebooklib import epub as epub_module  # type: ignore[import-not-found]
        except Exception as exc:
            meta.errors.append(f"epub-read: {exc}")
            return meta

    try:
        book = epub_module.read_epub(str(path), options={"ignore_ncx": True})

        def cleaned_dc_values(name: str) -> list[str]:
            """Return the non-empty cleaned Dublin Core values for *name*."""
            cleaned = (clean(item[0]) for item in book.get_metadata("DC", name))
            return [value for value in cleaned if value]

        titles = book.get_metadata("DC", "title")
        meta.title = clean(titles[0][0]) if titles else ""
        meta.creators = cleaned_dc_values("creator")
        meta.identifiers = cleaned_dc_values("identifier")
        meta.subjects = cleaned_dc_values("subject")

        # NOTE(review): the lookups below assume get_metadata() exposes series
        # data under these ("OPF", key) pairs with the value in the first
        # tuple slot -- confirm against the epub_module actually in use.
        for key in ("calibre:series", "series", "belongs-to-collection"):
            values = book.get_metadata("OPF", key)
            if values and not meta.meta_series:
                meta.meta_series = clean_series(values[0][0])
                if meta.meta_series:
                    # First usable series wins; later keys cannot change it.
                    break

        for key in ("calibre:series_index", "series_index", "group-position"):
            values = book.get_metadata("OPF", key)
            if values and meta.meta_volume is None:
                meta.meta_volume = parse_volume_parts(values[0][0])
                if meta.meta_volume is not None:
                    # First parseable volume wins; later keys cannot change it.
                    break
    except Exception as exc:
        # Deliberately broad: any reader failure is recorded on the record
        # instead of aborting the caller.
        meta.errors.append(f"epub-read: {exc}")

    return meta