Skip to content

Commit dafd6eb

Browse files
committed
change: Update EPUB parsing method to use lxml instead of BeautifulSoup
1 parent cdf71ea commit dafd6eb

File tree

3 files changed

+38
-30
lines changed

3 files changed

+38
-30
lines changed

README.md

Lines changed: 10 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -1,12 +1,18 @@
11
# manga2pdf
2+
[![License: MIT](https://img.shields.io/pypi/l/manga2pdf)](https://opensource.org/licenses/MIT)
3+
[![PyPI - Python Version](https://img.shields.io/pypi/pyversions/manga2pdf)](https://pypi.org/project/manga2pdf)
4+
[![GitHub Release](https://img.shields.io/github/release/mashu3/manga2pdf?color=orange)](https://github.com/mashu3/manga2pdf/releases)
5+
[![PyPi Version](https://img.shields.io/pypi/v/manga2pdf?color=yellow)](https://pypi.org/project/manga2pdf/)
6+
[![Downloads](https://static.pepy.tech/badge/manga2pdf)](https://pepy.tech/project/manga2pdf)
7+
28
## Overview
39
This Python script is specifically designed to convert manga and comic files, including various formats such as zip, epub, and directories containing image files, to PDF format.
410

511
The resulting PDF files are optimized to resemble Japanese manga in terms of page layout and direction. By default, the script uses a "TwoPageRight" page layout that displays two pages side-by-side for a spread view, and a "R2L" (right-to-left) reading direction that is commonly used in Japanese manga.
612

713
## Requirement
814
The script uses the Python libraries **[img2pdf](https://pypi.org/project/img2pdf/)** and **[pikepdf](https://pypi.org/project/pikepdf/)** to do the conversion.
9-
Moreover, it uses **[BeautifulSoup](https://pypi.org/project/beautifulsoup4/)** to read the EPUB files and **[rarfile](https://pypi.org/project/rarfile/)** to read the RAR archive files.
15+
Moreover, it uses **[lxml](https://pypi.org/project/lxml/)** to read the EPUB files and **[rarfile](https://pypi.org/project/rarfile/)** to read the RAR archive files.
1016

1117
It requires the installation of these packages in order to work properly.
1218

@@ -91,4 +97,6 @@ The interface supports English and Japanese, and all settings that can be specif
9197
The GUI is currently under development and additional features are planned for future updates.
9298

9399
## Author
94-
[mashu3](https://github.com/mashu3)
100+
[mashu3](https://github.com/mashu3)
101+
102+
[![Authors](https://contrib.rocks/image?repo=mashu3/manga2pdf)](https://github.com/mashu3/manga2pdf/graphs/contributors)

setup.py

Lines changed: 3 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -1,14 +1,14 @@
11
from setuptools import setup, find_packages
22

3-
VERSION = "0.1.4"
3+
VERSION = "0.2.0"
44

55
INSTALL_REQUIRES = (
6+
"lxml",
67
"numpy",
78
"img2pdf",
89
"Pillow",
910
"pikepdf",
10-
"rarfile",
11-
"beautifulsoup4"
11+
"rarfile"
1212
)
1313
CLASSIFIERS=[
1414
'License :: OSI Approved :: MIT License',

src/manga2pdf.py

Lines changed: 25 additions & 25 deletions
Original file line numberDiff line numberDiff line change
@@ -15,12 +15,10 @@
1515
import warnings
1616
import numpy as np
1717
from PIL import Image
18+
from lxml import etree
1819
import concurrent.futures
19-
from bs4 import BeautifulSoup
20-
2120
warnings.filterwarnings('ignore', category=UserWarning)
22-
from bs4 import XMLParsedAsHTMLWarning
23-
warnings.filterwarnings('ignore', category=XMLParsedAsHTMLWarning)
21+
2422

2523
class MangaPdfConverter():
2624
def __init__(self, input_path: str, output_path: str, pagelayout:str, pagemode:str, direction:str):
@@ -176,44 +174,45 @@ def extract_epub_contents(self, epub):
176174
opf_name = opf_names[0]
177175
page_names = []
178176
with epub.open(opf_name) as opf:
179-
content_opf = opf.read().decode()
180-
opf_soup = BeautifulSoup(content_opf, 'xml')
181-
for item in opf_soup.find_all('item', {'media-type': 'image/jpeg'}):
177+
opf_content = opf.read()
178+
opf_tree = etree.fromstring(opf_content)
179+
namespace = {'dc': 'http://purl.org/dc/elements/1.1/', 'opf': 'http://www.idpf.org/2007/opf'}
180+
manifest = opf_tree.find('opf:manifest', namespaces=namespace)
181+
for item in manifest.findall('opf:item[@media-type="image/jpeg"]', namespaces=namespace):
182182
page_names.append(os.path.join(os.path.dirname(opf_name), item.get('href').replace('/', os.sep)).replace(os.sep, '/'))
183-
for item in opf_soup.find_all('item', {'media-type': 'image/png'}):
183+
for item in manifest.findall('opf:item[@media-type="image/png"]', namespaces=namespace):
184184
page_names.append(os.path.join(os.path.dirname(opf_name), item.get('href').replace('/', os.sep)).replace(os.sep, '/'))
185185
page_items = []
186186
for page_name in page_names:
187187
page_items.append(epub.open(page_name))
188188
return page_names, page_items, ncx_name, opf_name
189-
189+
190190
# Function to extract the index of an EPUB file
191191
def extract_epub_index(self, epub, page_names, ncx_name: str):
192192
page_index = []
193193
with epub.open(ncx_name) as ncx_file:
194194
ncx_content = ncx_file.read()
195-
ncx_soup = BeautifulSoup(ncx_content, 'lxml')
196-
navpoints = ncx_soup.find_all('navpoint')
195+
ncx_tree = etree.fromstring(ncx_content)
196+
namespace = {'ncx': 'http://www.daisy.org/z3986/2005/ncx/', 'html': 'http://www.w3.org/1999/xhtml', 'svg':'http://www.w3.org/2000/svg'}
197+
navmap = ncx_tree.find('ncx:navMap', namespaces=namespace)
198+
navpoints = navmap.findall('ncx:navPoint', namespaces=namespace)
197199
ncx_path = os.path.dirname(ncx_name) + '/'
198200
for navpoint in navpoints:
199-
nav_label = navpoint.navlabel.text.strip()
200-
nav_text = navpoint.content['src']
201+
nav_label = navpoint.find('ncx:navLabel/ncx:text', namespaces=namespace).text.strip()
202+
nav_text = navpoint.find('ncx:content', namespaces=namespace).get('src')
201203
if nav_text.startswith(ncx_path):
202204
nav_text = nav_text[len(ncx_path):]
203205
else:
204206
nav_text = os.path.join(ncx_path, nav_text)
205207
if nav_text.endswith('.xhtml'):
206208
with epub.open(nav_text) as xhtml_file:
207209
xhtml_content = xhtml_file.read()
208-
xhtml_soup = BeautifulSoup(xhtml_content, 'html.parser')
209-
img_tags = xhtml_soup.find_all('image')
210+
xhtml_tree = etree.fromstring(xhtml_content)
211+
img_tags = xhtml_tree.findall('.//svg:image', namespaces=namespace)
210212
if len(img_tags) == 0:
211-
img_tags = xhtml_soup.find_all('img')
213+
img_tags = img_tags = xhtml_tree.findall('.//svg:img', namespaces=namespace)
212214
for img_tag in img_tags:
213-
try:
214-
img_link = img_tag['xlink:href']
215-
except KeyError:
216-
img_link = img_tag['src']
215+
img_link = img_tag.get('{http://www.w3.org/1999/xlink}href', img_tag.get('src'))
217216
image_href = os.path.abspath(os.path.join(os.path.dirname(nav_text), img_link))
218217
image_href = os.path.relpath(image_href, os.getcwd()).replace(os.sep, '/')
219218
index_number = page_names.index(image_href)
@@ -227,21 +226,22 @@ def extract_epub_index(self, epub, page_names, ncx_name: str):
227226
def extract_epub_metadata(self, epub, opf_name: str):
    """Extract Dublin Core metadata from the EPUB's OPF package document.

    Parameters
    ----------
    epub : zipfile.ZipFile
        The open EPUB archive.
    opf_name : str
        Archive path of the content.opf package file.

    Returns
    -------
    dict
        Keys ``title``, ``creator``, ``publisher``, ``date`` and
        ``language``. ``creator`` maps to a list of author names (an EPUB
        may list several); every other key maps to the first element's
        text. Any key with no matching element maps to ``None``.
    """
    namespace = {'dc': 'http://purl.org/dc/elements/1.1/', 'opf': 'http://www.idpf.org/2007/opf'}
    with epub.open(opf_name) as opf_file:
        opf_tree = etree.fromstring(opf_file.read())
    metadata = opf_tree.find('opf:metadata', namespaces=namespace)
    epub_metadata = {}
    for key in ['title', 'creator', 'publisher', 'date', 'language']:
        # findall (not lxml-only .xpath) keeps this consistent with the
        # other OPF lookups in this file and with stdlib ElementTree.
        values = metadata.findall('dc:' + key, namespace)
        if key == 'creator':
            # Bug fix: the original only assigned 'creator' when at least
            # one dc:creator element existed, leaving the key absent
            # (KeyError for callers) unlike every other key. Now it is
            # always present, None when missing.
            epub_metadata[key] = [value.text for value in values] if values else None
        else:
            epub_metadata[key] = values[0].text if values else None
    return epub_metadata
244-
244+
245245
# Function to convert input files to a PDF file
246246
def convert(self):
247247
if self.is_epub_file(self.input_path):

0 commit comments

Comments (0)