Skip to content

Commit 05ae5eb

Browse files
committed
fix: fix length check and image order
feat: improve efficiency of restore_toc_list
1 parent ecb54f6 commit 05ae5eb

File tree

2 files changed

+23
-45
lines changed

2 files changed

+23
-45
lines changed

src/stas_ln_translator/process.py

Lines changed: 6 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -30,11 +30,14 @@ def preprocess_document(soup: BeautifulSoup) -> BeautifulSoup:
3030
# Find all <p> tags
3131
for p_tag in soup.find_all("p"):
3232
# Remove <img> wrapped with <p> and add after <p>
33-
if len(list(p_tag.children)) == 1 and list(p_tag.children)[0].name == "img":
33+
# More robustly check if the paragraph only contains an image, allowing for whitespace.
34+
child_tags = p_tag.find_all(True, recursive=False)
35+
if not p_tag.get_text(strip=True) and len(child_tags) == 1 and child_tags[0].name == 'img':
3436
p_tag.unwrap()
3537
continue
36-
37-
for img in p_tag.find_all("img"):
38+
39+
# Iterate in reverse to maintain the original order of multiple images when moving them after the paragraph.
40+
for img in reversed(p_tag.find_all("img")):
3841
p_tag.insert_after(img.extract())
3942

4043
# Remove <span> tags but preserve their content

src/stas_ln_translator/utils.py

Lines changed: 17 additions & 42 deletions
Original file line numberDiff line numberDiff line change
@@ -160,53 +160,28 @@ def restore_toc_list(
160160
Returns:
161161
TOCList: The restored table of contents (TOC) list.
162162
"""
163-
restored_list: TOCList = []
164-
top_list = [i for i in zip(flattened_list, index_list) if i[1].count("-") == 0]
163+
assert len(flattened_list) == len(index_list)
165164

166-
for item, index in top_list:
167-
if isinstance(item, epub.Link):
168-
restored_list.append(item)
169-
elif isinstance(item, epub.Section):
170-
restored_list.append(
171-
(
172-
item,
173-
_get_children_toc_list(
174-
index,
175-
[
176-
i
177-
for i in zip(flattened_list, index_list)
178-
if i[1].startswith(index)
179-
],
180-
),
181-
)
182-
)
183-
return restored_list
165+
items_by_index = dict(zip(index_list, flattened_list))
166+
children_by_parent_index: dict[str, list[str]] = {}
184167

168+
for index in index_list:
169+
parent_index = "root"
170+
if "-" in index:
171+
parent_index = index.rpartition("-")[0]
185172

186-
def _get_children_toc_list(
187-
parent_index: str,
188-
current_list: list[tuple[epub.Link | epub.Section, str]],
189-
) -> TOCList:
190-
"""The recursive function to recursively get children TOC list.
191-
192-
Args:
193-
parent_index (str): The parent index of the current list.
194-
current_list (list[tuple[epub.Link | epub.Section, str]]): The current list of items to process.
173+
children_by_parent_index.setdefault(parent_index, []).append(index)
195174

196-
Returns:
197-
TOCList: The restored table of contents (TOC) list.
198-
"""
199-
children: TOCList = []
175+
def build_tree(parent_index: str) -> TOCList:
176+
restored_list: TOCList = []
177+
child_indices = children_by_parent_index.get(parent_index, [])
200178

201-
for i, (item, index) in enumerate(current_list):
202-
if index.startswith(parent_index) and (
203-
index.count("-") == parent_index.count("-") + 1
204-
):
179+
for child_index in child_indices:
180+
item = items_by_index[child_index]
205181
if isinstance(item, epub.Link):
206-
children.append(item)
182+
restored_list.append(item)
207183
elif isinstance(item, epub.Section):
208-
# Recursively find children for this section
209-
section_children = _get_children_toc_list(index, current_list[i + 1 :])
210-
children.append((item, section_children))
184+
restored_list.append((item, build_tree(child_index)))
185+
return restored_list
211186

212-
return children
187+
return build_tree("root")

0 commit comments

Comments
 (0)