Skip to content

Commit 07a3b79

Browse files
committed
Better handling for subchapter file headings
A PE reported a bug where, after promoting a sub-chapter file heading from a B- to an A-level heading, the section disappeared. This is a bummer! What was happening was that we were only pulling in the first section of a subchapter file, assuming that there would be only one top-level heading in that file. This sometimes isn't the case! Normally we'd want to enforce that for chapters, i.e., that there is only one top-level section, but in this case we want to allow multiples, since they're getting "demoted" later anyway (or rather: getting included _inside_ the main heading. So, to solve for this, this commit: * Retains the logic for primary chapter files, instead throwing a warning in the log and in the console that there are multiple top-level headings in a chapter file if it's the "main" chapter file. IMO this is best practice, since it'll help enforce ORM style, and accommodating this would be a bigger rewrite (and this is an in-flight book production-needing fix). * Improves subchapter handling logic, now finding the main article and including any non-bibliography top-level sections in the resultant chapter. * Adds tests around all these things (coverage still 100%) I think it's an open question whether or not we want to allow multiple top-level sections in a main file (at least for the purposes of Atlas-only builds), but we'll call that outside the scope for now. Update: Refactor and also handle bibliographies better; this was exposed during the refactor, but essentially we weren't handling bibliography _files_ well, and so I updated the logic to basically say "if we don't have a non-bib section, but we do have a bib section, make that the chapter."
1 parent 3b0bf38 commit 07a3b79

File tree

2 files changed

+129
-23
lines changed

2 files changed

+129
-23
lines changed

jupyter_book_to_htmlbook/file_processing.py

Lines changed: 77 additions & 23 deletions
Original file line numberDiff line numberDiff line change
@@ -135,22 +135,66 @@ def apply_datatype(chapter, ch_name):
135135
return chapter
136136

137137

138+
def get_top_level_sections(soup):
139+
"""
140+
Helper utility to grab top-level sections in main <article>. Returns
141+
all but bibliography sections
142+
"""
143+
section_wrappers = soup.find_all("article", attrs={"role": "main"})
144+
145+
# test case for partial files, not expected in production
146+
if len(section_wrappers) == 0:
147+
sections = soup.find_all('section')
148+
elif len(section_wrappers) != 1:
149+
article = soup.find('article', attrs={"role": "main"})
150+
try:
151+
main_title = article.find('h1').get_text()
152+
except AttributeError:
153+
main_title = soup.find("h1")
154+
print("Warning: " +
155+
f"The chapter with title '{main_title}' is malformed.")
156+
return None, None
157+
else:
158+
main = section_wrappers[0]
159+
sections = []
160+
161+
for element in main.children:
162+
if (
163+
element.name == "section" and
164+
element.get('id') != "bibliography"
165+
):
166+
sections.append(element)
167+
168+
return sections
169+
170+
138171
def get_main_section(soup):
139172
"""
140173
Gets the main "section," or the main chapter text, and additionally
141174
checks to see if there is a separate bibliography section, returning
142175
that if it exists to be dealt with later.
143176
"""
144-
sections = soup.find_all('section')
177+
sections = get_top_level_sections(soup)
178+
145179
try:
146180
main = sections[0]
147-
except IndexError: # does not have a section class for top-level
148-
logging.warning("Looks like {toc_element.name} is malformed.")
149-
return None, None
181+
except IndexError:
182+
main = None
183+
150184
if len(sections) > 1:
151-
bibliography = soup.find('section', id="bibliography")
152-
else:
153-
bibliography = None
185+
article = soup.find('article', attrs={"role": "main"})
186+
try:
187+
main_title = article.find('h1').get_text()
188+
except AttributeError:
189+
main_title = soup.find("h1")
190+
err_msg = f"The chapter with title '{main_title}' " + \
191+
"has extra <section>s " + \
192+
"that will not be processed. Please check the " + \
193+
"notebook source files."
194+
logging.warning(err_msg)
195+
print(err_msg)
196+
bibliography = soup.find('section', id="bibliography")
197+
154198
return main, bibliography
155199

156200

@@ -172,11 +216,14 @@ def process_chapter_soup(
172216

173217
# perform initial swapping and namespace designation
174218
chapter, bib = get_main_section(base_soup)
219+
if bib and not chapter: # bibs can be their own chapters
220+
chapter = bib
221+
bib = None
175222

176223
if not chapter: # guard against malformed files
177224
logging.warning(f"Failed to process {toc_element}.")
178225
raise RuntimeError(
179-
f"Failed to process {toc_element}. Please check for error in " +
226+
f"Failed to process {toc_element}. Please check for errors in " +
180227
"your source file(s). Contact the Tools team for additional " +
181228
"support.")
182229

@@ -189,8 +236,10 @@ def process_chapter_soup(
189236

190237
if chapter_parts:
191238
for subfile in chapter_parts:
192-
subsection, sub_bib = process_chapter_subparts(subfile)
193-
chapter.append(subsection)
239+
subsections, sub_bib = process_chapter_subparts(subfile)
240+
if subsections:
241+
for subsection in subsections:
242+
chapter.append(subsection)
194243
if bib and sub_bib:
195244
entries = sub_bib.find_all("dd") # type: ignore
196245
bib.dl.extend(entries) # type: ignore
@@ -211,19 +260,24 @@ def process_chapter_subparts(subfile):
211260
""" processing for chapters with "sections" """
212261
with open(subfile, 'r') as f:
213262
soup = BeautifulSoup(f, 'lxml')
214-
section, bib = get_main_section(soup)
215-
section['data-type'] = 'sect1' # type: ignore
216-
del section['class'] # type: ignore
217-
# move id from empty span to section
218-
try:
219-
section['id'] = section.select_one('span')['id'] # type: ignore
220-
except TypeError:
221-
# fun fact, this happens when there's not numbering on the toc
222-
pass # like before, if it's not there that's OK.
223-
except KeyError:
224-
# fun fact, this happens when there is numbering on the toc
225-
pass # like before, if it's not there that's OK.
226-
return section, bib
263+
top_level_sections = get_top_level_sections(soup)
264+
265+
for section in top_level_sections:
266+
section['data-type'] = 'sect1' # type: ignore
267+
del section['class'] # type: ignore
268+
# move id from empty span to section
269+
try:
270+
section['id'] = section.select_one( # type: ignore
271+
'span')['id']
272+
except TypeError:
273+
# this happens when there's not numbering on the toc
274+
pass # like before, if it's not there that's OK.
275+
except KeyError:
276+
# fun fact, this happens when there is numbering on the toc
277+
pass # like before, if it's not there that's OK.
278+
bibliography = soup.find('section', id="bibliography")
279+
280+
return top_level_sections, bibliography
227281

228282

229283
def process_chapter(toc_element,

tests/test_file_processing.py

Lines changed: 52 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -29,6 +29,25 @@ def test_compile_chapter_parts_happy_path(self, tmp_book_path):
2929
number_of_sections_expected = 2 # the first html file doesn't get one
3030
assert number_of_sections == number_of_sections_expected
3131

32+
def test_compile_chapter_parts_parts_with_many_h1s(self, tmp_book_path):
33+
"""
34+
We should ensure that subsequent A-level headings inside subchapter
35+
files aren't getting dropped from the book
36+
"""
37+
result = process_chapter_soup([
38+
tmp_book_path / 'notebooks/ch02.00.html',
39+
tmp_book_path / 'notebooks/ch02.01.html',
40+
tmp_book_path / 'notebooks/ch02.02.html',
41+
tmp_book_path / 'notebooks/many_a_levels.html',
42+
])[0]
43+
# the resulting section should have a data-type of "chapter"
44+
assert result["data-type"] == "chapter"
45+
# number of level-1 subsections should be one less than the group
46+
number_of_sections = len(
47+
result.find_all(attrs={"data-type": "sect1"}))
48+
number_of_sections_expected = 4 # the first html file doesn't get one
49+
assert number_of_sections == number_of_sections_expected
50+
3251
def test_process_chapter_single_chapter_file(self, tmp_book_path):
3352
"""
3453
happy path for chapter processing a single chapter file
@@ -45,6 +64,39 @@ def test_process_chapter_single_chapter_file(self, tmp_book_path):
4564
# check on return
4665
assert "ch01.html" in result
4766

67+
def test_process_chapter_single_file_with_multiple_h1s(self,
68+
tmp_book_path,
69+
caplog,
70+
capsys):
71+
"""
72+
Edge case in which a single chapter has multiple top-level sections,
73+
(but the subsequent one is not a bibliography); we want to ensure that
74+
the error is logged as well as printed
75+
"""
76+
test_env = tmp_book_path / 'notebooks'
77+
test_out = test_env / 'output'
78+
test_out.mkdir()
79+
caplog.set_level(logging.DEBUG)
80+
81+
process_chapter((test_env / 'many_a_levels.html'),
82+
test_env, test_out)
83+
log = caplog.text
84+
assert "will not be processed" in capsys.readouterr().out
85+
assert "will not be processed" in log
86+
87+
def test_process_chapter_single_file_bibliogrpahy(self,
88+
tmp_book_path):
89+
"""
90+
Bibliography files should act like normal chapters
91+
"""
92+
test_env = tmp_book_path
93+
test_out = test_env / 'output'
94+
test_out.mkdir()
95+
96+
result = process_chapter((test_env / 'bibliography.html'),
97+
test_env, test_out)
98+
assert "bibliography.html" in result
99+
48100
def test_chapter_promote_headings(self, tmp_path):
49101
"""
50102
we expect to have a single h1 and then a bunch of h2s

0 commit comments

Comments
 (0)