Better handling for subchapter file headings

delfanbaum · delfanbaum · commit 07a3b79424be · 2023-07-14T12:04:34.000-04:00
A PE reported a bug where, after promoting a sub-chapter file heading
from a B- to an A-level heading, the section disappeared. This is a
bummer! What was happening was that we were only pulling in the first
section of a subchapter file, assuming that there would be only one
top-level heading in that file. This sometimes isn't the case! Normally
we'd want to enforce that for chapters, i.e., that there is only one
top-level section, but in this case we want to allow multiples, since
they're getting "demoted" later anyway (or rather: getting included
_inside_ the main heading).

So, to solve for this, this commit:

* Retains the logic for primary chapter files, but now throws a
  warning in the log and in the console when the "main" chapter file
  contains multiple top-level headings. IMO this is best practice,
  since it'll help enforce ORM style, and accommodating multiple
  headings would be a bigger rewrite (and this fix is needed for a
  book that is currently in production).
* Improves subchapter handling logic, now finding the main article and
  including any non-bibliography top-level sections in the resultant
  chapter.
* Adds tests around all these things (coverage still 100%)

I think it's an open question whether or not we want to allow multiple
top-level sections in a main file (at least for the purposes of
Atlas-only builds), but we'll call that outside the scope for now.

Update: Refactor and also handle bibliographies better. The refactor
exposed that we weren't handling bibliography _files_ well, so I
updated the logic to basically say "if we don't have a non-bib
section, but we do have a bib section, make that the chapter."
diff --git a/jupyter_book_to_htmlbook/file_processing.py b/jupyter_book_to_htmlbook/file_processing.py
@@ -135,22 +135,66 @@ def apply_datatype(chapter, ch_name):
     return chapter
 
 
+def get_top_level_sections(soup):
+    """
+    Helper utility to grab top-level sections in main <article>. Returns
+    all but bibliography sections
+    """
+    section_wrappers = soup.find_all("article", attrs={"role": "main"})
+
+    # test case for partial files, not expected in production
+    if len(section_wrappers) == 0:
+        sections = soup.find_all('section')
+    elif len(section_wrappers) != 1:
+        article = soup.find('article', attrs={"role": "main"})
+        try:
+            main_title = article.find('h1').get_text()
+        except AttributeError:
+            main_title = soup.find("h1")
+        print("Warning: " +
+              f"The chapter with title '{main_title}' is malformed.")
+        return None, None
+    else:
+        main = section_wrappers[0]
+        sections = []
+
+        for element in main.children:
+            if (
+                    element.name == "section" and
+                    element.get('id') != "bibliography"
+               ):
+                sections.append(element)
+
+    return sections
+
+
 def get_main_section(soup):
     """
     Gets the main "section," or the main chapter text, and additionally
     checks to see if there is a separate bibliography section, returning
     that if it exists to be dealt with later.
     """
-    sections = soup.find_all('section')
+    sections = get_top_level_sections(soup)
+
     try:
         main = sections[0]
-    except IndexError:  # does not have a section class for top-level
-        logging.warning("Looks like {toc_element.name} is malformed.")
-        return None, None
+    except IndexError:
+        main = None
+
     if len(sections) > 1:
-        bibliography = soup.find('section', id="bibliography")
-    else:
-        bibliography = None
+        article = soup.find('article', attrs={"role": "main"})
+        try:
+            main_title = article.find('h1').get_text()
+        except AttributeError:
+            main_title = soup.find("h1")
+        err_msg = f"The chapter with title '{main_title}' " + \
+                  "has extra <section>s " + \
+                  "that will not be processed. Please check the " + \
+                  "notebook source files."
+        logging.warning(err_msg)
+        print(err_msg)
+    bibliography = soup.find('section', id="bibliography")
+
     return main, bibliography
 
 
@@ -172,11 +216,14 @@ def process_chapter_soup(
 
     # perform initial swapping and namespace designation
     chapter, bib = get_main_section(base_soup)
+    if bib and not chapter:  # bibs can be their own chapters
+        chapter = bib
+        bib = None
 
     if not chapter:  # guard against malformed files
         logging.warning(f"Failed to process {toc_element}.")
         raise RuntimeError(
-            f"Failed to process {toc_element}. Please check for error in " +
+            f"Failed to process {toc_element}. Please check for errors in " +
             "your source file(s). Contact the Tools team for additional " +
             "support.")
 
@@ -189,8 +236,10 @@ def process_chapter_soup(
 
         if chapter_parts:
             for subfile in chapter_parts:
-                subsection, sub_bib = process_chapter_subparts(subfile)
-                chapter.append(subsection)
+                subsections, sub_bib = process_chapter_subparts(subfile)
+                if subsections:
+                    for subsection in subsections:
+                        chapter.append(subsection)
                 if bib and sub_bib:
                     entries = sub_bib.find_all("dd")  # type: ignore
                     bib.dl.extend(entries)  # type: ignore
@@ -211,19 +260,24 @@ def process_chapter_subparts(subfile):
     """ processing for chapters with "sections" """
     with open(subfile, 'r') as f:
         soup = BeautifulSoup(f, 'lxml')
-        section, bib = get_main_section(soup)
-        section['data-type'] = 'sect1'  # type: ignore
-        del section['class']  # type: ignore
-        # move id from empty span to section
-        try:
-            section['id'] = section.select_one('span')['id']  # type: ignore
-        except TypeError:
-            # fun fact, this happens when there's not numbering on the toc
-            pass  # like before, if it's not there that's OK.
-        except KeyError:
-            # fun fact, this happens when there is numbering on the toc
-            pass  # like before, if it's not there that's OK.
-    return section, bib
+        top_level_sections = get_top_level_sections(soup)
+
+        for section in top_level_sections:
+            section['data-type'] = 'sect1'  # type: ignore
+            del section['class']  # type: ignore
+            # move id from empty span to section
+            try:
+                section['id'] = section.select_one(  # type: ignore
+                                    'span')['id']
+            except TypeError:
+                # this happens when there's not numbering on the toc
+                pass  # like before, if it's not there that's OK.
+            except KeyError:
+                # fun fact, this happens when there is numbering on the toc
+                pass  # like before, if it's not there that's OK.
+        bibliography = soup.find('section', id="bibliography")
+
+    return top_level_sections, bibliography
 
 
 def process_chapter(toc_element,
diff --git a/tests/test_file_processing.py b/tests/test_file_processing.py
@@ -29,6 +29,25 @@ def test_compile_chapter_parts_happy_path(self, tmp_book_path):
         number_of_sections_expected = 2  # the first html file doesn't get one
         assert number_of_sections == number_of_sections_expected
 
+    def test_compile_chapter_parts_parts_with_many_h1s(self, tmp_book_path):
+        """
+        We should ensure that subsequent A-level headings inside subchapter
+        files aren't getting dropped from the book
+        """
+        result = process_chapter_soup([
+            tmp_book_path / 'notebooks/ch02.00.html',
+            tmp_book_path / 'notebooks/ch02.01.html',
+            tmp_book_path / 'notebooks/ch02.02.html',
+            tmp_book_path / 'notebooks/many_a_levels.html',
+            ])[0]
+        # the resulting section should have a data-type of "chapter"
+        assert result["data-type"] == "chapter"
+        # number of level-1 subsections should be one less than the group
+        number_of_sections = len(
+                result.find_all(attrs={"data-type": "sect1"}))
+        number_of_sections_expected = 4  # the first html file doesn't get one
+        assert number_of_sections == number_of_sections_expected
+
     def test_process_chapter_single_chapter_file(self, tmp_book_path):
         """
         happy path for chapter processing a single chapter file
@@ -45,6 +64,39 @@ def test_process_chapter_single_chapter_file(self, tmp_book_path):
         # check on return
         assert "ch01.html" in result
 
+    def test_process_chapter_single_file_with_multiple_h1s(self,
+                                                           tmp_book_path,
+                                                           caplog,
+                                                           capsys):
+        """
+        Edge case in which a single chapter has multiple top-level sections,
+        (but the subsequent one is not a bibliography); we want to ensure that
+        the error is logged as well as printed
+        """
+        test_env = tmp_book_path / 'notebooks'
+        test_out = test_env / 'output'
+        test_out.mkdir()
+        caplog.set_level(logging.DEBUG)
+
+        process_chapter((test_env / 'many_a_levels.html'),
+                        test_env, test_out)
+        log = caplog.text
+        assert "will not be processed" in capsys.readouterr().out
+        assert "will not be processed" in log
+
+    def test_process_chapter_single_file_bibliogrpahy(self,
+                                                      tmp_book_path):
+        """
+        Bibliography files should act like normal chapters
+        """
+        test_env = tmp_book_path
+        test_out = test_env / 'output'
+        test_out.mkdir()
+
+        result = process_chapter((test_env / 'bibliography.html'),
+                                 test_env, test_out)
+        assert "bibliography.html" in result
+
     def test_chapter_promote_headings(self, tmp_path):
         """
         we expect to have a single h1 and then a bunch of h2s