Add handling for non-anchor xref spans

delfanbaum · delfanbaum · commit b4ce507e016d · 2023-01-13T13:21:32.000-05:00
If Jupyter Book can't find a target for a given `{ref}`, it will create
a `<span>` tag with `xref` and `std-ref` classes applied to it.

Since our formal example workaround adds targets after the fact, that
means that any reference to a formal example is going to be a `<span>`
instead of an `<a>`; only an `<a>` would get picked up and correctly converted
by our `process_internal_refs` function (which has been renamed in this
commit, since there was a typo). This commit/PR adds code to handle
these references so authors can xref to the formal code examples, as
well as tests to ensure that the conversion happens correctly both on its
own and in chapter processing.
diff --git a/jupyter_book_to_htmlbook/file_processing.py b/jupyter_book_to_htmlbook/file_processing.py
@@ -8,7 +8,8 @@
 from .footnote_processing import process_footnotes
 from .math_processing import process_math
 from .reference_processing import (
-        process_interal_refs,
+        process_internal_refs,
+        process_remaining_refs,
         process_ids,
         process_citations,
         add_glossary_datatypes
@@ -239,7 +240,7 @@ def process_chapter(toc_element,
     # note: must process figs before xrefs
     chapter = process_figures(chapter, build_dir)
     chapter = process_informal_figs(chapter, build_dir)
-    chapter = process_interal_refs(chapter)
+    chapter = process_internal_refs(chapter)
     chapter = process_citations(chapter)
     chapter = process_footnotes(chapter)
     chapter = process_admonitions(chapter)
@@ -250,10 +251,13 @@ def process_chapter(toc_element,
     chapter = move_span_ids_to_sections(chapter)
     chapter = process_sidebars(chapter)
     chapter = process_subsections(chapter)
+    # finally, process any remaining xrefs
+    chapter = process_remaining_refs(chapter)
 
     if chapter.get("data-type") == "glossary":
         add_glossary_datatypes(chapter)
 
+    # ensure we have unique IDs across the book
     chapter, ids = process_ids(chapter, book_ids)
 
     # write the file, preserving any directory structure(s) from source
diff --git a/jupyter_book_to_htmlbook/reference_processing.py b/jupyter_book_to_htmlbook/reference_processing.py
@@ -4,7 +4,7 @@
 from .helpers import base_soup
 
 
-def process_interal_refs(chapter):
+def process_internal_refs(chapter):
     """
     Processes internal a tags with "reference internal" classes.
     Converts bib references into spans (to deal with later), and other
@@ -45,6 +45,26 @@ def process_interal_refs(chapter):
     return chapter
 
 
+def process_remaining_refs(chapter):
+    """
+    Convert leftover "xref"-classed spans (i.e., those that Jupyter can't
+    find targets for) into `<a data-type="xref">` links; returns `chapter`.
+    """
+    xrefs = chapter.find_all("span", class_="xref")
+    for ref in xrefs:
+        # only when the span has a `.string` with no spaces; it becomes the id
+        if ref.string and ref.string.find(" ") == -1:
+            ref.name = "a"
+            ref["data-type"] = "xref"
+            ref["href"] = f"#{ref.string}"
+            ref.string = ref.get("href")  # link text mirrors the href
+        else:  # in the unlikely case of a badly formatted xref
+            logging.warning(
+                f"Failed to apply xref formatting to {ref}.")
+
+    return chapter
+
+
 def process_ids(chapter, existing_ids=[]):
     """
     Checks a list of IDs against ids that are already being used in the
diff --git a/tests/test_reference_processing.py b/tests/test_reference_processing.py
@@ -1,6 +1,11 @@
 import logging
+import shutil
 from bs4 import BeautifulSoup  # type: ignore
-from jupyter_book_to_htmlbook.reference_processing import process_interal_refs
+from jupyter_book_to_htmlbook.file_processing import process_chapter
+from jupyter_book_to_htmlbook.reference_processing import (
+        process_internal_refs,
+        process_remaining_refs
+    )
 
 
 class TestInternalRefs:
@@ -11,7 +16,7 @@ def test_process_internal_refs_reg_xrefs(self):
         chapter_text = """<a class="reference internal" href="example.html">
         cross reference text</a>"""
         chapter = BeautifulSoup(chapter_text, 'html.parser')
-        result = process_interal_refs(chapter)
+        result = process_internal_refs(chapter)
         assert str(result) == '<a class="reference internal" data-type=' + \
                               '"xref" href="#example.html">#example.html</a>'
 
@@ -29,7 +34,7 @@ def test_process_internal_refs_bibliograpy(self):
 title="Terry Carver...">Carver, 1993</a>]</span>.</p>
 """
         chapter = BeautifulSoup(text, 'html.parser')
-        result = process_interal_refs(chapter)
+        result = process_internal_refs(chapter)
         assert not result.find("a")
         assert "(Baruch 1993)" in result.find("span").contents
 
@@ -42,7 +47,72 @@ def test_alert_on_external_images(self, caplog):
     href="http://example.com/example.png"><img alt="example"
     src="http://example.com/example.png" style="width:100px" /></a>"""
         chapter = BeautifulSoup(chapter_text, 'html.parser')
-        result = process_interal_refs(chapter)
+        result = process_internal_refs(chapter)
         assert result == chapter
         caplog.set_level(logging.DEBUG)
         assert "External image reference:" in caplog.text
+
+
+class TestStandardRefs:
+    """
+    Tests around "std-ref" references, which appear as spans (in the case
+    where Jupyter Book can't find the actual reference).
+    """
+    def test_process_xref_spans(self):
+        """
+        It appears that when an xref doesn't have a target jupyter knows about
+        (e.g., in the case of examples), it puts them into spans. We should
+        check for these and then convert them appropriately.
+        """
+        chapter = BeautifulSoup("""<p>And here follows a formal code example
+(<span class="xref std std-ref">code_example</span>).
+Note that the cell has an “example” tag added to its metadata.</p>""",
+                                "html.parser")
+        result = process_remaining_refs(chapter)
+        xref = result.find("a", class_="xref")
+        assert xref
+        assert xref.get('data-type') == "xref"
+        assert xref.get('href') == "#code_example"
+        assert xref.string == "#code_example"
+
+    def test_process_xref_spans_bad_ref(self, caplog):
+        """
+        In the unlikely case wherein we get a bad xref (i.e., one with
+        spaces or code in it), we log that failure and do nothing
+        """
+        chapter = BeautifulSoup("""<p>And here follows a formal code example
+(<span class="xref std std-ref">code example</span>). Another is
+<span class="xref std std-ref"><span>some_</span><em>code_example</em></span>.
+Note that the cell has an “example” tag added to its metadata.</p>""",
+                                "html.parser")
+        process_remaining_refs(chapter)
+        caplog.set_level(logging.DEBUG)  # NOTE(review): set after the call
+        log = caplog.text
+        assert "Failed to apply" in log
+        assert "code example" in log
+        assert "<em>code_example</em>" in log
+
+    def test_examples_refs_in_chapter_processing(self, tmp_path):
+        """
+        More an integration test, ensuring that when we process a chapter
+        the written output contains an `a.xref` link carrying the expected
+        href, data-type, and link text
+        """
+        test_env = tmp_path / 'tmp'
+        test_out = test_env / 'output'
+        test_env.mkdir()
+        test_out.mkdir()
+        shutil.copytree('tests/example_book/_build/html/notebooks',
+                        test_env, dirs_exist_ok=True)
+
+        process_chapter(test_env / "code_py.html",
+                        test_env, test_out)
+        with open(test_out / 'code_py.html') as f:
+            soup = BeautifulSoup(f.read(), "html.parser")
+
+        xref = soup.find("a", class_="xref")
+        assert xref
+        assert xref.get("href") == "#hello_tim"
+        assert xref.get("data-type") == "xref"
+        assert xref.get("href") == "#hello_tim"  # NOTE(review): duplicate
+        assert xref.string == "#hello_tim"