Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
10 changes: 10 additions & 0 deletions .changelog/_unreleased.toml
Original file line number Diff line number Diff line change
@@ -1,3 +1,13 @@
[[entries]]
id = "5be79248-7b86-465d-953c-d0c69ab64e8a"
type = "improvement"
description = "Implement support for NumPy-style docstrings"
author = "celsiusnarhwal"
pr = "https://github.com/NiklasRosenstein/pydoc-markdown/pull/279"
issues = [
"https://github.com/celsiusnarhwal/pydoc-markdown/issues/251",
]

[[entries]]
id = "4409675c-ea67-4c56-be5a-a7310f779c15"
type = "improvement"
Expand Down
1 change: 1 addition & 0 deletions pyproject.toml
Original file line number Diff line number Diff line change
Expand Up @@ -38,6 +38,7 @@ tomli = "^2.0.0"
tomli_w = "^1.0.0"
yapf = ">=0.30.0"
watchdog = "*"
numpydoc = "^1.5.0"

[tool.poetry.dev-dependencies]
pytest = "*"
Expand Down
4 changes: 2 additions & 2 deletions readme.md
Original file line number Diff line number Diff line change
Expand Up @@ -31,8 +31,8 @@ If you plan on using the [Novella][] integration, you may want to install it as:

### Features 🌟

* Understands multiple documentation styles (Sphinx, Google, Pydoc-Markdown specific) and converts them to properly
formatted Markdown
* Understands multiple documentation styles (Sphinx, Google, NumPy, Pydoc-Markdown specific) and converts them to
properly formatted Markdown
* Can parse docstrings for variables thanks to [docspec][] (`#:` block before or string literal after the statement)
* Generates links to other API objects per the documentation syntax (e.g. `#OtherClass` for the Pydoc-Markdown style)
* Configure the output using a YAML file or `pyproject.toml`, then you're only one command away from generating the
Expand Down
255 changes: 255 additions & 0 deletions src/pydoc_markdown/contrib/processors/numpy.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,255 @@
# -*- coding: utf8 -*-
# Copyright (c) 2019 Niklas Rosenstein
#
# Permission is hereby granted, free of charge, to any person obtaining a copy
# of this software and associated documentation files (the "Software"), to
# deal in the Software without restriction, including without limitation the
# rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
# sell copies of the Software, and to permit persons to whom the Software is
# furnished to do so, subject to the following conditions:
#
# The above copyright notice and this permission notice shall be included in
# all copies or substantial portions of the Software.
#
# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
# AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
# FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
# IN THE SOFTWARE.

from __future__ import annotations

import dataclasses
import itertools
import re
import typing as t
import warnings
from contextlib import contextmanager

import docspec
from numpydoc.docscrape import NumpyDocString, Parameter # type: ignore[import]
from numpydoc.validate import validate # type: ignore[import]

from pydoc_markdown.interfaces import Processor, Resolver


@contextmanager
def _filter_numpydoc_warnings(action: warnings._ActionKind):
warnings.filterwarnings(action, module="numpydoc.docscrape")
yield
warnings.resetwarnings()


class _DocstringWrapper:
    # Wraps docstrings for use with numpydoc.validate.validate().
    # Intentionally has no class docstring: check_docstring_format() assigns the
    # docstring under test to __doc__ before each validation call.
    # NOTE(review): validate() appears to resolve this dotted name to locate the
    # object, so __qualname__ must spell out the full import path — confirm
    # against numpydoc.validate's lookup behavior.
    __qualname__ = "pydoc_markdown.contrib.processors.numpy._DocstringWrapper"


@dataclasses.dataclass
class NumpyProcessor(Processor):
    # numpydoc doesn't like when a heading appears twice in the same docstring so we have to use <span> tags to
    # keep numpydoc from recognizing the example headings. This also means the example code block has to be
    # delineated with HTML tags instead of Markdown syntax.
    """
    This processor parses NumPy-style docstrings and converts them to Markdown syntax.

    References
    ----------
    - https://numpydoc.readthedocs.io/en/latest/format.html

    Examples
    --------
    <pre>
    <code>
    <span>Parameters</span>
    ----------
    arg: str
        This argument should be a string.

    <span>Raises</span>
    ------
    ValueError
        If *arg* is not a string.

    <span>Returns</span>
    -------
    int
        The length of the string.
    </code>
    </pre>

    Renders as:

    Parameters
    ----------
    arg : str
        This argument should be a string.

    Raises
    ------
    ValueError
        If *arg* is not a string.

    Returns
    -------
    int
        The length of the string.

    @doc:fmt:numpy
    """

    # Maps each rendered output section title to the numpydoc section names whose
    # parsed contents feed into it. ClassVar so the dataclass machinery does not
    # treat it as an instance field.
    _SECTION_MAP: t.ClassVar[t.Dict[str, t.List[str]]] = {
        "Summary": ["Summary", "Extended Summary"],
        "Arguments": ["Parameters", "Other Parameters"],
        "Returns": ["Returns"],
        "Yields": ["Yields"],
        "Receives": ["Receives"],
        "Attributes": ["Attributes"],
        "Methods": ["Methods"],
        "Raises": ["Raises"],
        "Warns": ["Warns"],
        "Warnings": ["Warnings"],
        "See Also": ["See Also"],
        "Notes": ["Notes"],
        "References": ["References"],
        "Examples": ["Examples"],
    }

    @staticmethod
    def check_docstring_format(docstring: str) -> bool:
        """Return True if *docstring* validates cleanly as a NumPy-style docstring."""
        _DocstringWrapper.__doc__ = docstring

        # Escalate numpydoc's parser warnings to errors so a malformed docstring is
        # reported as "not NumPy style" rather than silently half-parsed.
        with _filter_numpydoc_warnings("error"):
            try:
                # validate() returns a report dict; an empty "Errors" list means the
                # docstring is well-formed.
                return not validate(_DocstringWrapper.__qualname__).get("Errors")
            except Warning:
                return False

    def process(self, modules: t.List[docspec.Module], resolver: t.Optional[Resolver]) -> None:
        """Convert the NumPy-style docstrings of all API objects in *modules* to Markdown."""
        docspec.visit(modules, self._process)

    def _process(self, node: docspec.ApiObject) -> None:
        """Rewrite *node*'s docstring content in place as Markdown. No-op without a docstring."""
        if not node.docstring:
            return

        docstring = NumpyDocString(node.docstring.content)
        lines: t.List[str] = []

        # Filter self._SECTION_MAP to only include sections used in the docstring
        active_sections = {k: v for k, v in self._SECTION_MAP.items() if any(docstring.get(sec) for sec in v)}

        # numpydoc is opinionated when it comes to section order so we have to preserve the order of the original
        # docstring ourselves

        # First, we create a regex pattern to match all section headings in the docstring. A heading is the
        # keyword followed by a line of hyphens of the same length.
        keyword_regex = re.compile(
            "|".join(
                rf"{keyword}(?:\r?\n)-{{{len(keyword)}}}" for keyword in itertools.chain(*active_sections.values())
            )
        )

        # Second, we strip each pattern match of hyphens and whitespace, leaving just the keyword
        keyword_matches = [match.replace("-", "").strip() for match in keyword_regex.findall(node.docstring.content)]

        # Third, we determine the section order in the eventual output based on the order of the headings in the
        # original docstring (but always starting with the summary, which has no heading of its own)
        section_order = [
            "Summary",
            *[next(key for key, value in active_sections.items() if keyword in value) for keyword in keyword_matches],
        ]

        # Finally, we sort active_sections according to the section order we just determined
        for section, keywords in sorted(active_sections.items(), key=lambda x: section_order.index(x[0])):
            lines.extend(self._get_section_contents(docstring, section, keywords))

        node.docstring.content = "\n".join(lines)

    def _get_section_contents(self, docstring: NumpyDocString, section: str, keywords: list) -> list[str]:
        """Render the output *section* from the parsed numpydoc sections named in *keywords*."""
        # One entry per numpydoc section; each entry is that section's parsed content.
        # (The original wrapped this in a single-argument itertools.chain(), which is a no-op.)
        contents = [docstring.get(sec) for sec in keywords]

        if section == "Summary":
            return self._parse_summary(contents)
        else:
            # contents needs to be flattened for all sections aside from Summary
            contents = list(itertools.chain(*contents))
            if section in ["Notes", "References"]:
                return self._parse_notes_and_references(section, contents)
            elif section == "Examples":
                return self._parse_examples(contents)
            elif section == "See Also":
                return self._parse_see_also(contents)
            elif any(isinstance(item, Parameter) for item in contents):
                return self._parse_parameters(section, contents)
            else:
                return [f"\n**{section}**\n", *contents] if contents else []

    @staticmethod
    def _parse_summary(contents: list[str]) -> list[str]:
        """Join the Summary and (optional) Extended Summary sections with a blank line."""
        summary, extended = contents
        return [*summary, "", *extended] if extended else [*summary]

    @staticmethod
    def _parse_parameters(section: str, parameters: list[Parameter]) -> list[str]:
        """Render a parameter-style section (Parameters, Returns, Raises, ...) as a Markdown bullet list.

        Each numpydoc Parameter is a (name, type, description-lines) triple; any of
        the three may be empty, so every combination gets its own bullet format.
        """
        lines = []

        for param in parameters:
            name, cls, desc = param
            desc = "\n".join(desc)

            if name and cls and desc:
                lines.append(f"* **{name}** (`{cls}`): {desc}")
            elif name and cls:
                lines.append(f"* **{name}** (`{cls}`)")
            elif name and desc:
                lines.append(f"* **{name}**: {desc}")
            elif cls and desc:
                lines.append(f"* `{cls}`: {desc}")
            elif name:
                lines.append(f"* **{name}**")
            elif cls:
                lines.append(f"* `{cls}`")
            elif desc:
                lines.append(f"* {desc}")

        return [f"\n**{section}**\n", *lines] if lines else []

    @staticmethod
    def _parse_notes_and_references(section: str, contents: list[str]) -> list[str]:
        """Convert reST citation markers ([id]_ in Notes, .. [id] in References) to Markdown."""
        content_string = "\n".join(contents)
        # Raw string: the original pattern used a plain string with "\." / "\[" / "\w",
        # which are invalid escape sequences in ordinary string literals.
        citations = re.compile(r"(\.\. )?\[(?P<ref_id>\w+)][_ ]?")

        # Notes render citations as superscripts; References render them as a numbered list.
        replacements = {"Notes": "<sup>{ref_id}</sup>", "References": "{ref_id}. "}

        for match in citations.finditer(content_string):
            ref_id = match.group("ref_id")
            content_string = content_string.replace(match.group(0), replacements[section].format(ref_id=ref_id))

        return [f"\n**{section}**\n", *content_string.splitlines()]

    @staticmethod
    def _parse_examples(contents: list[str]) -> list[str]:
        # Wraps doctests in Python codeblocks and leaves all other content as is
        doctests = re.compile(r"(>>>(?:.+(?:\r?\n|$))+)", flags=re.MULTILINE)
        # Raw replacement string: "\g<0>" in a plain string literal is an invalid
        # escape sequence. re.sub expands \n and \g<0> in raw replacement templates.
        return [
            "\n**Examples**\n",
            *doctests.sub(r"```python\n\g<0>\n```", "\n".join(contents)).splitlines(),
        ]

    @staticmethod
    def _parse_see_also(contents: list[tuple]) -> list[str]:
        """Render See Also entries as bullets of reST-style roles with optional descriptions."""
        lines = []

        for group in contents:
            sublines = []
            # Each group is (list of (name, role) pairs, description-lines).
            objs, desc = group

            sublines.append("* " + ", ".join([f":{obj[1]}:`{obj[0]}`" if obj[1] else f"{obj[0]}" for obj in objs]))

            if desc:
                sublines[-1] += ": " + "\n".join(desc)

            lines.extend(sublines)

        return ["\n**See Also**\n", *lines]
2 changes: 1 addition & 1 deletion src/pydoc_markdown/contrib/processors/pydocmd.py
Original file line number Diff line number Diff line change
Expand Up @@ -73,7 +73,7 @@ class PydocmdProcessor(Processor):
def process(self, modules: t.List[docspec.Module], resolver: t.Optional[Resolver]) -> None:
docspec.visit(modules, self._process)

def _process(self, node: docspec.ApiObject):
def _process(self, node: docspec.ApiObject) -> None:
if not node.docstring:
return
lines = []
Expand Down
49 changes: 42 additions & 7 deletions src/pydoc_markdown/contrib/processors/smart.py
Original file line number Diff line number Diff line change
Expand Up @@ -20,26 +20,42 @@
# IN THE SOFTWARE.

import dataclasses
import logging
import typing as t

import docspec
from typing_extensions import Protocol

from pydoc_markdown.contrib.processors.google import GoogleProcessor
from pydoc_markdown.contrib.processors.numpy import NumpyProcessor
from pydoc_markdown.contrib.processors.pydocmd import PydocmdProcessor
from pydoc_markdown.contrib.processors.sphinx import SphinxProcessor
from pydoc_markdown.interfaces import Processor, Resolver

logger = logging.getLogger(__name__)


class DelegatableProcessor(Protocol):
def _process(self, node: docspec.ApiObject) -> None:
...


class CheckCapableProcessor(DelegatableProcessor, Protocol):
def check_docstring_format(self, docstring: str) -> bool:
...


@dataclasses.dataclass
class SmartProcessor(Processor):
"""
This processor picks the #GoogleProcessor, #SphinxProcessor or #PydocmdProcessor after
This processor picks the #GoogleProcessor, #SphinxProcessor, #PydocmdProcessor, or #NumpyProcessor after
guessing which is appropriate from the syntax it finds in the docstring.
"""

google: GoogleProcessor = dataclasses.field(default_factory=GoogleProcessor)
pydocmd: PydocmdProcessor = dataclasses.field(default_factory=PydocmdProcessor)
sphinx: SphinxProcessor = dataclasses.field(default_factory=SphinxProcessor)
numpy: NumpyProcessor = dataclasses.field(default_factory=NumpyProcessor)

def process(self, modules: t.List[docspec.Module], resolver: t.Optional[Resolver]) -> None:
docspec.visit(modules, self._process)
Expand All @@ -48,14 +64,33 @@ def _process(self, obj: docspec.ApiObject):
if not obj.docstring:
return None

for name in ("google", "pydocmd", "sphinx"):
object_name = ".".join(x.name for x in obj.path)
object_type = type(obj).__name__

processors: t.List[t.Tuple[str, DelegatableProcessor]] = [
("sphinx", self.sphinx),
("google", self.google),
("numpy", self.numpy),
("pydocmd", self.pydocmd),
]

checkable_processors: t.List[t.Tuple[str, CheckCapableProcessor]] = [
("sphinx", self.sphinx),
("google", self.google),
("numpy", self.numpy),
]

for name, processor in processors:
indicator = "@doc:fmt:" + name
if indicator in obj.docstring.content:
logger.info("Using `%s` processor for %s `%s` (explicit)", name, object_type, object_name)
obj.docstring.content = obj.docstring.content.replace(indicator, "")
return getattr(self, name)._process(obj)
return processor._process(obj)

for name, processor in checkable_processors:
if processor.check_docstring_format(obj.docstring.content):
logger.info("Using `%s` processor for %s `%s` (detected)", name, object_type, object_name)
return processor._process(obj)

if self.sphinx.check_docstring_format(obj.docstring.content):
return self.sphinx._process(obj)
if self.google.check_docstring_format(obj.docstring.content):
return self.google._process(obj)
logger.info("Using `pydocmd` processor for %s `%s` (default)", name, object_type, object_name)
return self.pydocmd._process(obj)
1 change: 1 addition & 0 deletions test/processors/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -11,4 +11,5 @@ def assert_processor_result(processor, docstring, expected_output):
)
processor.process([module], None)
assert module.docstring
print(module.docstring.content)
assert_text_equals(module.docstring.content, textwrap.dedent(expected_output))
Loading