docs: add Google-style docstring parser (#283)

barjin · web-flow · commit 2e000e9c6889 · 2024-09-25T10:53:11.000+02:00
Replaces the default `pydoc-markdown` shell script with a custom Python
script calling the `pydoc-markdown` API directly. A custom patch of
`GoogleProcessor` allows us to parse the Google-style docstrings and
render the parameter comments with the actual parameters.
diff --git a/website/build_api_reference.sh b/website/build_api_reference.sh
@@ -11,7 +11,7 @@ sed_no_backup() {
 }
 
 # Create docspec dump of this package's source code through pydoc-markdown
-pydoc-markdown --quiet --dump > docspec-dump.jsonl
+python ./pydoc-markdown/generate_ast.py > docspec-dump.jsonl
 sed_no_backup "s#${PWD}/..#REPO_ROOT_PLACEHOLDER#g" docspec-dump.jsonl
 
 # Create docspec dump from the right version of the apify-shared package
diff --git a/website/docusaurus.config.js b/website/docusaurus.config.js
@@ -66,6 +66,9 @@ module.exports = {
                     rehypePlugins: [externalLinkProcessor],
                     editUrl: 'https://github.com/apify/apify-sdk-python/edit/master/website/',
                 },
+                theme: {
+                    customCss: require.resolve('./src/css/custom.css'),
+                },
             }),
         ],
     ]),
diff --git a/website/pydoc-markdown/__init__.py b/website/pydoc-markdown/__init__.py
diff --git a/website/pydoc-markdown/generate_ast.py b/website/pydoc-markdown/generate_ast.py
@@ -0,0 +1,46 @@
+"""
+Replaces the default pydoc-markdown shell script with a custom Python script calling the pydoc-markdown API directly.
+
+This script generates an AST from the Python source code in the `src` directory and prints it as a JSON array of docspec module dumps.
+"""
+
+from pydoc_markdown.interfaces import Context
+from pydoc_markdown.contrib.loaders.python import PythonLoader
+from pydoc_markdown.contrib.processors.filter import FilterProcessor
+from pydoc_markdown.contrib.processors.crossref import CrossrefProcessor
+from pydoc_markdown.contrib.renderers.markdown import MarkdownReferenceResolver
+from google_docstring_processor import ApifyGoogleProcessor
+from docspec import dump_module
+
+import json
+import os
+
+project_path = os.path.join(os.path.dirname(os.path.realpath(__file__)), '../../src')
+
+context = Context(directory='.')
+loader = PythonLoader(search_path=[project_path])
+filter = FilterProcessor(
+    documented_only=False,
+    skip_empty_modules=False,
+)
+crossref = CrossrefProcessor()
+google = ApifyGoogleProcessor()
+
+loader.init(context)
+filter.init(context)
+google.init(context)
+crossref.init(context)
+
+processors = [filter, google, crossref]
+
+dump = []
+
+modules = list(loader.load())
+
+for processor in processors:
+    processor.process(modules, None)
+
+for module in modules:
+    dump.append(dump_module(module))
+
+print(json.dumps(dump, indent=4))
diff --git a/website/pydoc-markdown/google_docstring_processor.py b/website/pydoc-markdown/google_docstring_processor.py
@@ -0,0 +1,183 @@
+# -*- coding: utf8 -*-
+# Copyright (c) 2019 Niklas Rosenstein
+# !!! Modified 2024 Jindřich Bär
+#
+# Permission is hereby granted, free of charge, to any person obtaining a copy
+# of this software and associated documentation files (the "Software"), to
+# deal in the Software without restriction, including without limitation the
+# rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+# sell copies of the Software, and to permit persons to whom the Software is
+# furnished to do so, subject to the following conditions:
+#
+# The above copyright notice and this permission notice shall be included in
+# all copies or substantial portions of the Software.
+#
+# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+# AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
+# FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
+# IN THE SOFTWARE.
+
+import dataclasses
+import re
+import typing as t
+
+import docspec
+
+from pydoc_markdown.contrib.processors.sphinx import generate_sections_markdown
+from pydoc_markdown.interfaces import Processor, Resolver
+
+import json
+
+
+@dataclasses.dataclass
+class ApifyGoogleProcessor(Processor):
+    """
+    This class implements the preprocessor for Google and PEP 257 docstrings. It converts
+    docstrings formatted in the Google docstyle to Markdown syntax.
+
+    References:
+
+    * https://sphinxcontrib-napoleon.readthedocs.io/en/latest/example_google.html
+    * https://www.python.org/dev/peps/pep-0257/
+
+    Example:
+
+    ```
+    Attributes:
+        module_level_variable1 (int): Module level variables may be documented in
+            either the ``Attributes`` section of the module docstring, or in an
+            inline docstring immediately following the variable.
+
+            Either form is acceptable, but the two should not be mixed. Choose
+            one convention to document module level variables and be consistent
+            with it.
+
+    Todo:
+        * For module TODOs
+        * You have to also use ``sphinx.ext.todo`` extension
+    ```
+
+    Renders as:
+
+    Attributes:
+        module_level_variable1 (int): Module level variables may be documented in
+            either the ``Attributes`` section of the module docstring, or in an
+            inline docstring immediately following the variable.
+
+            Either form is acceptable, but the two should not be mixed. Choose
+            one convention to document module level variables and be consistent
+            with it.
+
+    Todo:
+        * For module TODOs
+        * You have to also use ``sphinx.ext.todo`` extension
+
+    @doc:fmt:google
+    """
+
+    _param_res = [
+        re.compile(r"^(?P<param>\S+):\s+(?P<desc>.+)$"),
+        re.compile(r"^(?P<param>\S+)\s+\((?P<type>[^)]+)\):\s+(?P<desc>.+)$"),
+        re.compile(r"^(?P<param>\S+)\s+--\s+(?P<desc>.+)$"),
+        re.compile(r"^(?P<param>\S+)\s+\{\[(?P<type>\S+)\]\}\s+--\s+(?P<desc>.+)$"),
+        re.compile(r"^(?P<param>\S+)\s+\{(?P<type>\S+)\}\s+--\s+(?P<desc>.+)$"),
+    ]
+
+    _keywords_map = {
+        "Args:": "Arguments",
+        "Arguments:": "Arguments",
+        "Attributes:": "Attributes",
+        "Example:": "Example",
+        "Examples:": "Examples",
+        "Keyword Args:": "Arguments",
+        "Keyword Arguments:": "Arguments",
+        "Methods:": "Methods",
+        "Note:": "Notes",
+        "Notes:": "Notes",
+        "Other Parameters:": "Arguments",
+        "Parameters:": "Arguments",
+        "Return:": "Returns",
+        "Returns:": "Returns",
+        "Raises:": "Raises",
+        "References:": "References",
+        "See Also:": "See Also",
+        "Todo:": "Todo",
+        "Warning:": "Warnings",
+        "Warnings:": "Warnings",
+        "Warns:": "Warns",
+        "Yield:": "Yields",
+        "Yields:": "Yields",
+    }
+
+    def check_docstring_format(self, docstring: str) -> bool:
+        for section_name in self._keywords_map:
+            if section_name in docstring:
+                return True
+        return False
+
+    def process(self, modules: t.List[docspec.Module], resolver: t.Optional[Resolver]) -> None:
+        docspec.visit(modules, self._process)
+
+    def _process(self, node: docspec.ApiObject):
+        if not node.docstring:
+            return
+
+        lines = []
+        sections = []
+        current_lines: t.List[str] = []
+        in_codeblock = False
+        keyword = None
+        multiline_argument_offset = -1
+
+        def _commit():
+            if keyword:
+                sections.append({keyword: list(current_lines)})
+            else:
+                lines.extend(current_lines)
+            current_lines.clear()
+
+        for line in node.docstring.content.split("\n"):
+            multiline_argument_offset += 1
+            if line.lstrip().startswith("```"):
+                in_codeblock = not in_codeblock
+                current_lines.append(line)
+                continue
+
+            if in_codeblock:
+                current_lines.append(line)
+                continue
+
+            line = line.strip()
+            if line in self._keywords_map:
+                _commit()
+                keyword = self._keywords_map[line]
+                continue
+
+            if keyword is None:
+                lines.append(line)
+                continue
+
+            for param_re in self._param_res:
+                param_match = param_re.match(line)
+                if param_match:
+                    current_lines.append(param_match.groupdict())
+                    multiline_argument_offset = 0
+                    break
+
+            if not param_match:
+                if multiline_argument_offset == 1:
+                    current_lines[-1]["desc"] += "\n" + line
+                    multiline_argument_offset = 0
+                else:
+                    current_lines.append(line)
+
+        _commit()
+        node.docstring.content = json.dumps({
+            "text": "\n".join(lines),
+            "sections": sections,
+        }, indent=None)
+        
+
diff --git a/website/src/css/custom.css b/website/src/css/custom.css
@@ -0,0 +1,12 @@
+/* Adds 16px of space below every list item inside a .tsd-parameters element. */
+/* NOTE(review): presumably the parameter lists of the generated API reference - confirm. */
+.tsd-parameters li {
+    margin-bottom: 16px;
+}
+
+/* Heading of a parameters block: 16px font; !important forces the bottom margin over any competing rule. */
+.tsd-parameters-title {
+    font-size: 16px;
+    margin-bottom: 16px !important;
+}
+
+/* Heading of a returns block: same 16px font size as the parameters heading. */
+.tsd-returns-title {
+    font-size: 16px;
+}
diff --git a/website/transformDocs.js b/website/transformDocs.js
@@ -134,27 +134,6 @@ function sortChildren(typedocMember) {
     typedocMember.groups.sort((a, b) => groupSort(a.title, b.title));
 }
 
-// Parses the arguments and return value description of a method from its docstring
-function extractArgsAndReturns(docstring) {
-    const parameters = (docstring
-        .split('Args:')[1] ?? '').split('Returns:')[0] // Get the part between Args: and Returns:
-        .split(/(^|\n)\s*([\w]+)\s*\(.*?\)\s*:\s*/) // Magic regex which splits the arguments into an array, and removes the argument types
-        .filter(x => x.length > 1) // Remove empty strings
-        .reduce((acc, curr, idx, arr) => { // Collect the argument names and types into an object
-            if(idx % 2 === 0){
-                return {...acc, [curr]: arr[idx+1]} // If the index is even, the current string is an argument name, and the next string is its type
-            }
-            return acc;
-        }, {});
-
-    const returns = (docstring
-        .split('Returns:')[1] ?? '').split('Raises:')[0] // Get the part between Returns: and Raises:
-        .split(':')[1]?.trim() || undefined; // Split the return value into its type and description, return description
-
-
-    return { parameters, returns };
-}
-
 // Objects with decorators named 'ignore_docs' or with empty docstrings will be ignored
 function isHidden(member) {
     return member.decorations?.some(d => d.name === 'ignore_docs') 
@@ -211,6 +190,24 @@ function convertObject(obj, parent, module) {
                 member.name = 'Actor';
             }
 
+            let docstring = { text: member.docstring?.content ?? '' };
+            try {
+                docstring = JSON.parse(docstring.text);
+
+                docstring.args = docstring.sections.find((section) => Object.keys(section)[0] === 'Arguments')['Arguments'] ?? [];
+
+                docstring.args = docstring.args.reduce((acc, arg) => {
+                    acc[arg.param] = arg.desc;
+                    return acc;
+                }, {});
+
+                docstring.returns = docstring.sections.find((section) => Object.keys(section)[0] === 'Returns')['Returns'] ?? [];
+
+                docstring.returns = docstring.returns.join('\n');
+            } catch {
+                // Do nothing
+            }
+
             // Create the Typedoc member object
             let typedocMember = {
                 id: oid++,
@@ -222,7 +219,7 @@ function convertObject(obj, parent, module) {
                 comment: member.docstring ? {
                     summary: [{
                         kind: 'text',
-                        text: member.docstring?.content,
+                        text: docstring.text,
                     }],
                 } : undefined,
                 type: typedocType,
@@ -241,23 +238,20 @@ function convertObject(obj, parent, module) {
             }
 
             if(typedocMember.kindString === 'Method') {
-                const { parameters, returns } = extractArgsAndReturns(member.docstring?.content ?? '');
-
                 typedocMember.signatures = [{
                     id: oid++,
                     name: member.name,
                     modifiers: member.modifiers ?? [],
                     kind: 4096,
                     kindString: 'Call signature',
                     flags: {},
-                    comment: member.docstring ? {
+                    comment: docstring.text ? {
                         summary: [{
                             kind: 'text',
-                            text: member.docstring?.content
-                                .replace(/\**(Args|Arguments|Returns)[\s\S]+/, ''),
+                            text: docstring?.text,
                         }],
-                        blockTags: returns ? [
-                            { tag: '@returns', content: [{ kind: 'text', text: returns }] },
+                        blockTags: docstring?.returns ? [
+                            { tag: '@returns', content: [{ kind: 'text', text: docstring.returns }] },
                         ] : undefined,
                     } : undefined,
                     type: inferTypedocType(member.return_type),
@@ -271,10 +265,10 @@ function convertObject(obj, parent, module) {
                             'keyword-only': arg.type === 'KEYWORD_ONLY' ? 'true' : undefined,
                         },
                         type: inferTypedocType(arg.datatype),
-                        comment: parameters[arg.name] ? {
+                        comment: docstring.args?.[arg.name] ? {
                             summary: [{
                                 kind: 'text',
-                                text: parameters[arg.name]
+                                text: docstring.args[arg.name]
                             }]
                         } : undefined,
                         defaultValue: arg.default_value,
@@ -330,15 +324,14 @@ function main() {
 
     // Load the docspec dump files of this module and of apify-shared
     const thisPackageDocspecDump = fs.readFileSync('docspec-dump.jsonl', 'utf8');
-    const thisPackageModules = thisPackageDocspecDump.split('\n').filter((line) => line !== '');
+    const thisPackageModules = JSON.parse(thisPackageDocspecDump)
 
     const apifySharedDocspecDump = fs.readFileSync('apify-shared-docspec-dump.jsonl', 'utf8');
     const apifySharedModules = apifySharedDocspecDump.split('\n').filter((line) => line !== '');
 
     // Convert all the modules, store them in the root object
-    for (const module of [...thisPackageModules, ...apifySharedModules]) {
-        const parsedModule = JSON.parse(module);
-        convertObject(parsedModule, typedocApiReference, parsedModule);
+    for (const module of thisPackageModules) {
+        convertObject(module, typedocApiReference, module);
     };
 
     // Recursively fix references (collect names->ids of all the named entities and then inject those in the reference objects)

Original file line number	Diff line number	Diff line change
`@@ -11,7 +11,7 @@ sed_no_backup() {`
`11`	`11`	`}`
`12`	`12`
`13`	`13`	`# Create docspec dump of this package's source code through pydoc-markdown`
`14`		`-pydoc-markdown --quiet --dump > docspec-dump.jsonl`
	`14`	`+python ./pydoc-markdown/generate_ast.py > docspec-dump.jsonl`
`15`	`15`	`sed_no_backup "s#${PWD}/..#REPO_ROOT_PLACEHOLDER#g" docspec-dump.jsonl`
`16`	`16`
`17`	`17`	`# Create docpec dump from the right version of the apify-shared package`