Edit on GitHub

pdoc.docstrings

This module handles the conversion of docstring flavors to Markdown.

The conversion from docstring flavors to Markdown is mostly done with regular expressions. This is not particularly beautiful, but good enough for our purposes. The alternative would be to depend on https://github.com/rr-/docstring_parser or a similar project, but that introduces more complexity than we are comfortable with.

If you miss a particular feature for your favorite flavor, contributions are welcome. That being said, please keep the complexity low and make sure that changes are accompanied by matching snapshot tests in test/testdata/.

  1"""
  2This module handles the conversion of docstring flavors to Markdown.
  3
  4The conversion from docstring flavors to Markdown is mostly done with regular expressions.
  5This is not particularly beautiful, but good enough for our purposes.
  6The alternative would be to depend on <https://github.com/rr-/docstring_parser> or a similar project,
  7but that introduces more complexity than we are comfortable with.
  8
  9If you miss a particular feature for your favorite flavor, contributions are welcome.
 10That being said, please keep the complexity low and make sure that changes are
 11accompanied by matching snapshot tests in `test/testdata/`.
 12"""
 13
 14from __future__ import annotations
 15
 16import base64
 17from functools import cache
 18import inspect
 19import mimetypes
 20import os
 21from pathlib import Path
 22import re
 23from textwrap import dedent
 24from textwrap import indent
 25import warnings
 26
# Tuple of exception classes, used directly in the `except` clause of `convert()`.
AnyException = (SystemExit, GeneratorExit, Exception)
"""BaseException, but excluding KeyboardInterrupt.

Modules may raise SystemExit on import (which we want to catch),
but we don't want to catch a user's KeyboardInterrupt.
"""
 33
 34
 35@cache
 36def convert(docstring: str, docformat: str, source_file: Path | None) -> str:
 37    """
 38    Convert `docstring` from `docformat` to Markdown.
 39    """
 40    docformat = docformat.lower()
 41
 42    try:
 43        if any(x in docformat for x in ["google", "numpy", "restructuredtext"]):
 44            docstring = rst(docstring, source_file)
 45
 46        if "google" in docformat:
 47            docstring = google(docstring)
 48
 49        if "numpy" in docformat:
 50            docstring = numpy(docstring)
 51
 52        if source_file is not None and os.environ.get("PDOC_EMBED_IMAGES") != "0":
 53            docstring = embed_images(docstring, source_file)
 54
 55    except AnyException as e:
 56        raise RuntimeError(
 57            'Docstring processing failed for docstring=\n"""\n'
 58            + docstring
 59            + f'\n"""\n{source_file=}\n{docformat=}'
 60        ) from e
 61
 62    return docstring
 63
 64
 65def embed_images(docstring: str, source_file: Path) -> str:
 66    def local_image_to_data_uri(href: str) -> str:
 67        image_path = source_file.parent / href
 68        image_data = image_path.read_bytes()
 69        image_mime = mimetypes.guess_type(image_path)[0]
 70        image_data_b64 = base64.b64encode(image_data).decode()
 71        return f"data:{image_mime};base64,{image_data_b64}"
 72
 73    def embed_local_image(m: re.Match) -> str:
 74        try:
 75            href = local_image_to_data_uri(m["href"])
 76        except Exception:
 77            return m[0]
 78        else:
 79            return m["before"] + href + m["after"]
 80
 81    # TODO: Could probably do more here, e.g. support rST replacements.
 82    for regex in [
 83        r"(?P<before>!\[\s*.*?\s*]\(\s*)(?P<href>.+?)(?P<after>\s*\))",
 84        r"""(?P<before>src=['"])(?P<href>.+?)(?P<after>['"])""",
 85    ]:
 86        docstring = re.sub(regex, embed_local_image, docstring)
 87    return docstring
 88
 89
 90def google(docstring: str) -> str:
 91    """Convert Google-style docstring sections into Markdown."""
 92    return re.sub(
 93        r"""
 94        ^(?P<name>[A-Z][A-Z a-z]+):\n
 95        (?P<contents>(
 96            \n        # empty lines
 97            |         # or
 98            [ \t]+.+  # lines with indentation
 99        )+)$
100        """,
101        _google_section,
102        docstring,
103        flags=re.VERBOSE | re.MULTILINE,
104    )
105
106
# Section names that `_google_section` renders as a Markdown bullet list.
GOOGLE_LIST_SECTIONS = ["Args", "Raises", "Attributes"]
"""Section headers listed in the official Google docstring style guide."""

# Maps each accepted alias to the list-section name it is treated as.
GOOGLE_LIST_SECTION_ALIASES = {
    "Parameters": "Args",
    "Params": "Args",
    "Arguments": "Args",
}
"""
Alternative section headers that are not listed in the official Google
docstring style guide but that we recognize as sections containing lists
nevertheless.
"""
120
121
def _google_section(m: re.Match[str]) -> str:
    """
    Render one Google-style section (a match produced by `google`) as Markdown.

    Aliases are first mapped to their canonical section name. List sections
    become a bullet list in which the part of each item up to the first colon
    on its first line is set in bold; every other section is emitted as a
    blockquote. The heading "Args" is displayed as "Arguments".
    """
    heading = m.group("name")
    body = dedent(m.group("contents")).lstrip()

    heading = GOOGLE_LIST_SECTION_ALIASES.get(heading, heading)

    if heading in GOOGLE_LIST_SECTIONS:
        bullets = []
        for entry in _indented_list(body):
            # With one capture group and maxsplit=1 the split yields either
            # [entry] (no colon on the first line) or [prefix, "name:", rest].
            parts = re.split(r"^(.+?:)", entry, maxsplit=1)
            if len(parts) == 3:
                _, attr, desc = parts
                bullets.append(f" - **{attr}** " + indent(desc, "   ")[3:])
            else:
                bullets.append(" - " + indent(entry, "   ")[3:])
        body = "".join(bullet + "\n" for bullet in bullets)
    else:
        body = indent(body, "> ", lambda line: True)

    if heading == "Args":
        heading = "Arguments"

    return f"\n###### {heading}:\n{body}\n"
148
149
150def _indented_list(contents: str) -> list[str]:
151    """
152    Convert a list string into individual (dedented) elements. For example,
153
154    foo:
155        desc
156    bar: int
157        more desc
158    baz:
159        desc
160            indented
161
162    returns [
163        "foo:\ndesc",
164        "bar: int\nmore desc",
165        "baz:\ndesc\n    indented",
166    ]
167    """
168    # we expect this to be through cleandoc() already.
169    assert not contents.startswith(" "), contents
170    assert not contents.startswith("\n"), contents
171
172    ret: list[str] = []
173    for line in contents.splitlines(keepends=True):
174        empty = not line.strip()
175        indented = line.startswith(" ")
176        if not (empty or indented):
177            # new section
178            ret.append(line)
179        else:
180            # append to current section
181            ret[-1] += line
182
183    return [inspect.cleandoc(x) for x in ret]
184
185
def numpy(docstring: str) -> str:
    """Convert NumPy-style docstring sections into Markdown.

    A section is a capitalized heading line followed by a line of at least
    three dashes. Parameter-like sections are rendered by `_numpy_parameters`,
    "See Also" by `_numpy_seealso`, and every other section keeps its
    (dedented) content as-is below a Markdown heading.

    See <https://numpydoc.readthedocs.io/en/latest/format.html> for details.
    """
    sections = re.split(
        r"""
        ^([A-Z][A-Za-z ]+)\n  # a heading
        ---+\n+              # followed by a dashed line
        """,
        docstring,
        flags=re.VERBOSE | re.MULTILINE,
    )
    parameter_like_headings = (
        "Parameters",
        "Returns",
        "Yields",
        "Receives",
        "Other Parameters",
        "Raises",
        "Warns",
        "Attributes",
    )
    # A newline that is followed by neither a space nor another newline,
    # i.e. the end of the last indented line before unindented text.
    section_end = re.compile(r"\n(?![ \n])")

    # sections = [preamble, heading1, content1, heading2, content2, ...]
    parts = [sections[0]]
    for heading, content in zip(sections[1::2], sections[2::2]):
        separator = tail = ""
        if content.startswith(" "):
            # If the first line of section content is indented, we consider the section to be
            # finished on the first non-indented line. We take out the rest - the tail - here.
            pieces = section_end.split(content, maxsplit=1)
            if len(pieces) == 2:
                content, tail = pieces
                # The split consumed the newline between content and tail.
                # It is put back below wherever the rendered section does
                # not already end in a line break followed by a blank line,
                # otherwise the tail is glued onto the last content line.
                separator = "\n"

        content = dedent(content)

        if heading in parameter_like_headings:
            # _numpy_parameters always terminates its output with a newline.
            rendered = _numpy_parameters(content)
        elif heading == "See Also":
            rendered = _numpy_seealso(content) + separator
        else:
            rendered = content + separator
        parts.append(f"###### {heading}\n{rendered}{tail}")
    return "".join(parts)
227
228
def _numpy_seealso(content: str) -> str:
    """
    Convert a NumPy-style "See Also" section into Markdown.

    Each item of the section becomes one line that ends in a Markdown hard
    line break. The part of the item before the first colon is a list of
    names separated by whitespace and/or commas; every name is wrapped in a
    code span and the names are joined with ", ". Whatever follows the colon
    is kept as the description.
    """
    lines = []
    for item in _indented_list(content):
        funcstr, colon, desc = item.partition(":")
        if colon:
            desc = f": {desc}"

        # Split on commas as well as whitespace: numpydoc writes related
        # functions as "func_a, func_b", and the comma must not end up
        # inside the code span.
        names = [name for name in re.split(r"[\s,]+", funcstr) if name]
        funcs = ", ".join(f"`{name}`" for name in names)
        lines.append(f"{funcs}{desc}  \n")
    return "".join(lines)
243
244
def _numpy_parameters(content: str) -> str:
    """
    Convert a NumPy-style parameter section into a Markdown bullet list.

    An item whose first line has the form "name : type" is rendered as the
    bold name, the type in parentheses, and the remaining text indented
    below. Any other item uses its first line as the bold name and the rest,
    if present, as an inline description. The result ends with a newline.
    """
    entries = []
    for item in _indented_list(content):
        typed = re.match(r"^(.+):(.+)([\s\S]*)", item)
        if typed:
            name, annotation, desc = (part.strip() for part in typed.groups())
            entries.append(
                f" - **{name}** ({annotation}):\n" f"{indent(desc, '   ')}\n"
            )
            continue

        # No "name : type" on the first line: first line is the name,
        # everything after it (possibly nothing) is the description.
        name, _, desc = item.partition("\n")
        name, desc = name.strip(), desc.strip()
        entries.append(f" - **{name}**: {desc}\n" if desc else f" - **{name}**\n")
    return "".join(entries) + "\n"
268
269
def rst(contents: str, source_file: Path | None) -> str:
    """
    Convert reStructuredText elements to Markdown.

    The passes run in this order: admonitions/directives, hyperlinks, code
    references, inline math, footnotes, field lists. Only the most common
    elements are handled; mirroring the full spec is not a goal.
    """

    def code_reference(m):
        _, role, target = m.groups()
        # Callables get trailing parentheses.
        call_suffix = "()" if role in ("meth", "func") else ""
        return f"`{target}{call_suffix}`"

    contents = _rst_admonitions(contents, source_file)
    contents = _rst_links(contents)

    # Code References: :obj:`foo` -> `foo`
    contents = re.sub(
        r"(:py)?:(mod|func|data|const|class|meth|attr|exc|obj):`([^`]+)`",
        code_reference,
        contents,
    )

    # Math: :math:`foo` -> \\( foo \\)
    # We don't use $ as that's not enabled by MathJax by default.
    contents = re.sub(r":math:`(.+?)`", r"\\\\( \1 \\\\)", contents)

    contents = _rst_footnotes(contents)
    return _rst_fields(contents)
301
302
303def _rst_footnotes(contents: str) -> str:
304    """Convert reStructuredText footnotes"""
305    footnotes: set[str] = set()
306    autonum: int
307
308    def register_footnote(m: re.Match[str]) -> str:
309        nonlocal autonum
310        fn_id = m.group("id")
311        if fn_id in "*#":
312            fn_id = f"fn-{autonum}"
313            autonum += 1
314        fn_id = fn_id.lstrip("#*")
315        footnotes.add(fn_id)
316        content = indent(m.group("content"), "   ").lstrip()
317        return f"{m.group('indent')}[^{fn_id}]: {content}"
318
319    # Register footnotes
320    autonum = 1
321    contents = re.sub(
322        r"""
323            ^(?P<indent>[ ]*)\.\.[ ]+\[(?P<id>\d+|[#*]\w*)](?P<content>.*
324            (
325                \n                 # empty lines
326                |                  # or
327                (?P=indent)[ ]+.+  # lines with indentation
328            )*)$
329            """,
330        register_footnote,
331        contents,
332        flags=re.MULTILINE | re.VERBOSE,
333    )
334
335    def replace_references(m: re.Match[str]) -> str:
336        nonlocal autonum
337        fn_id = m.group("id")
338        if fn_id in "*#":
339            fn_id = f"fn-{autonum}"
340            autonum += 1
341        fn_id = fn_id.lstrip("#*")
342        if fn_id in footnotes:
343            return f"[^{fn_id}]"
344        else:
345            return m.group(0)
346
347    autonum = 1
348    contents = re.sub(r"\[(?P<id>\d+|[#*]\w*)]_", replace_references, contents)
349    return contents
350
351
352def _rst_links(contents: str) -> str:
353    """Convert reStructuredText hyperlinks"""
354    links = {}
355
356    def register_link(m: re.Match[str]) -> str:
357        refid = re.sub(r"\s", "", m.group("id").lower())
358        links[refid] = m.group("url")
359        return ""
360
361    def replace_link(m: re.Match[str]) -> str:
362        text = m.group("id")
363        refid = re.sub(r"[\s`]", "", text.lower())
364        try:
365            return f"[{text.strip('`')}]({links[refid]})"
366        except KeyError:
367            return m.group(0)
368
369    # Embedded URIs
370    contents = re.sub(
371        r"`(?P<text>[^`]+)<(?P<url>.+?)>`_", r"[\g<text>](\g<url>)", contents
372    )
373    # External Hyperlink Targets
374    contents = re.sub(
375        r"^\s*..\s+_(?P<id>[^\n:]+):\s*(?P<url>http\S+)",
376        register_link,
377        contents,
378        flags=re.MULTILINE,
379    )
380    contents = re.sub(r"(?P<id>[A-Za-z0-9_\-.:+]|`[^`]+`)_", replace_link, contents)
381    return contents
382
383
384def _rst_extract_options(contents: str) -> tuple[str, dict[str, str]]:
385    """
386    Extract options from the beginning of reStructuredText directives.
387
388    Return the trimmed content and a dict of options.
389    """
390    options = {}
391    while match := re.match(r"^\s*:(.+?):(.*)([\s\S]*)", contents):
392        key, value, contents = match.groups()
393        options[key] = value.strip()
394
395    return contents, options
396
397
398def _rst_include_trim(contents: str, options: dict[str, str]) -> str:
399    """
400    <https://docutils.sourceforge.io/docs/ref/rst/directives.html#include-options>
401    """
402    if "end-line" in options or "start-line" in options:
403        lines = contents.splitlines()
404        if i := options.get("end-line"):
405            lines = lines[: int(i)]
406        if i := options.get("start-line"):
407            lines = lines[int(i) :]
408        contents = "\n".join(lines)
409    if x := options.get("end-before"):
410        contents = contents[: contents.index(x)]
411    if x := options.get("start-after"):
412        contents = contents[contents.index(x) + len(x) :]
413    return contents
414
415
416def _rst_admonitions(contents: str, source_file: Path | None) -> str:
417    """
418    Convert reStructuredText admonitions - a bit tricky because they may already be indented themselves.
419    <https://www.sphinx-doc.org/en/master/usage/restructuredtext/directives.html>
420    """
421
422    def _rst_admonition(m: re.Match[str]) -> str:
423        ind = m.group("indent")
424        type = m.group("type")
425        val = m.group("val").strip()
426        contents = dedent(m.group("contents")).strip()
427        contents, options = _rst_extract_options(contents)
428
429        if type == "include":
430            loc = source_file or Path(".")
431            try:
432                included = (loc.parent / val).read_text("utf8", "replace")
433            except OSError as e:
434                warnings.warn(f"Cannot include {val!r}: {e}")
435                included = "\n"
436            try:
437                included = _rst_include_trim(included, options) + "\n"
438            except ValueError as e:
439                warnings.warn(f"Failed to process include options for {val!r}: {e}")
440            included = _rst_admonitions(included, loc.parent / val)
441            included = embed_images(included, loc.parent / val)
442            return indent(included, ind)
443        if type == "math":
444            return f"{ind}$${val}{contents}$$\n"
445        if type in ("note", "warning", "danger"):
446            if val:
447                heading = f"{ind}###### {val}\n"
448            else:
449                heading = ""
450            return (
451                f'{ind}<div class="alert {type}" markdown="1">\n'
452                f"{heading}"
453                f"{indent(contents, ind)}\n"
454                f"{ind}</div>\n"
455            )
456        if type == "code-block":
457            return f"{ind}```{val}\n{contents}\n```\n"
458        if type == "versionadded":
459            text = f"New in version {val}"
460        elif type == "versionchanged":
461            text = f"Changed in version {val}"
462        elif type == "deprecated":
463            text = f"Deprecated since version {val}"
464        else:
465            text = f"{type} {val}".strip()
466
467        if contents:
468            text = f"{ind}*{text}:*\n{indent(contents, ind)}\n\n"
469        else:
470            text = f"{ind}*{text}.*\n"
471
472        return text
473
474    admonition = "note|warning|danger|versionadded|versionchanged|deprecated|seealso|math|include|code-block"
475    return re.sub(
476        rf"""
477            ^(?P<indent>[ ]*)\.\.[ ]+(?P<type>{admonition})::(?P<val>.*)
478            (?P<contents>(
479                \n                 # empty lines
480                |                  # or
481                (?P=indent)[ ]+.+  # lines with indentation
482            )*)$
483        """,
484        _rst_admonition,
485        contents,
486        flags=re.MULTILINE | re.VERBOSE,
487    )
488
489
490def _rst_fields(contents: str) -> str:
491    """
492    Convert reStructuredText fields to Markdown.
493    <https://www.sphinx-doc.org/en/master/usage/restructuredtext/basics.html#rst-field-lists>
494    """
495
496    _has_parameter_section = False
497    _has_raises_section = False
498
499    def _rst_field(m: re.Match[str]) -> str:
500        type = m["type"]
501        body = m["body"]
502
503        if m["name"]:
504            name = f"**{m['name'].strip()}**: "
505        else:
506            name = ""
507
508        if type == "param":
509            nonlocal _has_parameter_section
510            text = f" - {name}{body}"
511            if not _has_parameter_section:
512                _has_parameter_section = True
513                text = "\n###### Parameters\n" + text
514            return text
515        elif type == "type":
516            return ""  # we expect users to use modern type annotations.
517        elif type == "return":
518            body = indent(body, "> ", lambda line: True)
519            return f"\n###### Returns\n{body}"
520        elif type == "rtype":
521            return ""  # we expect users to use modern type annotations.
522        elif type == "raises":
523            nonlocal _has_raises_section
524            text = f" - {name}{body}"
525            if not _has_raises_section:
526                _has_raises_section = True
527                text = "\n###### Raises\n" + text
528            return text
529        else:  # pragma: no cover
530            raise AssertionError("unreachable")
531
532    field = "param|type|return|rtype|raises"
533    return re.sub(
534        rf"""
535            ^:(?P<type>{field})(?:[ ]+(?P<name>.+))?:
536            (?P<body>.*(
537                (?:\n[ ]*)*  # maybe some empty lines followed by
538                [ ]+.+       # lines with indentation
539            )*(?:\n|$))
540        """,
541        _rst_field,
542        contents,
543        flags=re.MULTILINE | re.VERBOSE,
544    )
AnyException = (<class 'SystemExit'>, <class 'GeneratorExit'>, <class 'Exception'>)

BaseException, but excluding KeyboardInterrupt.

Modules may raise SystemExit on import (which we want to catch), but we don't want to catch a user's KeyboardInterrupt.

@cache
def convert(docstring: str, docformat: str, source_file: pathlib.Path | None) -> str:
36@cache
37def convert(docstring: str, docformat: str, source_file: Path | None) -> str:
38    """
39    Convert `docstring` from `docformat` to Markdown.
40    """
41    docformat = docformat.lower()
42
43    try:
44        if any(x in docformat for x in ["google", "numpy", "restructuredtext"]):
45            docstring = rst(docstring, source_file)
46
47        if "google" in docformat:
48            docstring = google(docstring)
49
50        if "numpy" in docformat:
51            docstring = numpy(docstring)
52
53        if source_file is not None and os.environ.get("PDOC_EMBED_IMAGES") != "0":
54            docstring = embed_images(docstring, source_file)
55
56    except AnyException as e:
57        raise RuntimeError(
58            'Docstring processing failed for docstring=\n"""\n'
59            + docstring
60            + f'\n"""\n{source_file=}\n{docformat=}'
61        ) from e
62
63    return docstring

Convert `docstring` from `docformat` to Markdown.

def embed_images(docstring: str, source_file: pathlib.Path) -> str:
def embed_images(docstring: str, source_file: Path) -> str:
    """
    Inline local files referenced in `docstring` as base64 `data:` URIs.

    Two kinds of references are rewritten: Markdown images (`![alt](href)`)
    and `src="href"` / `src='href'` attributes. `href` is resolved relative
    to the directory that contains `source_file`.

    Embedding is best-effort: a reference is left exactly as written when the
    file cannot be read (e.g. it is a remote URL or does not exist) or when
    no MIME type can be guessed for it.
    """
    base_dir = source_file.parent

    def local_image_to_data_uri(href: str) -> str:
        image_path = base_dir / href
        image_mime = mimetypes.guess_type(image_path)[0]
        if image_mime is None:
            # Without a media type we would emit "data:None;base64,...",
            # which is not a usable URI. Bail out so the caller keeps the
            # original reference instead.
            raise ValueError(f"Cannot determine MIME type of {href!r}")
        image_data_b64 = base64.b64encode(image_path.read_bytes()).decode()
        return f"data:{image_mime};base64,{image_data_b64}"

    def embed_local_image(m: re.Match) -> str:
        try:
            href = local_image_to_data_uri(m["href"])
        except Exception:
            # Deliberately broad: anything that prevents embedding simply
            # leaves the reference untouched.
            return m[0]
        else:
            return m["before"] + href + m["after"]

    # TODO: Could probably do more here, e.g. support rST replacements.
    for regex in [
        r"(?P<before>!\[\s*.*?\s*]\(\s*)(?P<href>.+?)(?P<after>\s*\))",
        r"""(?P<before>src=['"])(?P<href>.+?)(?P<after>['"])""",
    ]:
        docstring = re.sub(regex, embed_local_image, docstring)
    return docstring
def google(docstring: str) -> str:
 91def google(docstring: str) -> str:
 92    """Convert Google-style docstring sections into Markdown."""
 93    return re.sub(
 94        r"""
 95        ^(?P<name>[A-Z][A-Z a-z]+):\n
 96        (?P<contents>(
 97            \n        # empty lines
 98            |         # or
 99            [ \t]+.+  # lines with indentation
100        )+)$
101        """,
102        _google_section,
103        docstring,
104        flags=re.VERBOSE | re.MULTILINE,
105    )

Convert Google-style docstring sections into Markdown.

GOOGLE_LIST_SECTIONS = ['Args', 'Raises', 'Attributes']

Section headers listed in the official Google docstring style guide.

GOOGLE_LIST_SECTION_ALIASES = {'Parameters': 'Args', 'Params': 'Args', 'Arguments': 'Args'}

Alternative section headers that are not listed in the official Google docstring style guide but that we recognize as sections containing lists nevertheless.

def numpy(docstring: str) -> str:
def numpy(docstring: str) -> str:
    """Convert NumPy-style docstring sections into Markdown.

    A section is a capitalized heading line followed by a line of at least
    three dashes. Parameter-like sections are rendered by `_numpy_parameters`,
    "See Also" by `_numpy_seealso`, and every other section keeps its
    (dedented) content as-is below a Markdown heading.

    See <https://numpydoc.readthedocs.io/en/latest/format.html> for details.
    """
    sections = re.split(
        r"""
        ^([A-Z][A-Za-z ]+)\n  # a heading
        ---+\n+              # followed by a dashed line
        """,
        docstring,
        flags=re.VERBOSE | re.MULTILINE,
    )
    parameter_like_headings = (
        "Parameters",
        "Returns",
        "Yields",
        "Receives",
        "Other Parameters",
        "Raises",
        "Warns",
        "Attributes",
    )
    # A newline that is followed by neither a space nor another newline,
    # i.e. the end of the last indented line before unindented text.
    section_end = re.compile(r"\n(?![ \n])")

    # sections = [preamble, heading1, content1, heading2, content2, ...]
    parts = [sections[0]]
    for heading, content in zip(sections[1::2], sections[2::2]):
        separator = tail = ""
        if content.startswith(" "):
            # If the first line of section content is indented, we consider the section to be
            # finished on the first non-indented line. We take out the rest - the tail - here.
            pieces = section_end.split(content, maxsplit=1)
            if len(pieces) == 2:
                content, tail = pieces
                # The split consumed the newline between content and tail.
                # It is put back below wherever the rendered section does
                # not already end in a line break followed by a blank line,
                # otherwise the tail is glued onto the last content line.
                separator = "\n"

        content = dedent(content)

        if heading in parameter_like_headings:
            # _numpy_parameters always terminates its output with a newline.
            rendered = _numpy_parameters(content)
        elif heading == "See Also":
            rendered = _numpy_seealso(content) + separator
        else:
            rendered = content + separator
        parts.append(f"###### {heading}\n{rendered}{tail}")
    return "".join(parts)

Convert NumPy-style docstring sections into Markdown.

See https://numpydoc.readthedocs.io/en/latest/format.html for details.

def rst(contents: str, source_file: pathlib.Path | None) -> str:
def rst(contents: str, source_file: Path | None) -> str:
    """
    Convert reStructuredText elements to Markdown.

    The passes run in this order: admonitions/directives, hyperlinks, code
    references, inline math, footnotes, field lists. Only the most common
    elements are handled; mirroring the full spec is not a goal.
    """

    def code_reference(m):
        _, role, target = m.groups()
        # Callables get trailing parentheses.
        call_suffix = "()" if role in ("meth", "func") else ""
        return f"`{target}{call_suffix}`"

    contents = _rst_admonitions(contents, source_file)
    contents = _rst_links(contents)

    # Code References: :obj:`foo` -> `foo`
    contents = re.sub(
        r"(:py)?:(mod|func|data|const|class|meth|attr|exc|obj):`([^`]+)`",
        code_reference,
        contents,
    )

    # Math: :math:`foo` -> \\( foo \\)
    # We don't use $ as that's not enabled by MathJax by default.
    contents = re.sub(r":math:`(.+?)`", r"\\\\( \1 \\\\)", contents)

    contents = _rst_footnotes(contents)
    return _rst_fields(contents)

Convert reStructuredText elements to Markdown. We support the most common elements, but we do not aim to mirror the full complexity of the spec here.