Edit on GitHub

pdoc.docstrings

This module handles the conversion of docstring flavors to Markdown.

The conversion from docstring flavors to Markdown is mostly done with regular expressions. This is not particularly beautiful, but good enough for our purposes. The alternative would be to depend on https://github.com/rr-/docstring_parser or a similar project, but that introduces more complexity than we are comfortable with.

If you miss a particular feature for your favorite flavor, contributions are welcome. That being said, please keep the complexity low and make sure that changes are accompanied by matching snapshot tests in test/testdata/.

  1"""
  2This module handles the conversion of docstring flavors to Markdown.
  3
  4The conversion from docstring flavors to Markdown is mostly done with regular expressions.
  5This is not particularly beautiful, but good enough for our purposes.
  6The alternative would be to depend on <https://github.com/rr-/docstring_parser> or a similar project,
  7but that introduces more complexity than we are comfortable with.
  8
  9If you miss a particular feature for your favorite flavor, contributions are welcome.
 10That being said, please keep the complexity low and make sure that changes are
 11accompanied by matching snapshot tests in `test/testdata/`.
 12"""
 13
 14from __future__ import annotations
 15
 16import base64
 17from functools import cache
 18import inspect
 19import mimetypes
 20import os
 21from pathlib import Path
 22import re
 23from textwrap import dedent
 24from textwrap import indent
 25import warnings
 26
 27AnyException = (SystemExit, GeneratorExit, Exception)
 28"""BaseException, but excluding KeyboardInterrupt.
 29
 30Modules may raise SystemExit on import (which we want to catch),
 31but we don't want to catch a user's KeyboardInterrupt.
 32"""
 33
 34
 35@cache
 36def convert(docstring: str, docformat: str, source_file: Path | None) -> str:
 37    """
 38    Convert `docstring` from `docformat` to Markdown.
 39    """
 40    docformat = docformat.lower()
 41
 42    try:
 43        if any(x in docformat for x in ["google", "numpy", "restructuredtext"]):
 44            docstring = rst(docstring, source_file)
 45
 46        if "google" in docformat:
 47            docstring = google(docstring)
 48
 49        if "numpy" in docformat:
 50            docstring = numpy(docstring)
 51
 52        if source_file is not None and os.environ.get("PDOC_EMBED_IMAGES") != "0":
 53            docstring = embed_images(docstring, source_file)
 54
 55    except AnyException as e:
 56        raise RuntimeError(
 57            'Docstring processing failed for docstring=\n"""\n'
 58            + docstring
 59            + f'\n"""\n{source_file=}\n{docformat=}'
 60        ) from e
 61
 62    return docstring
 63
 64
 65def embed_images(docstring: str, source_file: Path) -> str:
 66    def local_image_to_data_uri(href: str) -> str:
 67        image_path = source_file.parent / href
 68        image_data = image_path.read_bytes()
 69        image_mime = mimetypes.guess_type(image_path)[0]
 70        image_data_b64 = base64.b64encode(image_data).decode()
 71        return f"data:{image_mime};base64,{image_data_b64}"
 72
 73    def embed_local_image(m: re.Match) -> str:
 74        try:
 75            href = local_image_to_data_uri(m["href"])
 76        except Exception:
 77            return m[0]
 78        else:
 79            return m["before"] + href + m["after"]
 80
 81    # TODO: Could probably do more here, e.g. support rST replacements.
 82    for regex in [
 83        r"(?P<before>!\[\s*.*?\s*]\(\s*)(?P<href>.+?)(?P<after>\s*\))",
 84        r"""(?P<before>src=['"])(?P<href>.+?)(?P<after>['"])""",
 85    ]:
 86        docstring = re.sub(regex, embed_local_image, docstring)
 87    return docstring
 88
 89
 90def google(docstring: str) -> str:
 91    """Convert Google-style docstring sections into Markdown."""
 92    return re.sub(
 93        r"""
 94        ^(?P<name>[A-Z][A-Z a-z]+):\n
 95        (?P<contents>(
 96            \n        # empty lines
 97            |         # or
 98            [ \t]+.+  # lines with indentation
 99        )+)$
100        """,
101        _google_section,
102        docstring,
103        flags=re.VERBOSE | re.MULTILINE,
104    )
105
106
GOOGLE_LIST_SECTIONS = [
    "Args",
    "Raises",
    "Attributes",
    "Keyword Args",
]
"""Section headers listed in the official Google docstring style guide."""

GOOGLE_LIST_SECTION_ALIASES = {
    "Parameters": "Args",
    "Params": "Args",
    "Arguments": "Args",
    "Raise": "Raises",
    "Keyword Arguments": "Keyword Args",
}
"""
Alternative section headers mapped to their canonical name.

They are not listed in the official Google docstring style guide,
but we recognize them as sections containing lists nevertheless.
"""
122
123
def _google_section(m: re.Match[str]) -> str:
    """
    Render a single Google-style section (a match of the pattern in `google`).

    List sections (see `GOOGLE_LIST_SECTIONS` and its aliases) become a Markdown
    bullet list with the part up to the first colon in bold; every other section
    becomes a blockquote. "Args" is displayed as "Arguments".
    """
    matched_name = m.group("name")
    heading = GOOGLE_LIST_SECTION_ALIASES.get(matched_name, matched_name)
    body = dedent(m.group("contents")).lstrip()

    if heading in GOOGLE_LIST_SECTIONS:
        bullets = []
        for entry in _indented_list(body):
            # first ":" on the first line
            label = re.match(r"(.+?:)", entry)
            if label is None:
                bullet = " - " + indent(entry, "   ")[3:]
            else:
                attr = label.group(1)
                desc = entry[label.end() :]
                bullet = f" - **{attr}** " + indent(desc, "   ")[3:]
            bullets.append(bullet + "\n")
        body = "".join(bullets)
    else:
        # Prefix every line, including empty ones, to keep one contiguous quote.
        body = indent(body, "> ", lambda line: True)

    if heading == "Args":
        heading = "Arguments"

    return f"\n###### {heading}:\n{body}\n"
150
151
152def _indented_list(contents: str) -> list[str]:
153    """
154    Convert a list string into individual (dedented) elements. For example,
155
156    foo:
157        desc
158    bar: int
159        more desc
160    baz:
161        desc
162            indented
163
164    returns [
165        "foo:\ndesc",
166        "bar: int\nmore desc",
167        "baz:\ndesc\n    indented",
168    ]
169    """
170    # we expect this to be through cleandoc() already.
171    assert not contents.startswith(" "), contents
172    assert not contents.startswith("\n"), contents
173
174    ret: list[str] = []
175    for line in contents.splitlines(keepends=True):
176        empty = not line.strip()
177        indented = line.startswith(" ")
178        if not (empty or indented):
179            # new section
180            ret.append(line)
181        else:
182            # append to current section
183            ret[-1] += line
184
185    return [inspect.cleandoc(x) for x in ret]
186
187
def numpy(docstring: str) -> str:
    """Convert NumPy-style docstring sections into Markdown.

    A section is a capitalized heading underlined with at least three dashes.
    Parameter-like sections and "See Also" get dedicated list rendering,
    every other section only has its heading converted.

    See <https://numpydoc.readthedocs.io/en/latest/format.html> for details.
    """
    sections = re.split(
        r"""
        ^([A-Z][A-Za-z ]+)\n  # a heading
        ---+\n+              # followed by a dashed line
        """,
        docstring,
        flags=re.VERBOSE | re.MULTILINE,
    )
    # re.split() with one capture group yields [preamble, heading, content, heading, content, ...]
    contents = sections[0]
    for heading, content in zip(sections[1::2], sections[2::2]):
        if content.startswith(" ") and re.search(r"\n(?![ \n])", content):
            # If the first line of section content is indented, we consider the section to be finished
            # on the first non-indented line. We take out the rest - the tail - here.
            content, tail = re.split(r"\n(?![ \n])", content, maxsplit=1)
            # The split consumed the newline that separated content and tail.
            # Put it back, otherwise the last content line and the tail get glued together.
            content += "\n"
        else:
            tail = ""

        content = dedent(content)

        if heading in (
            "Parameters",
            "Returns",
            "Yields",
            "Receives",
            "Other Parameters",
            "Raises",
            "Warns",
            "Attributes",
        ):
            contents += f"###### {heading}\n{_numpy_parameters(content)}"
        elif heading == "See Also":
            contents += f"###### {heading}\n{_numpy_seealso(content)}"
        else:
            contents += f"###### {heading}\n{content}"
        contents += tail
    return contents
229
230
def _numpy_seealso(content: str) -> str:
    """
    Convert a NumPy-style "See Also" section into Markdown.

    Every entry is a list of names, optionally followed by `: description`.
    Each name is rendered as inline code, and every entry ends with a
    Markdown hard line break (two trailing spaces).
    """
    rendered = []
    for item in _indented_list(content):
        if ":" in item:
            funcstr, desc = item.split(":", maxsplit=1)
            desc = f": {desc}"
        else:
            funcstr, desc = item, ""

        # numpydoc separates names with commas ("func_b, func_c"). Split on commas
        # as well as whitespace so that the comma does not end up inside the code span.
        funclist = [f for f in re.split(r"[,\s]+", funcstr) if f]
        funcs = ", ".join(f"`{f}`" for f in funclist)
        rendered.append(f"{funcs}{desc}  \n")
    return "".join(rendered)
245
246
def _numpy_parameters(content: str) -> str:
    """
    Convert a NumPy-style parameter section into Markdown.

    Entries of the form `name : type` followed by a description are rendered as
    ` - **name** (type):` with the description indented below. Entries without
    a colon on their first line are rendered as ` - **name**: description`.
    """
    bullets = []
    for entry in _indented_list(content):
        typed = re.match(r"^(.+):(.+)([\s\S]*)", entry)
        if typed is not None:
            name, annotation, description = (part.strip() for part in typed.groups())
            bullets.append(
                f" - **{name}** ({annotation}):\n"
                f"{indent(description, '   ')}\n"
            )
            continue

        # No "name : type" header: the first line is the name, the rest the description.
        first_line, _, remainder = entry.partition("\n")
        name = first_line.strip()
        description = remainder.strip()
        if description:
            bullets.append(f" - **{name}**: {description}\n")
        else:
            bullets.append(f" - **{name}**\n")
    return "".join(bullets) + "\n"
270
271
def rst(contents: str, source_file: Path | None) -> str:
    """
    Convert reStructuredText elements to Markdown.
    We support the most common elements, but we do not aim to mirror the full complexity of the spec here.

    The passes run in a fixed order: admonitions/directives, hyperlinks,
    code references, inline math, footnotes and finally field lists.
    """

    def replace_reference(m):
        kind, name = m.group(2), m.group(3)
        # Callables get trailing parentheses.
        suffix = "()" if kind in ("meth", "func") else ""
        return f"`{name}{suffix}`"

    contents = _rst_links(_rst_admonitions(contents, source_file))

    # Code References: :obj:`foo` -> `foo`
    contents = re.sub(
        r"(:py)?:(mod|func|data|const|class|meth|attr|exc|obj):`([^`]+)`",
        replace_reference,
        contents,
    )

    # Math: :math:`foo` -> \\( foo \\)
    # We don't use $ as that's not enabled by MathJax by default.
    contents = re.sub(r":math:`(.+?)`", r"\\\\( \1 \\\\)", contents)

    return _rst_fields(_rst_footnotes(contents))
303
304
305def _rst_footnotes(contents: str) -> str:
306    """Convert reStructuredText footnotes"""
307    footnotes: set[str] = set()
308    autonum: int
309
310    def register_footnote(m: re.Match[str]) -> str:
311        nonlocal autonum
312        fn_id = m.group("id")
313        if fn_id in "*#":
314            fn_id = f"fn-{autonum}"
315            autonum += 1
316        fn_id = fn_id.lstrip("#*")
317        footnotes.add(fn_id)
318        content = indent(m.group("content"), "   ").lstrip()
319        return f"{m.group('indent')}[^{fn_id}]: {content}"
320
321    # Register footnotes
322    autonum = 1
323    contents = re.sub(
324        r"""
325            ^(?P<indent>[ ]*)\.\.[ ]+\[(?P<id>\d+|[#*]\w*)](?P<content>.*
326            (
327                \n                 # empty lines
328                |                  # or
329                (?P=indent)[ ]+.+  # lines with indentation
330            )*)$
331            """,
332        register_footnote,
333        contents,
334        flags=re.MULTILINE | re.VERBOSE,
335    )
336
337    def replace_references(m: re.Match[str]) -> str:
338        nonlocal autonum
339        fn_id = m.group("id")
340        if fn_id in "*#":
341            fn_id = f"fn-{autonum}"
342            autonum += 1
343        fn_id = fn_id.lstrip("#*")
344        if fn_id in footnotes:
345            return f"[^{fn_id}]"
346        else:
347            return m.group(0)
348
349    autonum = 1
350    contents = re.sub(r"\[(?P<id>\d+|[#*]\w*)]_", replace_references, contents)
351    return contents
352
353
354def _rst_links(contents: str) -> str:
355    """Convert reStructuredText hyperlinks"""
356    links = {}
357
358    def register_link(m: re.Match[str]) -> str:
359        refid = re.sub(r"\s", "", m.group("id").lower())
360        links[refid] = m.group("url")
361        return ""
362
363    def replace_link(m: re.Match[str]) -> str:
364        text = m.group("id")
365        refid = re.sub(r"[\s`]", "", text.lower())
366        try:
367            return f"[{text.strip('`')}]({links[refid]})"
368        except KeyError:
369            return m.group(0)
370
371    # Embedded URIs
372    contents = re.sub(
373        r"`(?P<text>[^`]+)<(?P<url>.+?)>`_", r"[\g<text>](\g<url>)", contents
374    )
375    # External Hyperlink Targets
376    contents = re.sub(
377        r"^\s*..\s+_(?P<id>[^\n:]+):\s*(?P<url>http\S+)",
378        register_link,
379        contents,
380        flags=re.MULTILINE,
381    )
382    contents = re.sub(r"(?P<id>[A-Za-z0-9_\-.:+]|`[^`]+`)_", replace_link, contents)
383    return contents
384
385
386def _rst_extract_options(contents: str) -> tuple[str, dict[str, str]]:
387    """
388    Extract options from the beginning of reStructuredText directives.
389
390    Return the trimmed content and a dict of options.
391    """
392    options = {}
393    while match := re.match(r"^\s*:(.+?):(.*)([\s\S]*)", contents):
394        key, value, contents = match.groups()
395        options[key] = value.strip()
396
397    return contents, options
398
399
400def _rst_include_trim(contents: str, options: dict[str, str]) -> str:
401    """
402    <https://docutils.sourceforge.io/docs/ref/rst/directives.html#include-options>
403    """
404    if "end-line" in options or "start-line" in options:
405        lines = contents.splitlines()
406        if i := options.get("end-line"):
407            lines = lines[: int(i)]
408        if i := options.get("start-line"):
409            lines = lines[int(i) :]
410        contents = "\n".join(lines)
411    if x := options.get("end-before"):
412        contents = contents[: contents.index(x)]
413    if x := options.get("start-after"):
414        contents = contents[contents.index(x) + len(x) :]
415    return contents
416
417
418def _rst_admonitions(contents: str, source_file: Path | None) -> str:
419    """
420    Convert reStructuredText admonitions - a bit tricky because they may already be indented themselves.
421    <https://www.sphinx-doc.org/en/master/usage/restructuredtext/directives.html>
422    """
423
424    def _rst_admonition(m: re.Match[str]) -> str:
425        ind = m.group("indent")
426        type = m.group("type")
427        val = m.group("val").strip()
428        contents = dedent(m.group("contents")).strip()
429        contents, options = _rst_extract_options(contents)
430
431        if type == "include":
432            loc = source_file or Path(".")
433            try:
434                included = (loc.parent / val).read_text("utf8", "replace")
435            except OSError as e:
436                warnings.warn(f"Cannot include {val!r}: {e}")
437                included = "\n"
438            try:
439                included = _rst_include_trim(included, options) + "\n"
440            except ValueError as e:
441                warnings.warn(f"Failed to process include options for {val!r}: {e}")
442            included = _rst_admonitions(included, loc.parent / val)
443            included = embed_images(included, loc.parent / val)
444            return indent(included, ind)
445        if type == "math":
446            return f"{ind}$${val}{contents}$$\n"
447        if type in ("note", "warning", "danger"):
448            if val:
449                heading = f"{ind}###### {val}\n"
450            else:
451                heading = ""
452            return (
453                f'{ind}<div class="alert {type}" markdown="1">\n'
454                f"{heading}"
455                f"{indent(contents, ind)}\n"
456                f"{ind}</div>\n"
457            )
458        if type == "code-block":
459            return f"{ind}```{val}\n{contents}\n```\n"
460        if type == "versionadded":
461            text = f"New in version {val}"
462        elif type == "versionchanged":
463            text = f"Changed in version {val}"
464        elif type == "deprecated":
465            text = f"Deprecated since version {val}"
466        else:
467            text = f"{type} {val}".strip()
468
469        if contents:
470            text = f"{ind}*{text}:*\n{indent(contents, ind)}\n\n"
471        else:
472            text = f"{ind}*{text}.*\n"
473
474        return text
475
476    admonition = "note|warning|danger|versionadded|versionchanged|deprecated|seealso|math|include|code-block"
477    return re.sub(
478        rf"""
479            ^(?P<indent>[ ]*)\.\.[ ]+(?P<type>{admonition})::(?P<val>.*)
480            (?P<contents>(
481                \n                 # empty lines
482                |                  # or
483                (?P=indent)[ ]+.+  # lines with indentation
484            )*)$
485        """,
486        _rst_admonition,
487        contents,
488        flags=re.MULTILINE | re.VERBOSE,
489    )
490
491
492def _rst_fields(contents: str) -> str:
493    """
494    Convert reStructuredText fields to Markdown.
495    <https://www.sphinx-doc.org/en/master/usage/restructuredtext/basics.html#rst-field-lists>
496    """
497
498    _has_parameter_section = False
499    _has_raises_section = False
500
501    def _rst_field(m: re.Match[str]) -> str:
502        type = m["type"]
503        body = m["body"]
504
505        if m["name"]:
506            name = f"**{m['name'].strip()}**: "
507        else:
508            name = ""
509
510        if type == "param":
511            nonlocal _has_parameter_section
512            text = f" - {name}{body}"
513            if not _has_parameter_section:
514                _has_parameter_section = True
515                text = "\n###### Parameters\n" + text
516            return text
517        elif type == "type":
518            return ""  # we expect users to use modern type annotations.
519        elif type == "return":
520            body = indent(body, "> ", lambda line: True)
521            return f"\n###### Returns\n{body}"
522        elif type == "rtype":
523            return ""  # we expect users to use modern type annotations.
524        elif type == "raises":
525            nonlocal _has_raises_section
526            text = f" - {name}{body}"
527            if not _has_raises_section:
528                _has_raises_section = True
529                text = "\n###### Raises\n" + text
530            return text
531        else:  # pragma: no cover
532            raise AssertionError("unreachable")
533
534    field = "param|type|return|rtype|raises"
535    return re.sub(
536        rf"""
537            ^:(?P<type>{field})(?:[ ]+(?P<name>.+))?:
538            (?P<body>.*(
539                (?:\n[ ]*)*  # maybe some empty lines followed by
540                [ ]+.+       # lines with indentation
541            )*(?:\n|$))
542        """,
543        _rst_field,
544        contents,
545        flags=re.MULTILINE | re.VERBOSE,
546    )
AnyException = (<class 'SystemExit'>, <class 'GeneratorExit'>, <class 'Exception'>)

BaseException, but excluding KeyboardInterrupt.

Modules may raise SystemExit on import (which we want to catch), but we don't want to catch a user's KeyboardInterrupt.

@cache
def convert(docstring: str, docformat: str, source_file: pathlib.Path | None) -> str:
@cache
def convert(docstring: str, docformat: str, source_file: Path | None) -> str:
    """
    Convert `docstring` from `docformat` to Markdown.

    `docformat` is matched case-insensitively and by substring: the reStructuredText
    pass runs for "google", "numpy" and "restructuredtext", followed by the
    flavor-specific pass. Local images are embedded afterwards if `source_file` is
    given and the `PDOC_EMBED_IMAGES` environment variable is not set to `"0"`.

    Results are memoized via `functools.cache`.

    Raises:
        RuntimeError: if any conversion step fails; the original error is chained.
    """
    docformat = docformat.lower()
    rst_based_flavors = ("google", "numpy", "restructuredtext")

    try:
        if any(flavor in docformat for flavor in rst_based_flavors):
            docstring = rst(docstring, source_file)
        if "google" in docformat:
            docstring = google(docstring)
        if "numpy" in docformat:
            docstring = numpy(docstring)
        if source_file is not None:
            if os.environ.get("PDOC_EMBED_IMAGES") != "0":
                docstring = embed_images(docstring, source_file)
    except AnyException as e:
        # Include the (possibly partially converted) docstring to ease debugging.
        raise RuntimeError(
            f'Docstring processing failed for docstring=\n"""\n{docstring}\n"""\n'
            f"{source_file=}\n{docformat=}"
        ) from e

    return docstring

Convert docstring from docformat to Markdown.

def embed_images(docstring: str, source_file: pathlib.Path) -> str:
def embed_images(docstring: str, source_file: Path) -> str:
    """
    Replace references to local image files in `docstring` with base64 `data:` URIs.

    Both Markdown images (`![alt](href)`) and HTML-style `src="href"` attributes are
    considered. `href` is resolved relative to the directory of `source_file`.
    References that cannot be read from disk (remote URLs, missing files, ...)
    are left untouched.
    """

    def local_image_to_data_uri(href: str) -> str:
        image_path = source_file.parent / href
        image_data = image_path.read_bytes()
        # guess_type() yields None for unknown extensions; fall back to a generic
        # media type instead of emitting the invalid "data:None;base64,...".
        image_mime = mimetypes.guess_type(image_path)[0] or "application/octet-stream"
        image_data_b64 = base64.b64encode(image_data).decode()
        return f"data:{image_mime};base64,{image_data_b64}"

    def embed_local_image(m: re.Match) -> str:
        try:
            href = local_image_to_data_uri(m["href"])
        except Exception:
            # Deliberately best-effort: anything we cannot read stays as written.
            return m[0]
        else:
            return m["before"] + href + m["after"]

    # TODO: Could probably do more here, e.g. support rST replacements.
    for regex in [
        r"(?P<before>!\[\s*.*?\s*]\(\s*)(?P<href>.+?)(?P<after>\s*\))",
        r"""(?P<before>src=['"])(?P<href>.+?)(?P<after>['"])""",
    ]:
        docstring = re.sub(regex, embed_local_image, docstring)
    return docstring
def google(docstring: str) -> str:
 91def google(docstring: str) -> str:
 92    """Convert Google-style docstring sections into Markdown."""
 93    return re.sub(
 94        r"""
 95        ^(?P<name>[A-Z][A-Z a-z]+):\n
 96        (?P<contents>(
 97            \n        # empty lines
 98            |         # or
 99            [ \t]+.+  # lines with indentation
100        )+)$
101        """,
102        _google_section,
103        docstring,
104        flags=re.VERBOSE | re.MULTILINE,
105    )

Convert Google-style docstring sections into Markdown.

GOOGLE_LIST_SECTIONS = ['Args', 'Raises', 'Attributes', 'Keyword Args']

Section headers listed in the official Google docstring style guide.

GOOGLE_LIST_SECTION_ALIASES = {'Parameters': 'Args', 'Params': 'Args', 'Arguments': 'Args', 'Raise': 'Raises', 'Keyword Arguments': 'Keyword Args'}

Alternative section headers that are not listed in the official Google docstring style guide but that we recognize as sections containing lists nevertheless.

def numpy(docstring: str) -> str:
def numpy(docstring: str) -> str:
    """Convert NumPy-style docstring sections into Markdown.

    A section is a capitalized heading underlined with at least three dashes.
    Parameter-like sections and "See Also" get dedicated list rendering,
    every other section only has its heading converted.

    See <https://numpydoc.readthedocs.io/en/latest/format.html> for details.
    """
    sections = re.split(
        r"""
        ^([A-Z][A-Za-z ]+)\n  # a heading
        ---+\n+              # followed by a dashed line
        """,
        docstring,
        flags=re.VERBOSE | re.MULTILINE,
    )
    # re.split() with one capture group yields [preamble, heading, content, heading, content, ...]
    contents = sections[0]
    for heading, content in zip(sections[1::2], sections[2::2]):
        if content.startswith(" ") and re.search(r"\n(?![ \n])", content):
            # If the first line of section content is indented, we consider the section to be finished
            # on the first non-indented line. We take out the rest - the tail - here.
            content, tail = re.split(r"\n(?![ \n])", content, maxsplit=1)
            # The split consumed the newline that separated content and tail.
            # Put it back, otherwise the last content line and the tail get glued together.
            content += "\n"
        else:
            tail = ""

        content = dedent(content)

        if heading in (
            "Parameters",
            "Returns",
            "Yields",
            "Receives",
            "Other Parameters",
            "Raises",
            "Warns",
            "Attributes",
        ):
            contents += f"###### {heading}\n{_numpy_parameters(content)}"
        elif heading == "See Also":
            contents += f"###### {heading}\n{_numpy_seealso(content)}"
        else:
            contents += f"###### {heading}\n{content}"
        contents += tail
    return contents

Convert NumPy-style docstring sections into Markdown.

See https://numpydoc.readthedocs.io/en/latest/format.html for details.

def rst(contents: str, source_file: pathlib.Path | None) -> str:
def rst(contents: str, source_file: Path | None) -> str:
    """
    Convert reStructuredText elements to Markdown.
    We support the most common elements, but we do not aim to mirror the full complexity of the spec here.

    The passes run in a fixed order: admonitions/directives, hyperlinks,
    code references, inline math, footnotes and finally field lists.
    """

    def replace_reference(m):
        kind, name = m.group(2), m.group(3)
        # Callables get trailing parentheses.
        suffix = "()" if kind in ("meth", "func") else ""
        return f"`{name}{suffix}`"

    contents = _rst_links(_rst_admonitions(contents, source_file))

    # Code References: :obj:`foo` -> `foo`
    contents = re.sub(
        r"(:py)?:(mod|func|data|const|class|meth|attr|exc|obj):`([^`]+)`",
        replace_reference,
        contents,
    )

    # Math: :math:`foo` -> \\( foo \\)
    # We don't use $ as that's not enabled by MathJax by default.
    contents = re.sub(r":math:`(.+?)`", r"\\\\( \1 \\\\)", contents)

    return _rst_fields(_rst_footnotes(contents))

Convert reStructuredText elements to Markdown. We support the most common elements, but we do not aim to mirror the full complexity of the spec here.