Edit on GitHub

pdoc.docstrings

This module handles the conversion of docstring flavors to Markdown.

The conversion from docstring flavors to Markdown is mostly done with regular expressions. This is not particularly beautiful, but good enough for our purposes. The alternative would be to depend on https://github.com/rr-/docstring_parser or a similar project, but that introduces more complexity than we are comfortable with.

If you miss a particular feature for your favorite flavor, contributions are welcome. That being said, please keep the complexity low and make sure that changes are accompanied by matching snapshot tests in test/testdata/.

  1"""
  2This module handles the conversion of docstring flavors to Markdown.
  3
  4The conversion from docstring flavors to Markdown is mostly done with regular expressions.
  5This is not particularly beautiful, but good enough for our purposes.
  6The alternative would be to depend on <https://github.com/rr-/docstring_parser> or a similar project,
  7but that introduces more complexity than we are comfortable with.
  8
  9If you miss a particular feature for your favorite flavor, contributions are welcome.
 10That being said, please keep the complexity low and make sure that changes are
 11accompanied by matching snapshot tests in `test/testdata/`.
 12"""
 13
 14from __future__ import annotations
 15
 16import base64
 17from functools import cache
 18import inspect
 19import mimetypes
 20import os
 21from pathlib import Path
 22import re
 23from textwrap import dedent
 24from textwrap import indent
 25import warnings
 26
 27
 28@cache
 29def convert(docstring: str, docformat: str, source_file: Path | None) -> str:
 30    """
 31    Convert `docstring` from `docformat` to Markdown.
 32    """
 33    docformat = docformat.lower()
 34
 35    if any(x in docformat for x in ["google", "numpy", "restructuredtext"]):
 36        docstring = rst(docstring, source_file)
 37
 38    if "google" in docformat:
 39        docstring = google(docstring)
 40
 41    if "numpy" in docformat:
 42        docstring = numpy(docstring)
 43
 44    if source_file is not None and os.environ.get("PDOC_EMBED_IMAGES") != "0":
 45        docstring = embed_images(docstring, source_file)
 46
 47    return docstring
 48
 49
 50def embed_images(docstring: str, source_file: Path) -> str:
 51    def embed_local_image(m: re.Match) -> str:
 52        image_path = source_file.parent / m["href"]
 53        try:
 54            image_data = image_path.read_bytes()
 55            image_mime = mimetypes.guess_type(image_path)[0]
 56        except Exception:
 57            return m[0]
 58        else:
 59            data = base64.b64encode(image_data).decode()
 60            return f"![{m['alt']}](data:{image_mime};base64,{data})"
 61
 62    return re.sub(
 63        r"!\[\s*(?P<alt>.*?)\s*]\(\s*(?P<href>.+?)\s*\)",
 64        embed_local_image,
 65        docstring,
 66    )
 67    # TODO: Could probably do more here, e.g. support rST or raw HTML replacements.
 68
 69
 70def google(docstring: str) -> str:
 71    """Convert Google-style docstring sections into Markdown."""
 72    return re.sub(
 73        r"""
 74        ^(?P<name>[A-Z][A-Z a-z]+):\n
 75        (?P<contents>(
 76            \n        # empty lines
 77            |         # or
 78            [ \t]+.+  # lines with indentation
 79        )+)$
 80        """,
 81        _google_section,
 82        docstring,
 83        flags=re.VERBOSE | re.MULTILINE,
 84    )
 85
 86
 87GOOGLE_LIST_SECTIONS = ["Args", "Raises", "Attributes"]
 88"""Section headers listed in the official Google docstring style guide."""
 89
 90GOOGLE_LIST_SECTION_ALIASES = {
 91    "Parameters": "Args",
 92    "Params": "Args",
 93    "Arguments": "Args",
 94}
 95"""
 96Alternative section headers that are not listed in the official Google
 97docstring style guide but that we recognize as sections containing lists
 98nevertheless.
 99"""
100
101
def _google_section(m: re.Match[str]) -> str:
    """
    Render a single Google-style section match as Markdown.

    `m` must provide the named groups `name` and `contents`. List sections
    (see `GOOGLE_LIST_SECTIONS` and `GOOGLE_LIST_SECTION_ALIASES`) become a
    bullet list with the part up to the first colon of each item in bold;
    every other section becomes a blockquote. An `Args` section is titled
    `Arguments` in the output.
    """
    heading = m.group("name")
    body = dedent(m.group("contents")).lstrip()

    # Map unofficial spellings onto the official section name.
    heading = GOOGLE_LIST_SECTION_ALIASES.get(heading, heading)

    if heading not in GOOGLE_LIST_SECTIONS:
        body = indent(body, "> ", lambda line: True)
    else:
        bullets = []
        for entry in _indented_list(body):
            # Split at the first ":" on the first line; yields three pieces on success.
            pieces = re.split(r"^(.+?:)", entry, maxsplit=1)
            if len(pieces) == 3:
                _, attr, desc = pieces
                bullets.append(f" - **{attr}** " + indent(desc, "   ")[3:])
            else:
                bullets.append(" - " + indent(entry, "   ")[3:])
        body = "".join(f"{bullet}\n" for bullet in bullets)

    if heading == "Args":
        heading = "Arguments"

    return f"\n###### {heading}:\n{body}\n"
128
129
130def _indented_list(contents: str) -> list[str]:
131    """
132    Convert a list string into individual (dedented) elements. For example,
133
134    foo:
135        desc
136    bar: int
137        more desc
138    baz:
139        desc
140            indented
141
142    returns [
143        "foo:\ndesc",
144        "bar: int\nmore desc",
145        "baz:\ndesc\n    indented",
146    ]
147    """
148    # we expect this to be through cleandoc() already.
149    assert not contents.startswith(" "), contents
150    assert not contents.startswith("\n"), contents
151
152    ret: list[str] = []
153    for line in contents.splitlines(keepends=True):
154        empty = not line.strip()
155        indented = line.startswith(" ")
156        if not (empty or indented):
157            # new section
158            ret.append(line)
159        else:
160            # append to current section
161            ret[-1] += line
162
163    return [inspect.cleandoc(x) for x in ret]
164
165
def numpy(docstring: str) -> str:
    """Convert NumPy-style docstring sections into Markdown.

    A section is a capitalized heading line followed by a line of at least
    three dashes. Parameter-like sections and "See Also" get dedicated
    renderers, every other section is emitted dedented below its heading.

    See <https://numpydoc.readthedocs.io/en/latest/format.html> for details.
    """
    sections = re.split(
        r"""
        ^([A-Z][A-Za-z ]+)\n  # a heading
        ---+\n+              # followed by a dashed line
        """,
        docstring,
        flags=re.VERBOSE | re.MULTILINE,
    )
    # `sections` is [preamble, heading, content, heading, content, ...].
    contents = sections[0]
    for heading, content in zip(sections[1::2], sections[2::2]):
        tail = ""
        if content.startswith(" "):
            # If the first line of section content is indented, we consider the section to be finished
            # on the first non-indented line. We take out the rest - the tail - here.
            # There may be no such line at all (the section runs to the end of the docstring),
            # in which case the split yields a single element and the tail stays empty.
            head, *rest = re.split(r"\n(?![ \n])", content, maxsplit=1)
            if rest:
                tail = rest[0]
            # The list helpers expect content that does not start with indentation.
            content = dedent(head)

        if heading in (
            "Parameters",
            "Returns",
            "Yields",
            "Receives",
            "Other Parameters",
            "Raises",
            "Warns",
            "Attributes",
        ):
            contents += f"###### {heading}\n{_numpy_parameters(content)}"
        elif heading == "See Also":
            contents += f"###### {heading}\n{_numpy_seealso(content)}"
        else:
            contents += f"###### {heading}\n{dedent(content)}"
        contents += tail
    return contents
205
206
def _numpy_seealso(content: str) -> str:
    """
    Convert a NumPy-style "See Also" section into Markdown.

    Each list item becomes one line: the space-separated names before the
    first colon are wrapped in backticks and joined with commas, followed by
    the description (if any). Lines end with two spaces and a newline.
    """
    rendered = []
    for entry in _indented_list(content):
        names, colon, description = entry.partition(":")
        suffix = f": {description}" if colon else ""

        stripped_names = (name.strip() for name in names.split(" "))
        quoted = ", ".join(f"`{name}`" for name in stripped_names if name)
        rendered.append(f"{quoted}{suffix}  \n")
    return "".join(rendered)
221
222
def _numpy_parameters(content: str) -> str:
    """
    Convert a NumPy-style parameter section into Markdown.

    Items of the form `name : type` followed by a description are rendered as
    a bullet with the name in bold, the type in parentheses and the indented
    description below. Items without a colon on their first line are rendered
    as a bold name, optionally followed by the remaining text.
    """
    bullets = []
    for entry in _indented_list(content):
        typed = re.match(r"^(.+):(.+)([\s\S]*)", entry)
        if typed is not None:
            name, annotation, description = (part.strip() for part in typed.groups())
            bullets.append(
                f" - **{name}** ({annotation}):\n{indent(description, '   ')}\n"
            )
            continue

        # No "name : type" header: the first line is the name, the rest the description.
        first_line, _, remainder = entry.partition("\n")
        name = first_line.strip()
        description = remainder.strip()
        bullets.append(
            f" - **{name}**: {description}\n" if description else f" - **{name}**\n"
        )
    return "".join(bullets) + "\n"
246
247
def rst(contents: str, source_file: Path | None) -> str:
    """
    Convert reStructuredText elements to Markdown.
    We support the most common elements, but we do not aim to mirror the full complexity of the spec here.

    The conversions run in a fixed order: admonitions and other directives,
    hyperlinks, code references, inline math, footnotes and finally fields.
    """

    def replace_reference(m: re.Match[str]) -> str:
        # Groups: optional ":py" prefix, the role, the referenced name.
        role, target = m.group(2), m.group(3)
        call_suffix = "()" if role in ("meth", "func") else ""
        return f"`{target}{call_suffix}`"

    contents = _rst_admonitions(contents, source_file)
    contents = _rst_links(contents)

    # Code References: :obj:`foo` -> `foo`
    contents = re.sub(
        r"(:py)?:(mod|func|data|const|class|meth|attr|exc|obj):`([^`]+)`",
        replace_reference,
        contents,
    )

    # Math: :math:`foo` -> \\( foo \\)
    # We don't use $ as that's not enabled by MathJax by default.
    contents = re.sub(r":math:`(.+?)`", r"\\\\( \1 \\\\)", contents)

    return _rst_fields(_rst_footnotes(contents))
279
280
281def _rst_footnotes(contents: str) -> str:
282    """Convert reStructuredText footnotes"""
283    footnotes: set[str] = set()
284    autonum: int
285
286    def register_footnote(m: re.Match[str]) -> str:
287        nonlocal autonum
288        fn_id = m.group("id")
289        if fn_id in "*#":
290            fn_id = f"fn-{autonum}"
291            autonum += 1
292        fn_id = fn_id.lstrip("#*")
293        footnotes.add(fn_id)
294        content = indent(m.group("content"), "   ").lstrip()
295        return f"{m.group('indent')}[^{fn_id}]: {content}"
296
297    # Register footnotes
298    autonum = 1
299    contents = re.sub(
300        r"""
301            ^(?P<indent>[ ]*)\.\.[ ]+\[(?P<id>\d+|[#*]\w*)](?P<content>.*
302            (
303                \n                 # empty lines
304                |                  # or
305                (?P=indent)[ ]+.+  # lines with indentation
306            )*)$
307            """,
308        register_footnote,
309        contents,
310        flags=re.MULTILINE | re.VERBOSE,
311    )
312
313    def replace_references(m: re.Match[str]) -> str:
314        nonlocal autonum
315        fn_id = m.group("id")
316        if fn_id in "*#":
317            fn_id = f"fn-{autonum}"
318            autonum += 1
319        fn_id = fn_id.lstrip("#*")
320        if fn_id in footnotes:
321            return f"[^{fn_id}]"
322        else:
323            return m.group(0)
324
325    autonum = 1
326    contents = re.sub(r"\[(?P<id>\d+|[#*]\w*)]_", replace_references, contents)
327    return contents
328
329
330def _rst_links(contents: str) -> str:
331    """Convert reStructuredText hyperlinks"""
332    links = {}
333
334    def register_link(m: re.Match[str]) -> str:
335        refid = re.sub(r"\s", "", m.group("id").lower())
336        links[refid] = m.group("url")
337        return ""
338
339    def replace_link(m: re.Match[str]) -> str:
340        text = m.group("id")
341        refid = re.sub(r"[\s`]", "", text.lower())
342        try:
343            return f"[{text.strip('`')}]({links[refid]})"
344        except KeyError:
345            return m.group(0)
346
347    # Embedded URIs
348    contents = re.sub(
349        r"`(?P<text>[^`]+)<(?P<url>.+?)>`_", r"[\g<text>](\g<url>)", contents
350    )
351    # External Hyperlink Targets
352    contents = re.sub(
353        r"^\s*..\s+_(?P<id>[^\n:]+):\s*(?P<url>http\S+)",
354        register_link,
355        contents,
356        flags=re.MULTILINE,
357    )
358    contents = re.sub(r"(?P<id>[A-Za-z0-9_\-.:+]|`[^`]+`)_", replace_link, contents)
359    return contents
360
361
362def _rst_extract_options(contents: str) -> tuple[str, dict[str, str]]:
363    """
364    Extract options from the beginning of reStructuredText directives.
365
366    Return the trimmed content and a dict of options.
367    """
368    options = {}
369    while match := re.match(r"^\s*:(.+?):(.*)([\s\S]*)", contents):
370        key, value, contents = match.groups()
371        options[key] = value.strip()
372
373    return contents, options
374
375
376def _rst_include_trim(contents: str, options: dict[str, str]) -> str:
377    """
378    <https://docutils.sourceforge.io/docs/ref/rst/directives.html#include-options>
379    """
380    if "end-line" in options or "start-line" in options:
381        lines = contents.splitlines()
382        if i := options.get("end-line"):
383            lines = lines[: int(i)]
384        if i := options.get("start-line"):
385            lines = lines[int(i) :]
386        contents = "\n".join(lines)
387    if x := options.get("end-before"):
388        contents = contents[: contents.index(x)]
389    if x := options.get("start-after"):
390        contents = contents[contents.index(x) + len(x) :]
391    return contents
392
393
394def _rst_admonitions(contents: str, source_file: Path | None) -> str:
395    """
396    Convert reStructuredText admonitions - a bit tricky because they may already be indented themselves.
397    <https://www.sphinx-doc.org/en/master/usage/restructuredtext/directives.html>
398    """
399
400    def _rst_admonition(m: re.Match[str]) -> str:
401        ind = m.group("indent")
402        type = m.group("type")
403        val = m.group("val").strip()
404        contents = dedent(m.group("contents")).strip()
405        contents, options = _rst_extract_options(contents)
406
407        if type == "include":
408            loc = source_file or Path(".")
409            try:
410                included = (loc.parent / val).read_text("utf8", "replace")
411            except OSError as e:
412                warnings.warn(f"Cannot include {val!r}: {e}")
413                included = "\n"
414            try:
415                included = _rst_include_trim(included, options) + "\n"
416            except ValueError as e:
417                warnings.warn(f"Failed to process include options for {val!r}: {e}")
418            included = _rst_admonitions(included, loc.parent / val)
419            included = embed_images(included, loc.parent / val)
420            return indent(included, ind)
421        if type == "math":
422            return f"{ind}$${val}{contents}$$\n"
423        if type in ("note", "warning", "danger"):
424            if val:
425                heading = f"{ind}###### {val}\n"
426            else:
427                heading = ""
428            return (
429                f'{ind}<div class="alert {type}" markdown="1">\n'
430                f"{heading}"
431                f"{indent(contents, ind)}\n"
432                f"{ind}</div>\n"
433            )
434        if type == "code-block":
435            return f"{ind}```{val}\n{contents}\n```\n"
436        if type == "versionadded":
437            text = f"New in version {val}"
438        elif type == "versionchanged":
439            text = f"Changed in version {val}"
440        elif type == "deprecated":
441            text = f"Deprecated since version {val}"
442        else:
443            text = f"{type} {val}".strip()
444
445        if contents:
446            text = f"{ind}*{text}:*\n{indent(contents, ind)}\n\n"
447        else:
448            text = f"{ind}*{text}.*\n"
449
450        return text
451
452    admonition = "note|warning|danger|versionadded|versionchanged|deprecated|seealso|math|include|code-block"
453    return re.sub(
454        rf"""
455            ^(?P<indent>[ ]*)\.\.[ ]+(?P<type>{admonition})::(?P<val>.*)
456            (?P<contents>(
457                \n                 # empty lines
458                |                  # or
459                (?P=indent)[ ]+.+  # lines with indentation
460            )*)$
461        """,
462        _rst_admonition,
463        contents,
464        flags=re.MULTILINE | re.VERBOSE,
465    )
466
467
468def _rst_fields(contents: str) -> str:
469    """
470    Convert reStructuredText fields to Markdown.
471    <https://www.sphinx-doc.org/en/master/usage/restructuredtext/basics.html#rst-field-lists>
472    """
473
474    _has_parameter_section = False
475    _has_raises_section = False
476
477    def _rst_field(m: re.Match[str]) -> str:
478        type = m["type"]
479        body = m["body"]
480
481        if m["name"]:
482            name = f"**{m['name'].strip()}**: "
483        else:
484            name = ""
485
486        if type == "param":
487            nonlocal _has_parameter_section
488            text = f" - {name}{body}"
489            if not _has_parameter_section:
490                _has_parameter_section = True
491                text = "\n###### Parameters\n" + text
492            return text
493        elif type == "type":
494            return ""  # we expect users to use modern type annotations.
495        elif type == "return":
496            body = indent(body, "> ", lambda line: True)
497            return f"\n###### Returns\n{body}"
498        elif type == "rtype":
499            return ""  # we expect users to use modern type annotations.
500        elif type == "raises":
501            nonlocal _has_raises_section
502            text = f" - {name}{body}"
503            if not _has_raises_section:
504                _has_raises_section = True
505                text = "\n###### Raises\n" + text
506            return text
507        else:  # pragma: no cover
508            raise AssertionError("unreachable")
509
510    field = "param|type|return|rtype|raises"
511    return re.sub(
512        rf"""
513            ^:(?P<type>{field})(?:[ ]+(?P<name>.+))?:
514            (?P<body>.*(
515                (?:\n[ ]*)*  # maybe some empty lines followed by
516                [ ]+.+       # lines with indentation
517            )*(?:\n|$))
518        """,
519        _rst_field,
520        contents,
521        flags=re.MULTILINE | re.VERBOSE,
522    )
@cache
def convert(docstring: str, docformat: str, source_file: pathlib._local.Path | None) -> str:
29@cache
30def convert(docstring: str, docformat: str, source_file: Path | None) -> str:
31    """
32    Convert `docstring` from `docformat` to Markdown.
33    """
34    docformat = docformat.lower()
35
36    if any(x in docformat for x in ["google", "numpy", "restructuredtext"]):
37        docstring = rst(docstring, source_file)
38
39    if "google" in docformat:
40        docstring = google(docstring)
41
42    if "numpy" in docformat:
43        docstring = numpy(docstring)
44
45    if source_file is not None and os.environ.get("PDOC_EMBED_IMAGES") != "0":
46        docstring = embed_images(docstring, source_file)
47
48    return docstring

Convert docstring from docformat to Markdown.

def embed_images(docstring: str, source_file: pathlib._local.Path) -> str:
def embed_images(docstring: str, source_file: Path) -> str:
    """
    Inline local images referenced by Markdown image links as `data:` URIs.

    Every `![alt](href)` occurrence in `docstring` is resolved relative to the
    directory containing `source_file`. If that file can be read and its MIME
    type can be guessed from its name, the link target is replaced with a
    base64-encoded `data:` URI. In every other case (for example remote URLs,
    missing files or unknown file types) the link is returned unchanged.
    """

    def embed_local_image(m: re.Match[str]) -> str:
        image_path = source_file.parent / m["href"]
        try:
            image_data = image_path.read_bytes()
            image_mime = mimetypes.guess_type(image_path)[0]
        except Exception:
            # Best effort: anything that is not a readable local file stays as is.
            return m[0]

        if image_mime is None:
            # Without a MIME type we would emit an invalid "data:None;..." URI.
            return m[0]

        data = base64.b64encode(image_data).decode()
        return f"![{m['alt']}](data:{image_mime};base64,{data})"

    return re.sub(
        r"!\[\s*(?P<alt>.*?)\s*]\(\s*(?P<href>.+?)\s*\)",
        embed_local_image,
        docstring,
    )
    # TODO: Could probably do more here, e.g. support rST or raw HTML replacements.
def google(docstring: str) -> str:
def google(docstring: str) -> str:
    """
    Convert Google-style docstring sections into Markdown.

    A section is a capitalized heading followed by a colon on its own line and
    at least one empty or indented line. Each match is rendered by
    `_google_section`.
    """
    section_pattern = re.compile(
        r"""
        ^(?P<name>[A-Z][A-Z a-z]+):\n
        (?P<contents>(
            \n        # empty lines
            |         # or
            [ \t]+.+  # lines with indentation
        )+)$
        """,
        re.VERBOSE | re.MULTILINE,
    )
    return section_pattern.sub(_google_section, docstring)

Convert Google-style docstring sections into Markdown.

GOOGLE_LIST_SECTIONS = ['Args', 'Raises', 'Attributes']

Section headers listed in the official Google docstring style guide.

GOOGLE_LIST_SECTION_ALIASES = {'Parameters': 'Args', 'Params': 'Args', 'Arguments': 'Args'}

Alternative section headers that are not listed in the official Google docstring style guide but that we recognize as sections containing lists nevertheless.

def numpy(docstring: str) -> str:
def numpy(docstring: str) -> str:
    """Convert NumPy-style docstring sections into Markdown.

    A section is a capitalized heading line followed by a line of at least
    three dashes. Parameter-like sections and "See Also" get dedicated
    renderers, every other section is emitted dedented below its heading.

    See <https://numpydoc.readthedocs.io/en/latest/format.html> for details.
    """
    sections = re.split(
        r"""
        ^([A-Z][A-Za-z ]+)\n  # a heading
        ---+\n+              # followed by a dashed line
        """,
        docstring,
        flags=re.VERBOSE | re.MULTILINE,
    )
    # `sections` is [preamble, heading, content, heading, content, ...].
    contents = sections[0]
    for heading, content in zip(sections[1::2], sections[2::2]):
        tail = ""
        if content.startswith(" "):
            # If the first line of section content is indented, we consider the section to be finished
            # on the first non-indented line. We take out the rest - the tail - here.
            # There may be no such line at all (the section runs to the end of the docstring),
            # in which case the split yields a single element and the tail stays empty.
            head, *rest = re.split(r"\n(?![ \n])", content, maxsplit=1)
            if rest:
                tail = rest[0]
            # The list helpers expect content that does not start with indentation.
            content = dedent(head)

        if heading in (
            "Parameters",
            "Returns",
            "Yields",
            "Receives",
            "Other Parameters",
            "Raises",
            "Warns",
            "Attributes",
        ):
            contents += f"###### {heading}\n{_numpy_parameters(content)}"
        elif heading == "See Also":
            contents += f"###### {heading}\n{_numpy_seealso(content)}"
        else:
            contents += f"###### {heading}\n{dedent(content)}"
        contents += tail
    return contents

Convert NumPy-style docstring sections into Markdown.

See https://numpydoc.readthedocs.io/en/latest/format.html for details.

def rst(contents: str, source_file: pathlib._local.Path | None) -> str:
def rst(contents: str, source_file: Path | None) -> str:
    """
    Convert reStructuredText elements to Markdown.
    We support the most common elements, but we do not aim to mirror the full complexity of the spec here.

    The conversions run in a fixed order: admonitions and other directives,
    hyperlinks, code references, inline math, footnotes and finally fields.
    """

    def replace_reference(m: re.Match[str]) -> str:
        # Groups: optional ":py" prefix, the role, the referenced name.
        role, target = m.group(2), m.group(3)
        call_suffix = "()" if role in ("meth", "func") else ""
        return f"`{target}{call_suffix}`"

    contents = _rst_admonitions(contents, source_file)
    contents = _rst_links(contents)

    # Code References: :obj:`foo` -> `foo`
    contents = re.sub(
        r"(:py)?:(mod|func|data|const|class|meth|attr|exc|obj):`([^`]+)`",
        replace_reference,
        contents,
    )

    # Math: :math:`foo` -> \\( foo \\)
    # We don't use $ as that's not enabled by MathJax by default.
    contents = re.sub(r":math:`(.+?)`", r"\\\\( \1 \\\\)", contents)

    return _rst_fields(_rst_footnotes(contents))

Convert reStructuredText elements to Markdown. We support the most common elements, but we do not aim to mirror the full complexity of the spec here.