Edit on GitHub

pdoc.docstrings

This module handles the conversion of docstring flavors to Markdown.

The conversion from docstring flavors to Markdown is mostly done with regular expressions. This is not particularly beautiful, but good enough for our purposes. The alternative would be to depend on https://github.com/rr-/docstring_parser or a similar project, but that introduces more complexity than we are comfortable with.

If you miss a particular feature for your favorite flavor, contributions are welcome. That being said, please keep the complexity low and make sure that changes are accompanied by matching snapshot tests in test/testdata/.

  1"""
  2This module handles the conversion of docstring flavors to Markdown.
  3
  4The conversion from docstring flavors to Markdown is mostly done with regular expressions.
  5This is not particularly beautiful, but good enough for our purposes.
  6The alternative would be to depend on <https://github.com/rr-/docstring_parser> or a similar project,
  7but that introduces more complexity than we are comfortable with.
  8
  9If you miss a particular feature for your favorite flavor, contributions are welcome.
 10That being said, please keep the complexity low and make sure that changes are
 11accompanied by matching snapshot tests in `test/testdata/`.
 12"""
 13
 14from __future__ import annotations
 15
 16import base64
 17import inspect
 18import mimetypes
 19import os
 20from pathlib import Path
 21import re
 22from textwrap import dedent
 23from textwrap import indent
 24import warnings
 25
 26from ._compat import cache
 27
 28
 29@cache
 30def convert(docstring: str, docformat: str, source_file: Path | None) -> str:
 31    """
 32    Convert `docstring` from `docformat` to Markdown.
 33    """
 34    docformat = docformat.lower()
 35
 36    if any(x in docformat for x in ["google", "numpy", "restructuredtext"]):
 37        docstring = rst(docstring, source_file)
 38
 39    if "google" in docformat:
 40        docstring = google(docstring)
 41
 42    if "numpy" in docformat:
 43        docstring = numpy(docstring)
 44
 45    if source_file is not None and os.environ.get("PDOC_EMBED_IMAGES") != "0":
 46        docstring = embed_images(docstring, source_file)
 47
 48    return docstring
 49
 50
 51def embed_images(docstring: str, source_file: Path) -> str:
 52    def embed_local_image(m: re.Match) -> str:
 53        image_path = source_file.parent / m["href"]
 54        try:
 55            image_data = image_path.read_bytes()
 56            image_mime = mimetypes.guess_type(image_path)[0]
 57        except Exception:
 58            return m[0]
 59        else:
 60            data = base64.b64encode(image_data).decode()
 61            return f"![{m['alt']}](data:{image_mime};base64,{data})"
 62
 63    return re.sub(
 64        r"!\[\s*(?P<alt>.*?)\s*]\(\s*(?P<href>.+?)\s*\)",
 65        embed_local_image,
 66        docstring,
 67    )
 68    # TODO: Could probably do more here, e.g. support rST or raw HTML replacements.
 69
 70
 71def google(docstring: str) -> str:
 72    """Convert Google-style docstring sections into Markdown."""
 73    return re.sub(
 74        r"""
 75        ^(?P<name>[A-Z][A-Z a-z]+):\n
 76        (?P<contents>(
 77            \n        # empty lines
 78            |         # or
 79            [ \t]+.+  # lines with indentation
 80        )+)$
 81        """,
 82        _google_section,
 83        docstring,
 84        flags=re.VERBOSE | re.MULTILINE,
 85    )
 86
 87
# Section names whose contents `_google_section` renders as a Markdown list.
GOOGLE_LIST_SECTIONS = ["Args", "Raises", "Attributes"]
"""Section headers listed in the official Google docstring style guide."""

# Maps alternative header spellings to the GOOGLE_LIST_SECTIONS entry they stand for.
GOOGLE_LIST_SECTION_ALIASES = {
    "Parameters": "Args",
    "Params": "Args",
    "Arguments": "Args",
}
"""
Alternative section headers that are not listed in the official Google
docstring style guide but that we recognize as sections containing lists
nevertheless.
"""
101
102
def _google_section(m: re.Match[str]) -> str:
    """
    Render one matched Google-style section as Markdown.

    Alias headers are mapped to their canonical name first. List sections
    become a bullet list in which the text up to the first colon on an item's
    first line is set in bold; every other section is rendered as a block
    quote. The "Args" header is displayed as "Arguments".
    """
    raw_name = m.group("name")
    title = GOOGLE_LIST_SECTION_ALIASES.get(raw_name, raw_name)
    body = dedent(m.group("contents")).lstrip()

    if title in GOOGLE_LIST_SECTIONS:
        bullets = []
        for entry in _indented_list(body):
            # Splitting on the capture group yields ["", attr, rest] on a match
            # and [entry] when the first line contains no colon.
            parts = re.split(r"^(.+?:)", entry, maxsplit=1)
            if len(parts) == 3:
                _, attr, desc = parts
                bullets.append(f" - **{attr}** " + indent(desc, "   ")[3:])
            else:
                bullets.append(" - " + indent(entry, "   ")[3:])
        body = "".join(bullet + "\n" for bullet in bullets)
    else:
        body = indent(body, "> ", lambda line: True)

    heading = "Arguments" if title == "Args" else title
    return f"\n###### {heading}:\n{body}\n"
129
130
131def _indented_list(contents: str) -> list[str]:
132    """
133    Convert a list string into individual (dedented) elements. For example,
134
135    foo:
136        desc
137    bar: int
138        more desc
139    baz:
140        desc
141            indented
142
143    returns [
144        "foo:\ndesc",
145        "bar: int\nmore desc",
146        "baz:\ndesc\n    indented",
147    ]
148    """
149    # we expect this to be through cleandoc() already.
150    assert not contents.startswith(" "), contents
151    assert not contents.startswith("\n"), contents
152
153    ret: list[str] = []
154    for line in contents.splitlines(keepends=True):
155        empty = not line.strip()
156        indented = line.startswith(" ")
157        if not (empty or indented):
158            # new section
159            ret.append(line)
160        else:
161            # append to current section
162            ret[-1] += line
163
164    return [inspect.cleandoc(x) for x in ret]
165
166
def numpy(docstring: str) -> str:
    """Convert NumPy-style docstring sections into Markdown.

    A section is a capitalized heading line followed by a dashed underline.
    Parameter-like sections are rendered with `_numpy_parameters`, "See Also"
    with `_numpy_seealso`, and every other section is only dedented. Text
    before the first heading is kept unchanged.

    See <https://numpydoc.readthedocs.io/en/latest/format.html> for details.
    """
    sections = re.split(
        r"""
        ^([A-Z][A-Za-z ]+)\n  # a heading
        ---+\n+              # followed by a dashed line
        """,
        docstring,
        flags=re.VERBOSE | re.MULTILINE,
    )
    # re.split with one capture group yields [preamble, heading, content, heading, content, ...].
    contents = sections[0]
    for heading, content in zip(sections[1::2], sections[2::2]):
        tail = ""
        if content.startswith(" "):
            # If the first line of section content is indented, we consider the section to be finished
            # on the first non-indented line. We take out the rest - the tail - here.
            parts = re.split(r"\n(?![ \n])", content, maxsplit=1)
            # An indented section that runs to the end of the docstring has no tail:
            # the split then returns a single element and must not be unpacked into two.
            if len(parts) == 2:
                content, tail = parts

        if heading in (
            "Parameters",
            "Returns",
            "Yields",
            "Receives",
            "Other Parameters",
            "Raises",
            "Warns",
            "Attributes",
        ):
            contents += f"###### {heading}\n{_numpy_parameters(content)}"
        elif heading == "See Also":
            contents += f"###### {heading}\n{_numpy_seealso(content)}"
        else:
            contents += f"###### {heading}\n{dedent(content)}"
        contents += tail
    return contents
206
207
def _numpy_seealso(content: str) -> str:
    """
    Convert a NumPy-style "See Also" section into Markdown.

    Each item becomes one line: the space-separated names before the first
    colon are wrapped in backticks and joined with commas, followed by the
    description (if any). Lines end in two spaces, a Markdown hard line break.
    """
    rendered = []
    for entry in _indented_list(content):
        names, colon, description = entry.partition(":")
        suffix = f": {description}" if colon else ""

        stripped_names = (token.strip() for token in names.split(" "))
        quoted = ", ".join(f"`{token}`" for token in stripped_names if token)
        rendered.append(f"{quoted}{suffix}  \n")
    return "".join(rendered)
222
223
def _numpy_parameters(content: str) -> str:
    """
    Convert a NumPy-style parameter section into a Markdown bullet list.

    Items whose first line has the form "name : type" are rendered with the
    type in parentheses and the description indented underneath. Items without
    a colon use their first line as the name and the rest as the description.
    """
    rendered = []
    for entry in _indented_list(content):
        typed = re.match(r"^(.+):(.+)([\s\S]*)", entry)
        if typed is not None:
            name, type_text, desc = (group.strip() for group in typed.groups())
            rendered.append(f" - **{name}** ({type_text}):\n{indent(desc, '   ')}\n")
            continue

        # No "name : type" header: the first line is the name, the rest the description.
        first_line, _, rest = entry.partition("\n")
        name, desc = first_line.strip(), rest.strip()
        if desc:
            rendered.append(f" - **{name}**: {desc}\n")
        else:
            rendered.append(f" - **{name}**\n")
    return "".join(rendered) + "\n"
247
248
def rst(contents: str, source_file: Path | None) -> str:
    """
    Convert reStructuredText elements to Markdown.

    Conversions run in a fixed order: admonitions/directives, hyperlinks,
    code references, inline math, footnotes and finally field lists.
    We support the most common elements, but we do not aim to mirror the full complexity of the spec here.
    """
    contents = _rst_admonitions(contents, source_file)
    contents = _rst_links(contents)

    def code_reference(match):
        # Group 1 is the optional ":py" domain prefix and is not needed.
        kind, target = match.group(2), match.group(3)
        call_suffix = "()" if kind in ("meth", "func") else ""
        return f"`{target}{call_suffix}`"

    # Code References: :obj:`foo` -> `foo`
    contents = re.sub(
        r"(:py)?:(mod|func|data|const|class|meth|attr|exc|obj):`([^`]+)`",
        code_reference,
        contents,
    )

    # Math: :math:`foo` -> \\( foo \\)
    # We don't use $ as that's not enabled by MathJax by default.
    contents = re.sub(r":math:`(.+?)`", r"\\\\( \1 \\\\)", contents)

    return _rst_fields(_rst_footnotes(contents))
280
281
282def _rst_footnotes(contents: str) -> str:
283    """Convert reStructuredText footnotes"""
284    footnotes: set[str] = set()
285    autonum: int
286
287    def register_footnote(m: re.Match[str]) -> str:
288        nonlocal autonum
289        fn_id = m.group("id")
290        if fn_id in "*#":
291            fn_id = f"fn-{autonum}"
292            autonum += 1
293        fn_id = fn_id.lstrip("#*")
294        footnotes.add(fn_id)
295        content = indent(m.group("content"), "   ").lstrip()
296        return f"{m.group('indent')}[^{fn_id}]: {content}"
297
298    # Register footnotes
299    autonum = 1
300    contents = re.sub(
301        r"""
302            ^(?P<indent>[ ]*)\.\.[ ]+\[(?P<id>\d+|[#*]\w*)](?P<content>.*
303            (
304                \n                 # empty lines
305                |                  # or
306                (?P=indent)[ ]+.+  # lines with indentation
307            )*)$
308            """,
309        register_footnote,
310        contents,
311        flags=re.MULTILINE | re.VERBOSE,
312    )
313
314    def replace_references(m: re.Match[str]) -> str:
315        nonlocal autonum
316        fn_id = m.group("id")
317        if fn_id in "*#":
318            fn_id = f"fn-{autonum}"
319            autonum += 1
320        fn_id = fn_id.lstrip("#*")
321        if fn_id in footnotes:
322            return f"[^{fn_id}]"
323        else:
324            return m.group(0)
325
326    autonum = 1
327    contents = re.sub(r"\[(?P<id>\d+|[#*]\w*)]_", replace_references, contents)
328    return contents
329
330
331def _rst_links(contents: str) -> str:
332    """Convert reStructuredText hyperlinks"""
333    links = {}
334
335    def register_link(m: re.Match[str]) -> str:
336        refid = re.sub(r"\s", "", m.group("id").lower())
337        links[refid] = m.group("url")
338        return ""
339
340    def replace_link(m: re.Match[str]) -> str:
341        text = m.group("id")
342        refid = re.sub(r"[\s`]", "", text.lower())
343        try:
344            return f"[{text.strip('`')}]({links[refid]})"
345        except KeyError:
346            return m.group(0)
347
348    # Embedded URIs
349    contents = re.sub(
350        r"`(?P<text>[^`]+)<(?P<url>.+?)>`_", r"[\g<text>](\g<url>)", contents
351    )
352    # External Hyperlink Targets
353    contents = re.sub(
354        r"^\s*..\s+_(?P<id>[^\n:]+):\s*(?P<url>http\S+)",
355        register_link,
356        contents,
357        flags=re.MULTILINE,
358    )
359    contents = re.sub(r"(?P<id>[A-Za-z0-9_\-.:+]|`[^`]+`)_", replace_link, contents)
360    return contents
361
362
363def _rst_extract_options(contents: str) -> tuple[str, dict[str, str]]:
364    """
365    Extract options from the beginning of reStructuredText directives.
366
367    Return the trimmed content and a dict of options.
368    """
369    options = {}
370    while match := re.match(r"^\s*:(.+?):(.*)([\s\S]*)", contents):
371        key, value, contents = match.groups()
372        options[key] = value.strip()
373
374    return contents, options
375
376
377def _rst_include_trim(contents: str, options: dict[str, str]) -> str:
378    """
379    <https://docutils.sourceforge.io/docs/ref/rst/directives.html#include-options>
380    """
381    if "end-line" in options or "start-line" in options:
382        lines = contents.splitlines()
383        if i := options.get("end-line"):
384            lines = lines[: int(i)]
385        if i := options.get("start-line"):
386            lines = lines[int(i) :]
387        contents = "\n".join(lines)
388    if x := options.get("end-before"):
389        contents = contents[: contents.index(x)]
390    if x := options.get("start-after"):
391        contents = contents[contents.index(x) + len(x) :]
392    return contents
393
394
def _rst_admonitions(contents: str, source_file: Path | None) -> str:
    """
    Convert reStructuredText admonitions and a few related directives to Markdown.

    This is a bit tricky because directives may already be indented themselves;
    the directive's own indentation is captured and re-applied to the output.
    Paths of `include` directives are resolved relative to the directory of
    `source_file` (or of `Path(".")` if no source file is given).
    <https://www.sphinx-doc.org/en/master/usage/restructuredtext/directives.html>
    """

    def _rst_admonition(m: re.Match[str]) -> str:
        ind = m.group("indent")
        kind = m.group("type")
        val = m.group("val").strip()
        body, options = _rst_extract_options(dedent(m.group("contents")).strip())

        if kind == "include":
            loc = source_file or Path(".")
            target = loc.parent / val
            try:
                included = target.read_text("utf8", "replace")
            except OSError as e:
                warnings.warn(f"Cannot include {val!r}: {e}")
                included = "\n"
            try:
                included = _rst_include_trim(included, options) + "\n"
            except ValueError as e:
                warnings.warn(f"Failed to process include options for {val!r}: {e}")
            # Included files may contain directives and images of their own.
            included = _rst_admonitions(included, target)
            included = embed_images(included, target)
            return indent(included, ind)

        if kind == "math":
            return f"{ind}$${val}{body}$$\n"

        if kind in ("note", "warning", "danger"):
            heading = f"{ind}###### {val}\n" if val else ""
            return (
                f'{ind}<div class="pdoc-alert pdoc-alert-{kind}" markdown="1">\n'
                f"{heading}"
                f"{indent(body, ind)}\n"
                f"{ind}</div>\n"
            )

        if kind == "code-block":
            return f"{ind}```{val}\n{body}\n```\n"

        # Everything else is rendered as an emphasized label, optionally with a body.
        if kind == "versionadded":
            label = f"New in version {val}"
        elif kind == "versionchanged":
            label = f"Changed in version {val}"
        elif kind == "deprecated":
            label = f"Deprecated since version {val}"
        else:
            label = f"{kind} {val}".strip()

        if not body:
            return f"{ind}*{label}.*\n"
        return f"{ind}*{label}:*\n{indent(body, ind)}\n\n"

    admonition = "note|warning|danger|versionadded|versionchanged|deprecated|seealso|math|include|code-block"
    directive_pattern = re.compile(
        rf"""
            ^(?P<indent>[ ]*)\.\.[ ]+(?P<type>{admonition})::(?P<val>.*)
            (?P<contents>(
                \n                 # empty lines
                |                  # or
                (?P=indent)[ ]+.+  # lines with indentation
            )*)$
        """,
        re.MULTILINE | re.VERBOSE,
    )
    return directive_pattern.sub(_rst_admonition, contents)
467
468
469def _rst_fields(contents: str) -> str:
470    """
471    Convert reStructuredText fields to Markdown.
472    <https://www.sphinx-doc.org/en/master/usage/restructuredtext/basics.html#rst-field-lists>
473    """
474
475    _has_parameter_section = False
476    _has_raises_section = False
477
478    def _rst_field(m: re.Match[str]) -> str:
479        type = m["type"]
480        body = m["body"]
481
482        if m["name"]:
483            name = f"**{m['name'].strip()}**: "
484        else:
485            name = ""
486
487        if type == "param":
488            nonlocal _has_parameter_section
489            text = f" - {name}{body}"
490            if not _has_parameter_section:
491                _has_parameter_section = True
492                text = "\n###### Parameters\n" + text
493            return text
494        elif type == "type":
495            return ""  # we expect users to use modern type annotations.
496        elif type == "return":
497            body = indent(body, "> ", lambda line: True)
498            return f"\n###### Returns\n{body}"
499        elif type == "rtype":
500            return ""  # we expect users to use modern type annotations.
501        elif type == "raises":
502            nonlocal _has_raises_section
503            text = f" - {name}{body}"
504            if not _has_raises_section:
505                _has_raises_section = True
506                text = "\n###### Raises\n" + text
507            return text
508        else:  # pragma: no cover
509            raise AssertionError("unreachable")
510
511    field = "param|type|return|rtype|raises"
512    return re.sub(
513        rf"""
514            ^:(?P<type>{field})(?:[ ]+(?P<name>.+))?:
515            (?P<body>.*(
516                (?:\n[ ]*)*  # maybe some empty lines followed by
517                [ ]+.+       # lines with indentation
518            )*(?:\n|$))
519        """,
520        _rst_field,
521        contents,
522        flags=re.MULTILINE | re.VERBOSE,
523    )
@cache
def convert(docstring: str, docformat: str, source_file: pathlib.Path | None) -> str:
@cache
def convert(docstring: str, docformat: str, source_file: Path | None) -> str:
    """
    Convert `docstring` from `docformat` to Markdown.

    `docformat` is lower-cased and matched by substring, so one value may name
    several flavors. reStructuredText elements are converted first for the
    "google", "numpy" and "restructuredtext" flavors, followed by the Google
    and NumPy section conversions. Local images are embedded only when
    `source_file` is given and the `PDOC_EMBED_IMAGES` environment variable
    is not set to "0".
    """
    flavor = docformat.lower()
    is_google = "google" in flavor
    is_numpy = "numpy" in flavor

    # Google and NumPy docstrings go through the reST conversion as well.
    if is_google or is_numpy or "restructuredtext" in flavor:
        docstring = rst(docstring, source_file)
    if is_google:
        docstring = google(docstring)
    if is_numpy:
        docstring = numpy(docstring)

    if source_file is None:
        return docstring
    if os.environ.get("PDOC_EMBED_IMAGES") == "0":
        return docstring
    return embed_images(docstring, source_file)

Convert docstring from docformat to Markdown.

def embed_images(docstring: str, source_file: pathlib.Path) -> str:
def embed_images(docstring: str, source_file: Path) -> str:
    """
    Replace Markdown images that point to local files with inline `data:` URIs.

    Image paths are resolved relative to the directory containing `source_file`.
    An image reference is left untouched when the file cannot be read (for
    example because it is a remote URL or does not exist) or when no MIME type
    can be guessed from its file name.
    """

    def embed_local_image(m: re.Match) -> str:
        image_path = source_file.parent / m["href"]
        try:
            image_data = image_path.read_bytes()
            image_mime = mimetypes.guess_type(image_path)[0]
        except Exception:
            # Best effort by design: anything we cannot read stays as written.
            return m[0]
        if image_mime is None:
            # Without a MIME type we would emit an invalid "data:None;base64,..." URI.
            return m[0]
        data = base64.b64encode(image_data).decode()
        return f"![{m['alt']}](data:{image_mime};base64,{data})"

    # TODO: Could probably do more here, e.g. support rST or raw HTML replacements.
    return re.sub(
        r"!\[\s*(?P<alt>.*?)\s*]\(\s*(?P<href>.+?)\s*\)",
        embed_local_image,
        docstring,
    )
def google(docstring: str) -> str:
def google(docstring: str) -> str:
    """
    Convert Google-style docstring sections into Markdown.

    A section is a capitalized header line ending in a colon, followed by one
    or more empty or indented lines. Every such section is rewritten by
    `_google_section`; all other text is returned unchanged.
    """
    section_pattern = re.compile(
        r"""
        ^(?P<name>[A-Z][A-Z a-z]+):\n
        (?P<contents>(
            \n        # empty lines
            |         # or
            [ \t]+.+  # lines with indentation
        )+)$
        """,
        re.VERBOSE | re.MULTILINE,
    )
    return section_pattern.sub(_google_section, docstring)

Convert Google-style docstring sections into Markdown.

GOOGLE_LIST_SECTIONS = ['Args', 'Raises', 'Attributes']

Section headers listed in the official Google docstring style guide.

GOOGLE_LIST_SECTION_ALIASES = {'Parameters': 'Args', 'Params': 'Args', 'Arguments': 'Args'}

Alternative section headers that are not listed in the official Google docstring style guide but that we recognize as sections containing lists nevertheless.

def numpy(docstring: str) -> str:
def numpy(docstring: str) -> str:
    """Convert NumPy-style docstring sections into Markdown.

    A section is a capitalized heading line followed by a dashed underline.
    Parameter-like sections are rendered with `_numpy_parameters`, "See Also"
    with `_numpy_seealso`, and every other section is only dedented. Text
    before the first heading is kept unchanged.

    See <https://numpydoc.readthedocs.io/en/latest/format.html> for details.
    """
    sections = re.split(
        r"""
        ^([A-Z][A-Za-z ]+)\n  # a heading
        ---+\n+              # followed by a dashed line
        """,
        docstring,
        flags=re.VERBOSE | re.MULTILINE,
    )
    # re.split with one capture group yields [preamble, heading, content, heading, content, ...].
    contents = sections[0]
    for heading, content in zip(sections[1::2], sections[2::2]):
        tail = ""
        if content.startswith(" "):
            # If the first line of section content is indented, we consider the section to be finished
            # on the first non-indented line. We take out the rest - the tail - here.
            parts = re.split(r"\n(?![ \n])", content, maxsplit=1)
            # An indented section that runs to the end of the docstring has no tail:
            # the split then returns a single element and must not be unpacked into two.
            if len(parts) == 2:
                content, tail = parts

        if heading in (
            "Parameters",
            "Returns",
            "Yields",
            "Receives",
            "Other Parameters",
            "Raises",
            "Warns",
            "Attributes",
        ):
            contents += f"###### {heading}\n{_numpy_parameters(content)}"
        elif heading == "See Also":
            contents += f"###### {heading}\n{_numpy_seealso(content)}"
        else:
            contents += f"###### {heading}\n{dedent(content)}"
        contents += tail
    return contents

Convert NumPy-style docstring sections into Markdown.

See https://numpydoc.readthedocs.io/en/latest/format.html for details.

def rst(contents: str, source_file: pathlib.Path | None) -> str:
def rst(contents: str, source_file: Path | None) -> str:
    """
    Convert reStructuredText elements to Markdown.

    Conversions run in a fixed order: admonitions/directives, hyperlinks,
    code references, inline math, footnotes and finally field lists.
    We support the most common elements, but we do not aim to mirror the full complexity of the spec here.
    """
    contents = _rst_admonitions(contents, source_file)
    contents = _rst_links(contents)

    def code_reference(match):
        # Group 1 is the optional ":py" domain prefix and is not needed.
        kind, target = match.group(2), match.group(3)
        call_suffix = "()" if kind in ("meth", "func") else ""
        return f"`{target}{call_suffix}`"

    # Code References: :obj:`foo` -> `foo`
    contents = re.sub(
        r"(:py)?:(mod|func|data|const|class|meth|attr|exc|obj):`([^`]+)`",
        code_reference,
        contents,
    )

    # Math: :math:`foo` -> \\( foo \\)
    # We don't use $ as that's not enabled by MathJax by default.
    contents = re.sub(r":math:`(.+?)`", r"\\\\( \1 \\\\)", contents)

    return _rst_fields(_rst_footnotes(contents))

Convert reStructuredText elements to Markdown. We support the most common elements, but we do not aim to mirror the full complexity of the spec here.