Edit on GitHub

pdoc.docstrings

This module handles the conversion of docstring flavors to Markdown.

The conversion from docstring flavors to Markdown is mostly done with regular expressions. This is not particularly beautiful, but good enough for our purposes. The alternative would be to depend on https://github.com/rr-/docstring_parser or a similar project, but that introduces more complexity than we are comfortable with.

If you miss a particular feature for your favorite flavor, contributions are welcome. That being said, please keep the complexity low and make sure that changes are accompanied by matching snapshot tests in test/testdata/.

  1"""
  2This module handles the conversion of docstring flavors to Markdown.
  3
  4The conversion from docstring flavors to Markdown is mostly done with regular expressions.
  5This is not particularly beautiful, but good enough for our purposes.
  6The alternative would be to depend on <https://github.com/rr-/docstring_parser> or a similar project,
  7but that introduces more complexity than we are comfortable with.
  8
  9If you miss a particular feature for your favorite flavor, contributions are welcome.
 10That being said, please keep the complexity low and make sure that changes are
 11accompanied by matching snapshot tests in `test/testdata/`.
 12"""
 13from __future__ import annotations
 14
 15import base64
 16import inspect
 17import mimetypes
 18import os
 19import re
 20import warnings
 21from pathlib import Path
 22from textwrap import dedent
 23from textwrap import indent
 24
 25from ._compat import cache
 26
 27
 28@cache
 29def convert(docstring: str, docformat: str, source_file: Path | None) -> str:
 30    """
 31    Convert `docstring` from `docformat` to Markdown.
 32    """
 33    docformat = docformat.lower()
 34
 35    if any(x in docformat for x in ["google", "numpy", "restructuredtext"]):
 36        docstring = rst(docstring, source_file)
 37
 38    if "google" in docformat:
 39        docstring = google(docstring)
 40
 41    if "numpy" in docformat:
 42        docstring = numpy(docstring)
 43
 44    if source_file is not None and os.environ.get("PDOC_EMBED_IMAGES") != "0":
 45        docstring = embed_images(docstring, source_file)
 46
 47    return docstring
 48
 49
 50def embed_images(docstring: str, source_file: Path) -> str:
 51    def embed_local_image(m: re.Match) -> str:
 52        image_path = source_file.parent / m["href"]
 53        try:
 54            image_data = image_path.read_bytes()
 55            image_mime = mimetypes.guess_type(image_path)[0]
 56        except Exception:
 57            return m[0]
 58        else:
 59            data = base64.b64encode(image_data).decode()
 60            return f"![{m['alt']}](data:{image_mime};base64,{data})"
 61
 62    return re.sub(
 63        r"!\[\s*(?P<alt>.*?)\s*]\(\s*(?P<href>.+?)\s*\)",
 64        embed_local_image,
 65        docstring,
 66    )
 67    # TODO: Could probably do more here, e.g. support rST or raw HTML replacements.
 68
 69
 70def google(docstring: str) -> str:
 71    """Convert Google-style docstring sections into Markdown."""
 72    return re.sub(
 73        r"""
 74        ^(?P<name>[A-Z][A-Z a-z]+):\n
 75        (?P<contents>(
 76            \n        # empty lines
 77            |         # or
 78            [ \t]+.+  # lines with indentation
 79        )+)$
 80        """,
 81        _google_section,
 82        docstring,
 83        flags=re.VERBOSE | re.MULTILINE,
 84    )
 85
 86
 87GOOGLE_LIST_SECTIONS = ["Args", "Raises", "Attributes"]
 88"""Section headers listed in the official Google docstring style guide."""
 89
 90GOOGLE_LIST_SECTION_ALIASES = {
 91    "Parameters": "Args",
 92    "Params": "Args",
 93    "Arguments": "Args",
 94}
 95"""
 96Alternative section headers that are not listed in the official Google
 97docstring style guide but that we recognize as sections containing lists
 98nevertheless.
 99"""
100
101
def _google_section(m: re.Match[str]) -> str:
    """
    Render one matched Google-style section as a Markdown heading plus body.

    List sections (see `GOOGLE_LIST_SECTIONS` and its aliases) become bullet lists
    in which everything up to the first colon of an item is set in bold.
    All other sections are rendered as block quotes.
    """
    heading = m.group("name")
    heading = GOOGLE_LIST_SECTION_ALIASES.get(heading, heading)
    body = dedent(m.group("contents")).lstrip()

    if heading in GOOGLE_LIST_SECTIONS:
        bullets = []
        for entry in _indented_list(body):
            # Split off everything up to the first ":" on the first line.
            # Without a match re.split returns the entry unchanged as a single element.
            pieces = re.split(r"^(.+?:)", entry, maxsplit=1)
            if len(pieces) == 3:
                _, attr, desc = pieces
                bullet = f" - **{attr}** " + indent(desc, "   ")[3:]
            else:
                bullet = " - " + indent(entry, "   ")[3:]
            bullets.append(bullet + "\n")
        body = "".join(bullets)
    else:
        body = indent(body, "> ", lambda line: True)

    if heading == "Args":
        heading = "Arguments"

    return f"\n###### {heading}:\n{body}\n"
128
129
130def _indented_list(contents: str) -> list[str]:
131    """
132    Convert a list string into individual (dedented) elements. For example,
133
134    foo:
135        desc
136    bar: int
137        more desc
138    baz:
139        desc
140            indented
141
142    returns [
143        "foo:\ndesc",
144        "bar: int\nmore desc",
145        "baz:\ndesc\n    indented",
146    ]
147    """
148    # we expect this to be through cleandoc() already.
149    assert not contents.startswith(" "), contents
150    assert not contents.startswith("\n"), contents
151
152    ret: list[str] = []
153    for line in contents.splitlines(keepends=True):
154        empty = not line.strip()
155        indented = line.startswith(" ")
156        if not (empty or indented):
157            # new section
158            ret.append(line)
159        else:
160            # append to current section
161            ret[-1] += line
162
163    return [inspect.cleandoc(x) for x in ret]
164
165
def numpy(docstring: str) -> str:
    """Convert NumPy-style docstring sections into Markdown.

    See <https://numpydoc.readthedocs.io/en/latest/format.html> for details.

    A section is a capitalized heading line that is underlined with at least
    three dashes. Parameter-like sections and "See Also" are rendered as lists
    by dedicated helpers; every other section keeps its (dedented) text under
    a Markdown heading. Text before the first section is passed through unchanged.
    """
    # With one capture group, re.split yields
    # [preamble, heading, content, heading, content, ...].
    sections = re.split(
        r"""
        ^([A-Z][A-Za-z ]+)\n  # a heading
        ---+\n+              # followed by a dashed line
        """,
        docstring,
        flags=re.VERBOSE | re.MULTILINE,
    )
    contents = sections[0]
    for heading, content in zip(sections[1::2], sections[2::2]):
        tail = ""
        if content.startswith(" "):
            # If the first line of section content is indented, we consider the section to be finished
            # on the first non-indented line. We take out the rest - the tail - here.
            # There may be no such line at all, in which case the split yields a single element.
            parts = re.split(r"\n(?![ \n])", content, maxsplit=1)
            if len(parts) == 2:
                content, tail = parts
            # The list helpers expect content that does not start with indentation.
            content = dedent(content)

        if heading in (
            "Parameters",
            "Returns",
            "Yields",
            "Receives",
            "Other Parameters",
            "Raises",
            "Warns",
            "Attributes",
        ):
            contents += f"###### {heading}\n{_numpy_parameters(content)}"
        elif heading == "See Also":
            contents += f"###### {heading}\n{_numpy_seealso(content)}"
        else:
            contents += f"###### {heading}\n{dedent(content)}"
            if tail:
                # The split above consumed the newline between section and tail;
                # put it back so the tail does not get glued onto the last line.
                contents += "\n"
        contents += tail
    return contents
205
206
def _numpy_seealso(content: str) -> str:
    """
    Convert a NumPy-style "See Also" section into Markdown.

    Each item is either a list of names, or a list of names followed by a colon
    and a description. Names may be separated by commas and/or whitespace; every
    name is rendered as inline code and each item ends with a Markdown hard
    line break.
    """
    lines = []
    for item in _indented_list(content):
        if ":" in item:
            funcstr, desc = item.split(":", maxsplit=1)
            desc = f": {desc}"
        else:
            funcstr, desc = item, ""

        # Names are conventionally comma-separated ("func_a, func_b"); splitting on
        # spaces alone would leave the commas inside the code spans.
        funclist = [f for f in re.split(r"[\s,]+", funcstr) if f]
        funcs = ", ".join(f"`{f}`" for f in funclist)
        lines.append(f"{funcs}{desc}  \n")
    return "".join(lines)
221
222
def _numpy_parameters(content: str) -> str:
    """
    Convert a NumPy-style parameter section into a Markdown bullet list.

    Items whose first line has the form `name : type` are rendered with the type
    in parentheses and the description indented below. Any other item is treated
    as a bare name, optionally followed by a description on the following lines.
    """
    bullets = []
    for entry in _indented_list(content):
        typed = re.match(r"^(.+):(.+)([\s\S]*)", entry)
        if typed:
            name, annotation, desc = (part.strip() for part in typed.groups())
            bullets.append(f" - **{name}** ({annotation}):\n{indent(desc, '   ')}\n")
            continue

        # No "name : type" header: the first line is the name, the rest the description.
        name, _, desc = entry.partition("\n")
        name = name.strip()
        desc = desc.strip()
        bullets.append(f" - **{name}**: {desc}\n" if desc else f" - **{name}**\n")

    return "".join(bullets) + "\n"
246
247
def rst(contents: str, source_file: Path | None) -> str:
    """
    Translate common reStructuredText constructs into Markdown.

    The passes run in a fixed order: admonitions, hyperlinks, code references,
    inline math, footnotes and finally field lists. Only the most common elements
    are handled; mirroring the full complexity of the spec is not a goal.
    """
    markdown = _rst_links(_rst_admonitions(contents, source_file))

    def code_reference(m):
        kind, target = m.group(2), m.group(3)
        # callables get trailing parentheses
        suffix = "()" if kind in ("meth", "func") else ""
        return f"`{target}{suffix}`"

    # Code References: :obj:`foo` -> `foo`
    markdown = re.sub(
        r"(:py)?:(mod|func|data|const|class|meth|attr|exc|obj):`([^`]+)`",
        code_reference,
        markdown,
    )

    # Math: :math:`foo` -> \\( foo \\)
    # We don't use $ as that's not enabled by MathJax by default.
    markdown = re.sub(r":math:`(.+?)`", r"\\\\( \1 \\\\)", markdown)

    return _rst_fields(_rst_footnotes(markdown))
279
280
281def _rst_footnotes(contents: str) -> str:
282    """Convert reStructuredText footnotes"""
283    footnotes: set[str] = set()
284    autonum: int
285
286    def register_footnote(m: re.Match[str]) -> str:
287        nonlocal autonum
288        fn_id = m.group("id")
289        if fn_id in "*#":
290            fn_id = f"fn-{autonum}"
291            autonum += 1
292        fn_id = fn_id.lstrip("#*")
293        footnotes.add(fn_id)
294        content = indent(m.group("content"), "   ").lstrip()
295        return f"{m.group('indent')}[^{fn_id}]: {content}"
296
297    # Register footnotes
298    autonum = 1
299    contents = re.sub(
300        r"""
301            ^(?P<indent>[ ]*)\.\.[ ]+\[(?P<id>\d+|[#*]\w*)](?P<content>.*
302            (
303                \n                 # empty lines
304                |                  # or
305                (?P=indent)[ ]+.+  # lines with indentation
306            )*)$
307            """,
308        register_footnote,
309        contents,
310        flags=re.MULTILINE | re.VERBOSE,
311    )
312
313    def replace_references(m: re.Match[str]) -> str:
314        nonlocal autonum
315        fn_id = m.group("id")
316        if fn_id in "*#":
317            fn_id = f"fn-{autonum}"
318            autonum += 1
319        fn_id = fn_id.lstrip("#*")
320        if fn_id in footnotes:
321            return f"[^{fn_id}]"
322        else:
323            return m.group(0)
324
325    autonum = 1
326    contents = re.sub(r"\[(?P<id>\d+|[#*]\w*)]_", replace_references, contents)
327    return contents
328
329
330def _rst_links(contents: str) -> str:
331    """Convert reStructuredText hyperlinks"""
332    links = {}
333
334    def register_link(m: re.Match[str]) -> str:
335        refid = re.sub(r"\s", "", m.group("id").lower())
336        links[refid] = m.group("url")
337        return ""
338
339    def replace_link(m: re.Match[str]) -> str:
340        text = m.group("id")
341        refid = re.sub(r"[\s`]", "", text.lower())
342        try:
343            return f"[{text.strip('`')}]({links[refid]})"
344        except KeyError:
345            return m.group(0)
346
347    # Embedded URIs
348    contents = re.sub(
349        r"`(?P<text>[^`]+)<(?P<url>.+?)>`_", r"[\g<text>](\g<url>)", contents
350    )
351    # External Hyperlink Targets
352    contents = re.sub(
353        r"^\s*..\s+_(?P<id>[^\n:]+):\s*(?P<url>http\S+)",
354        register_link,
355        contents,
356        flags=re.MULTILINE,
357    )
358    contents = re.sub(r"(?P<id>[A-Za-z0-9_\-.:+]|`[^`]+`)_", replace_link, contents)
359    return contents
360
361
362def _rst_admonitions(contents: str, source_file: Path | None) -> str:
363    """
364    Convert reStructuredText admonitions - a bit tricky because they may already be indented themselves.
365    <https://www.sphinx-doc.org/en/master/usage/restructuredtext/directives.html>
366    """
367
368    def _rst_admonition(m: re.Match[str]) -> str:
369        ind = m.group("indent")
370        type = m.group("type")
371        val = m.group("val").strip()
372        contents = dedent(m.group("contents")).strip()
373
374        if type == "include":
375            loc = source_file or Path(".")
376            try:
377                included = (loc.parent / val).read_text("utf8", "replace")
378            except OSError as e:
379                warnings.warn(f"Cannot include {val!r}: {e}")
380                included = "\n"
381            included = _rst_admonitions(included, loc.parent / val)
382            return indent(included, ind)
383        if type == "math":
384            return f"{ind}$${val}{contents}$$\n"
385        if type in ("note", "warning", "danger"):
386            if val:
387                heading = f"{ind}###### {val}\n"
388            else:
389                heading = ""
390            return (
391                f'{ind}<div class="pdoc-alert pdoc-alert-{type}" markdown="1">\n'
392                f"{heading}"
393                f"{indent(contents, ind)}\n"
394                f"{ind}</div>\n"
395            )
396        elif type == "versionadded":
397            text = f"New in version {val}"
398        elif type == "versionchanged":
399            text = f"Changed in version {val}"
400        elif type == "deprecated":
401            text = f"Deprecated since version {val}"
402        else:
403            text = f"{type} {val}".strip()
404
405        if contents:
406            text = f"{ind}*{text}:*\n{indent(contents, ind)}\n\n"
407        else:
408            text = f"{ind}*{text}.*\n"
409
410        return text
411
412    admonition = "note|warning|danger|versionadded|versionchanged|deprecated|seealso|math|include"
413    return re.sub(
414        rf"""
415            ^(?P<indent>[ ]*)\.\.[ ]+(?P<type>{admonition})::(?P<val>.*)
416            (?P<contents>(
417                \n                 # empty lines
418                |                  # or
419                (?P=indent)[ ]+.+  # lines with indentation
420            )*)$
421        """,
422        _rst_admonition,
423        contents,
424        flags=re.MULTILINE | re.VERBOSE,
425    )
426
427
428def _rst_fields(contents: str) -> str:
429    """
430    Convert reStructuredText fields to Markdown.
431    <https://www.sphinx-doc.org/en/master/usage/restructuredtext/basics.html#rst-field-lists>
432    """
433
434    _has_parameter_section = False
435    _has_raises_section = False
436
437    def _rst_field(m: re.Match[str]) -> str:
438        type = m["type"]
439        body = m["body"]
440
441        if m["name"]:
442            name = f"**{m['name'].strip()}**: "
443        else:
444            name = ""
445
446        if type == "param":
447            nonlocal _has_parameter_section
448            text = f" - {name}{body}"
449            if not _has_parameter_section:
450                _has_parameter_section = True
451                text = "\n###### Parameters\n" + text
452            return text
453        elif type == "type":
454            return ""  # we expect users to use modern type annotations.
455        elif type == "return":
456            body = indent(body, "> ", lambda line: True)
457            return f"\n###### Returns\n{body}"
458        elif type == "rtype":
459            return ""  # we expect users to use modern type annotations.
460        elif type == "raises":
461            nonlocal _has_raises_section
462            text = f" - {name}{body}"
463            if not _has_raises_section:
464                _has_raises_section = True
465                text = "\n###### Raises\n" + text
466            return text
467        else:  # pragma: no cover
468            raise AssertionError("unreachable")
469
470    field = "param|type|return|rtype|raises"
471    return re.sub(
472        rf"""
473            ^:(?P<type>{field})(?:[ ]+(?P<name>.+))?:
474            (?P<body>.*(
475                (?:\n[ ]*)*  # maybe some empty lines followed by
476                [ ]+.+       # lines with indentation
477            )*(?:\n|$))
478        """,
479        _rst_field,
480        contents,
481        flags=re.MULTILINE | re.VERBOSE,
482    )
@cache
def convert(docstring: str, docformat: str, source_file: pathlib.Path | None) -> str:
@cache
def convert(docstring: str, docformat: str, source_file: Path | None) -> str:
    """
    Render `docstring`, written in the flavor named by `docformat`, as Markdown.

    `docformat` is lower-cased and matched by substring. The Google, NumPy and
    reStructuredText flavors all receive the reStructuredText pass first; the
    Google and NumPy section passes follow for their respective flavors.
    If `source_file` is known and the `PDOC_EMBED_IMAGES` environment variable
    is not set to `"0"`, image references are finally passed through `embed_images`.
    """
    flavor = docformat.lower()
    markdown = docstring

    if "google" in flavor or "numpy" in flavor or "restructuredtext" in flavor:
        markdown = rst(markdown, source_file)

    if "google" in flavor:
        markdown = google(markdown)

    if "numpy" in flavor:
        markdown = numpy(markdown)

    embedding_enabled = os.environ.get("PDOC_EMBED_IMAGES") != "0"
    if embedding_enabled and source_file is not None:
        markdown = embed_images(markdown, source_file)

    return markdown

Convert `docstring` from `docformat` to Markdown.

def embed_images(docstring: str, source_file: pathlib.Path) -> str:
def embed_images(docstring: str, source_file: Path) -> str:
    """
    Inline local images referenced with Markdown image syntax as base64 `data:` URIs.

    Every `![alt](href)` occurrence in `docstring` is inspected. `href` is resolved
    relative to the directory containing `source_file`. A reference is left exactly
    as written when the file cannot be read (e.g. it is a remote URL or does not
    exist) or when no MIME type can be guessed for it.
    """

    def embed_local_image(m: re.Match) -> str:
        image_path = source_file.parent / m["href"]
        try:
            image_data = image_path.read_bytes()
        except (OSError, ValueError):
            # OSError: missing/unreadable file, directory, remote URL treated as a path.
            # ValueError: the href contains characters the OS rejects (e.g. a null byte).
            return m[0]

        image_mime = mimetypes.guess_type(image_path.name)[0]
        if image_mime is None:
            # Without a MIME type we would emit "data:None;base64,...", which no
            # browser can render - keep the original reference instead.
            return m[0]

        data = base64.b64encode(image_data).decode()
        return f"![{m['alt']}](data:{image_mime};base64,{data})"

    return re.sub(
        r"!\[\s*(?P<alt>.*?)\s*]\(\s*(?P<href>.+?)\s*\)",
        embed_local_image,
        docstring,
    )
    # TODO: Could probably do more here, e.g. support rST or raw HTML replacements.
def google(docstring: str) -> str:
def google(docstring: str) -> str:
    """Rewrite the sections of a Google-style docstring as Markdown.

    A section is a capitalized header line ending in a colon, followed by one or
    more blank or indented lines. Each match is rendered by `_google_section`.
    """
    section_pattern = re.compile(
        r"""
        ^(?P<name>[A-Z][A-Z a-z]+):\n
        (?P<contents>(
            \n        # empty lines
            |         # or
            [ \t]+.+  # lines with indentation
        )+)$
        """,
        re.VERBOSE | re.MULTILINE,
    )
    return section_pattern.sub(_google_section, docstring)

Convert Google-style docstring sections into Markdown.

GOOGLE_LIST_SECTIONS = ['Args', 'Raises', 'Attributes']

Section headers listed in the official Google docstring style guide.

GOOGLE_LIST_SECTION_ALIASES = {'Parameters': 'Args', 'Params': 'Args', 'Arguments': 'Args'}

Alternative section headers that are not listed in the official Google docstring style guide but that we recognize as sections containing lists nevertheless.

def numpy(docstring: str) -> str:
def numpy(docstring: str) -> str:
    """Convert NumPy-style docstring sections into Markdown.

    See <https://numpydoc.readthedocs.io/en/latest/format.html> for details.

    A section is a capitalized heading line that is underlined with at least
    three dashes. Parameter-like sections and "See Also" are rendered as lists
    by dedicated helpers; every other section keeps its (dedented) text under
    a Markdown heading. Text before the first section is passed through unchanged.
    """
    # With one capture group, re.split yields
    # [preamble, heading, content, heading, content, ...].
    sections = re.split(
        r"""
        ^([A-Z][A-Za-z ]+)\n  # a heading
        ---+\n+              # followed by a dashed line
        """,
        docstring,
        flags=re.VERBOSE | re.MULTILINE,
    )
    contents = sections[0]
    for heading, content in zip(sections[1::2], sections[2::2]):
        tail = ""
        if content.startswith(" "):
            # If the first line of section content is indented, we consider the section to be finished
            # on the first non-indented line. We take out the rest - the tail - here.
            # There may be no such line at all, in which case the split yields a single element.
            parts = re.split(r"\n(?![ \n])", content, maxsplit=1)
            if len(parts) == 2:
                content, tail = parts
            # The list helpers expect content that does not start with indentation.
            content = dedent(content)

        if heading in (
            "Parameters",
            "Returns",
            "Yields",
            "Receives",
            "Other Parameters",
            "Raises",
            "Warns",
            "Attributes",
        ):
            contents += f"###### {heading}\n{_numpy_parameters(content)}"
        elif heading == "See Also":
            contents += f"###### {heading}\n{_numpy_seealso(content)}"
        else:
            contents += f"###### {heading}\n{dedent(content)}"
            if tail:
                # The split above consumed the newline between section and tail;
                # put it back so the tail does not get glued onto the last line.
                contents += "\n"
        contents += tail
    return contents

Convert NumPy-style docstring sections into Markdown.

See https://numpydoc.readthedocs.io/en/latest/format.html for details.

def rst(contents: str, source_file: pathlib.Path | None) -> str:
def rst(contents: str, source_file: Path | None) -> str:
    """
    Translate common reStructuredText constructs into Markdown.

    The passes run in a fixed order: admonitions, hyperlinks, code references,
    inline math, footnotes and finally field lists. Only the most common elements
    are handled; mirroring the full complexity of the spec is not a goal.
    """
    markdown = _rst_links(_rst_admonitions(contents, source_file))

    def code_reference(m):
        kind, target = m.group(2), m.group(3)
        # callables get trailing parentheses
        suffix = "()" if kind in ("meth", "func") else ""
        return f"`{target}{suffix}`"

    # Code References: :obj:`foo` -> `foo`
    markdown = re.sub(
        r"(:py)?:(mod|func|data|const|class|meth|attr|exc|obj):`([^`]+)`",
        code_reference,
        markdown,
    )

    # Math: :math:`foo` -> \\( foo \\)
    # We don't use $ as that's not enabled by MathJax by default.
    markdown = re.sub(r":math:`(.+?)`", r"\\\\( \1 \\\\)", markdown)

    return _rst_fields(_rst_footnotes(markdown))

Convert reStructuredText elements to Markdown. We support the most common elements, but we do not aim to mirror the full complexity of the spec here.