Edit on GitHub

pdoc.docstrings

This module handles the conversion of docstring flavors to Markdown.

The conversion from docstring flavors to Markdown is mostly done with regular expressions. This is not particularly beautiful, but good enough for our purposes. The alternative would be to depend on https://github.com/rr-/docstring_parser or a similar project, but that introduces more complexity than we are comfortable with.

If you miss a particular feature for your favorite flavor, contributions are welcome. That being said, please keep the complexity low and make sure that changes are accompanied by matching snapshot tests in test/testdata/.

  1"""
  2This module handles the conversion of docstring flavors to Markdown.
  3
  4The conversion from docstring flavors to Markdown is mostly done with regular expressions.
  5This is not particularly beautiful, but good enough for our purposes.
  6The alternative would be to depend on <https://github.com/rr-/docstring_parser> or a similar project,
  7but that introduces more complexity than we are comfortable with.
  8
  9If you miss a particular feature for your favorite flavor, contributions are welcome.
 10That being said, please keep the complexity low and make sure that changes are
 11accompanied by matching snapshot tests in `test/testdata/`.
 12"""
 13
 14from __future__ import annotations
 15
 16import base64
 17import inspect
 18import mimetypes
 19import os
 20from pathlib import Path
 21import re
 22from textwrap import dedent
 23from textwrap import indent
 24import warnings
 25
 26from ._compat import cache
 27
 28
 29@cache
 30def convert(docstring: str, docformat: str, source_file: Path | None) -> str:
 31    """
 32    Convert `docstring` from `docformat` to Markdown.
 33    """
 34    docformat = docformat.lower()
 35
 36    if any(x in docformat for x in ["google", "numpy", "restructuredtext"]):
 37        docstring = rst(docstring, source_file)
 38
 39    if "google" in docformat:
 40        docstring = google(docstring)
 41
 42    if "numpy" in docformat:
 43        docstring = numpy(docstring)
 44
 45    if source_file is not None and os.environ.get("PDOC_EMBED_IMAGES") != "0":
 46        docstring = embed_images(docstring, source_file)
 47
 48    return docstring
 49
 50
 51def embed_images(docstring: str, source_file: Path) -> str:
 52    def embed_local_image(m: re.Match) -> str:
 53        image_path = source_file.parent / m["href"]
 54        try:
 55            image_data = image_path.read_bytes()
 56            image_mime = mimetypes.guess_type(image_path)[0]
 57        except Exception:
 58            return m[0]
 59        else:
 60            data = base64.b64encode(image_data).decode()
 61            return f"![{m['alt']}](data:{image_mime};base64,{data})"
 62
 63    return re.sub(
 64        r"!\[\s*(?P<alt>.*?)\s*]\(\s*(?P<href>.+?)\s*\)",
 65        embed_local_image,
 66        docstring,
 67    )
 68    # TODO: Could probably do more here, e.g. support rST or raw HTML replacements.
 69
 70
 71def google(docstring: str) -> str:
 72    """Convert Google-style docstring sections into Markdown."""
 73    return re.sub(
 74        r"""
 75        ^(?P<name>[A-Z][A-Z a-z]+):\n
 76        (?P<contents>(
 77            \n        # empty lines
 78            |         # or
 79            [ \t]+.+  # lines with indentation
 80        )+)$
 81        """,
 82        _google_section,
 83        docstring,
 84        flags=re.VERBOSE | re.MULTILINE,
 85    )
 86
 87
 88GOOGLE_LIST_SECTIONS = ["Args", "Raises", "Attributes"]
 89"""Section headers listed in the official Google docstring style guide."""
 90
 91GOOGLE_LIST_SECTION_ALIASES = {
 92    "Parameters": "Args",
 93    "Params": "Args",
 94    "Arguments": "Args",
 95}
 96"""
 97Alternative section headers that are not listed in the official Google
 98docstring style guide but that we recognize as sections containing lists
 99nevertheless.
100"""
101
102
103def _google_section(m: re.Match[str]) -> str:
104    name = m.group("name")
105    contents = dedent(m.group("contents")).lstrip()
106
107    if name in GOOGLE_LIST_SECTION_ALIASES:
108        name = GOOGLE_LIST_SECTION_ALIASES[name]
109
110    if name in GOOGLE_LIST_SECTIONS:
111        items = _indented_list(contents)
112        contents = ""
113        for item in items:
114            try:
115                # first ":" on the first line
116                _, attr, desc = re.split(r"^(.+?:)", item, maxsplit=1)
117            except ValueError:
118                contents += " - " + indent(item, "   ")[3:]
119            else:
120                contents += f" - **{attr}** " + indent(desc, "   ")[3:]
121            contents += "\n"
122    else:
123        contents = indent(contents, "> ", lambda line: True)
124
125    if name == "Args":
126        name = "Arguments"
127
128    return f"\n###### {name}:\n{contents}\n"
129
130
def _indented_list(contents: str) -> list[str]:
    """
    Split a list string into its individual (dedented) elements.

    A new element starts on every non-empty line that does not begin with a
    space; empty lines and space-indented lines are attached to the element
    before them. Each element is then cleaned up with `inspect.cleandoc`.
    For example,

    foo:
        desc
    bar: int
        more desc

    yields one element for "foo:" (followed by "desc") and one element for
    "bar: int" (followed by "more desc").
    """
    # we expect this to be through cleandoc() already.
    assert not contents.startswith(" "), contents
    assert not contents.startswith("\n"), contents

    items: list[str] = []
    for raw_line in contents.splitlines(keepends=True):
        starts_new_item = bool(raw_line.strip()) and not raw_line.startswith(" ")
        if starts_new_item:
            items.append(raw_line)
        else:
            # continuation (blank or indented) of the current element
            items[-1] += raw_line

    return list(map(inspect.cleandoc, items))
165
166
def numpy(docstring: str) -> str:
    """Convert NumPy-style docstring sections into Markdown.

    A section is a capitalized heading line followed by a line of at least
    three dashes. Parameter-like sections are rendered with
    `_numpy_parameters`, "See Also" with `_numpy_seealso`, and every other
    section is emitted as dedented text below a level-6 heading.

    See <https://numpydoc.readthedocs.io/en/latest/format.html> for details.
    """
    sections = re.split(
        r"""
        ^([A-Z][A-Za-z ]+)\n  # a heading
        ---+\n+              # followed by a dashed line
        """,
        docstring,
        flags=re.VERBOSE | re.MULTILINE,
    )
    # re.split with one capturing group yields [preamble, heading, content, heading, content, ...].
    contents = sections[0]
    for heading, content in zip(sections[1::2], sections[2::2]):
        tail = ""
        if content.startswith(" "):
            # If the first line of section content is indented, we consider the section to be finished
            # on the first non-indented line. We take out the rest - the tail - here.
            # There may be no such line (the section runs to the end of the docstring),
            # in which case the split yields a single element and there is no tail.
            parts = re.split(r"\n(?![ \n])", content, maxsplit=1)
            if len(parts) == 2:
                content, tail = parts

        # The list helpers expect content that starts at column zero.
        content = dedent(content)

        if heading in (
            "Parameters",
            "Returns",
            "Yields",
            "Receives",
            "Other Parameters",
            "Raises",
            "Warns",
            "Attributes",
        ):
            contents += f"###### {heading}\n{_numpy_parameters(content)}"
        elif heading == "See Also":
            contents += f"###### {heading}\n{_numpy_seealso(content)}"
        else:
            contents += f"###### {heading}\n{content}"
        contents += tail
    return contents
206
207
def _numpy_seealso(content: str) -> str:
    """
    Convert a NumPy-style "See Also" section into Markdown.

    Each list entry becomes one line: the space-separated names before the
    first colon are wrapped in backticks and joined with commas, followed by
    the description (if any). Lines end in two spaces, a Markdown hard break.
    """
    rendered = []
    for entry in _indented_list(content):
        names, colon, description = entry.partition(":")
        suffix = f": {description}" if colon else ""

        stripped_names = (part.strip() for part in names.split(" "))
        quoted = ", ".join(f"`{n}`" for n in stripped_names if n)
        rendered.append(f"{quoted}{suffix}  \n")
    return "".join(rendered)
222
223
def _numpy_parameters(content: str) -> str:
    """
    Convert a NumPy-style parameter section into Markdown.

    Entries whose first line has the form "name : type" are rendered as
    a bullet with the name in bold, the type in parentheses, and the
    description indented below. Entries without a colon on the first line are
    rendered as a bold name, followed by the description if there is one.
    """
    bullets = []
    for entry in _indented_list(content):
        typed = re.match(r"^(.+):(.+)([\s\S]*)", entry)
        if typed:
            name, annotation, description = (part.strip() for part in typed.groups())
            bullets.append(
                f" - **{name}** ({annotation}):\n" f"{indent(description, '   ')}\n"
            )
            continue

        # No "name : type" header: the first line is the name, the rest the description.
        first_line, _, remainder = entry.partition("\n")
        name, description = first_line.strip(), remainder.strip()
        if description:
            bullets.append(f" - **{name}**: {description}\n")
        else:
            bullets.append(f" - **{name}**\n")
    return "".join(bullets) + "\n"
247
248
def rst(contents: str, source_file: Path | None) -> str:
    """
    Convert reStructuredText elements to Markdown.
    We support the most common elements, but we do not aim to mirror the full complexity of the spec here.

    The passes run in a fixed order: admonitions/directives, hyperlinks,
    code references, inline math, footnotes, and finally field lists.
    """

    def code_reference(m):
        # Group 1 is the optional ":py" domain prefix, which is dropped.
        kind, name = m.group(2), m.group(3)
        call_suffix = "()" if kind in ("meth", "func") else ""
        return f"`{name}{call_suffix}`"

    contents = _rst_admonitions(contents, source_file)
    contents = _rst_links(contents)

    # Code References: :obj:`foo` -> `foo`
    contents = re.sub(
        r"(:py)?:(mod|func|data|const|class|meth|attr|exc|obj):`([^`]+)`",
        code_reference,
        contents,
    )

    # Math: :math:`foo` -> \\( foo \\)
    # We don't use $ as that's not enabled by MathJax by default.
    contents = re.sub(r":math:`(.+?)`", r"\\\\( \1 \\\\)", contents)

    contents = _rst_footnotes(contents)
    return _rst_fields(contents)
280
281
def _rst_footnotes(contents: str) -> str:
    """
    Convert reStructuredText footnotes to Markdown footnotes.

    Footnote definitions (`.. [id] text`) are rewritten first and their ids
    recorded; references (`[id]_`) are then rewritten only if a definition
    with the same id was seen. Anonymous `#`/`*` labels are numbered
    `fn-1`, `fn-2`, ... in order of appearance, independently for each pass.
    """
    defined: set[str] = set()
    counter = 1

    def normalize(raw_id: str) -> str:
        # A bare "#" or "*" is an auto-numbered label; named labels such as
        # "#note" only lose their marker.
        nonlocal counter
        if raw_id in "*#":
            raw_id = f"fn-{counter}"
            counter += 1
        return raw_id.lstrip("#*")

    def definition(m: re.Match[str]) -> str:
        label = normalize(m.group("id"))
        defined.add(label)
        text = indent(m.group("content"), "   ").lstrip()
        return f"{m.group('indent')}[^{label}]: {text}"

    def reference(m: re.Match[str]) -> str:
        label = normalize(m.group("id"))
        return f"[^{label}]" if label in defined else m.group(0)

    # Register footnotes
    definition_pattern = re.compile(
        r"""
            ^(?P<indent>[ ]*)\.\.[ ]+\[(?P<id>\d+|[#*]\w*)](?P<content>.*
            (
                \n                 # empty lines
                |                  # or
                (?P=indent)[ ]+.+  # lines with indentation
            )*)$
            """,
        flags=re.MULTILINE | re.VERBOSE,
    )
    contents = definition_pattern.sub(definition, contents)

    # Restart the numbering so references line up with the definitions.
    counter = 1
    return re.sub(r"\[(?P<id>\d+|[#*]\w*)]_", reference, contents)
329
330
def _rst_links(contents: str) -> str:
    """
    Convert reStructuredText hyperlinks to Markdown links.

    Three passes are applied in order:

    1. Embedded URIs (a backquoted "text <url>" followed by an underscore)
       are turned into Markdown links directly.
    2. External hyperlink targets (`.. _name: http...`) are removed from the
       text and remembered under their whitespace-free, lowercased name.
    3. References (`name_`, or a backquoted name followed by an underscore)
       are replaced if a target with that name was registered, and left
       untouched otherwise.
    """
    links: dict[str, str] = {}

    def register_link(m: re.Match[str]) -> str:
        refid = re.sub(r"\s", "", m.group("id").lower())
        links[refid] = m.group("url")
        return ""  # the target definition itself is not rendered

    def replace_link(m: re.Match[str]) -> str:
        text = m.group("id")
        refid = re.sub(r"[\s`]", "", text.lower())
        try:
            return f"[{text.strip('`')}]({links[refid]})"
        except KeyError:
            # Not a known target (e.g. a plain identifier ending in "_").
            return m.group(0)

    # Embedded URIs
    contents = re.sub(
        r"`(?P<text>[^`]+)<(?P<url>.+?)>`_", r"[\g<text>](\g<url>)", contents
    )
    # External Hyperlink Targets
    # The explicit markup start is a literal "..": the dots must be escaped,
    # otherwise any two characters would be accepted and ordinary text such as
    # "xx _name: http://..." would be registered and removed.
    contents = re.sub(
        r"^\s*\.\.\s+_(?P<id>[^\n:]+):\s*(?P<url>http\S+)",
        register_link,
        contents,
        flags=re.MULTILINE,
    )
    contents = re.sub(r"(?P<id>[A-Za-z0-9_\-.:+]|`[^`]+`)_", replace_link, contents)
    return contents
361
362
def _rst_admonitions(contents: str, source_file: Path | None) -> str:
    """
    Convert reStructuredText admonitions - a bit tricky because they may already be indented themselves.
    <https://www.sphinx-doc.org/en/master/usage/restructuredtext/directives.html>

    `source_file` is only used to resolve the path of `include` directives;
    if it is `None`, paths are resolved against `Path(".")`.
    """
    version_labels = {
        "versionadded": "New in version",
        "versionchanged": "Changed in version",
        "deprecated": "Deprecated since version",
    }

    def render(m: re.Match[str]) -> str:
        ind = m.group("indent")
        kind = m.group("type")
        val = m.group("val").strip()
        body = dedent(m.group("contents")).strip()

        if kind == "include":
            base = source_file or Path(".")
            target = base.parent / val
            try:
                included = target.read_text("utf8", "replace")
            except OSError as e:
                warnings.warn(f"Cannot include {val!r}: {e}")
                included = "\n"
            # Included files may contain directives (and includes) of their own.
            return indent(_rst_admonitions(included, base.parent / val), ind)

        if kind == "math":
            return f"{ind}$${val}{body}$$\n"

        if kind in ("note", "warning", "danger"):
            heading = f"{ind}###### {val}\n" if val else ""
            return (
                f'{ind}<div class="pdoc-alert pdoc-alert-{kind}" markdown="1">\n'
                f"{heading}"
                f"{indent(body, ind)}\n"
                f"{ind}</div>\n"
            )

        if kind == "code-block":
            return f"{ind}```{val}\n{body}\n```\n"

        # Everything else is rendered as an emphasized label.
        if kind in version_labels:
            label = f"{version_labels[kind]} {val}"
        else:
            label = f"{kind} {val}".strip()

        if body:
            return f"{ind}*{label}:*\n{indent(body, ind)}\n\n"
        return f"{ind}*{label}.*\n"

    admonition = "note|warning|danger|versionadded|versionchanged|deprecated|seealso|math|include|code-block"
    directive_pattern = re.compile(
        rf"""
            ^(?P<indent>[ ]*)\.\.[ ]+(?P<type>{admonition})::(?P<val>.*)
            (?P<contents>(
                \n                 # empty lines
                |                  # or
                (?P=indent)[ ]+.+  # lines with indentation
            )*)$
        """,
        flags=re.MULTILINE | re.VERBOSE,
    )
    return directive_pattern.sub(render, contents)
429
430
def _rst_fields(contents: str) -> str:
    """
    Convert reStructuredText fields to Markdown.
    <https://www.sphinx-doc.org/en/master/usage/restructuredtext/basics.html#rst-field-lists>

    `:param:` and `:raises:` fields become bullet items, with a
    "Parameters" / "Raises" heading inserted before the first item of each
    kind. `:return:` becomes a "Returns" heading with a blockquoted body.
    `:type:` and `:rtype:` fields are dropped.
    """

    _has_parameter_section = False
    _has_raises_section = False

    def _rst_field(m: re.Match[str]) -> str:
        kind = m["type"]
        body = m["body"]

        if m["name"]:
            name = f"**{m['name'].strip()}**: "
        else:
            name = ""

        if kind == "param":
            nonlocal _has_parameter_section
            text = f" - {name}{body}"
            if not _has_parameter_section:
                _has_parameter_section = True
                text = "\n###### Parameters\n" + text
            return text
        elif kind == "type":
            return ""  # we expect users to use modern type annotations.
        elif kind == "return":
            body = indent(body, "> ", lambda line: True)
            return f"\n###### Returns\n{body}"
        elif kind == "rtype":
            return ""  # we expect users to use modern type annotations.
        elif kind == "raises":
            nonlocal _has_raises_section
            text = f" - {name}{body}"
            if not _has_raises_section:
                _has_raises_section = True
                text = "\n###### Raises\n" + text
            return text
        else:  # pragma: no cover
            raise AssertionError("unreachable")

    field = "param|type|return|rtype|raises"
    # The field name is matched non-greedily so that it ends at the FIRST colon
    # after it. A greedy match would run to the last colon on the line and
    # swallow descriptions that contain one (e.g. a URL) into the name.
    return re.sub(
        rf"""
            ^:(?P<type>{field})(?:[ ]+(?P<name>.+?))?:
            (?P<body>.*(
                (?:\n[ ]*)*  # maybe some empty lines followed by
                [ ]+.+       # lines with indentation
            )*(?:\n|$))
        """,
        _rst_field,
        contents,
        flags=re.MULTILINE | re.VERBOSE,
    )
@cache
def convert(docstring: str, docformat: str, source_file: pathlib.Path | None) -> str:
@cache
def convert(docstring: str, docformat: str, source_file: Path | None) -> str:
    """
    Convert `docstring` from `docformat` to Markdown.

    `docformat` is lowercased and matched by substring. If it mentions
    "google", "numpy" or "restructuredtext", the docstring first goes through
    `rst`; the Google and NumPy section converters are then applied for their
    respective flavors. Finally, local images are embedded via `embed_images`
    unless `source_file` is `None` or the `PDOC_EMBED_IMAGES` environment
    variable is set to "0".
    """
    flavor = docformat.lower()
    is_google = "google" in flavor
    is_numpy = "numpy" in flavor

    # All three supported flavors share the reStructuredText pass.
    if is_google or is_numpy or "restructuredtext" in flavor:
        docstring = rst(docstring, source_file)
    if is_google:
        docstring = google(docstring)
    if is_numpy:
        docstring = numpy(docstring)

    # Without a source file there is no directory to resolve images against.
    if source_file is None or os.environ.get("PDOC_EMBED_IMAGES") == "0":
        return docstring
    return embed_images(docstring, source_file)

Convert docstring from docformat to Markdown.

def embed_images(docstring: str, source_file: pathlib.Path) -> str:
def embed_images(docstring: str, source_file: Path) -> str:
    """
    Inline local images referenced from Markdown as base64 `data:` URIs.

    Every `![alt](href)` reference in `docstring` is resolved relative to the
    directory containing `source_file`. If the file can be read and its MIME
    type can be guessed from the file name, the reference is replaced with an
    embedded data URI. Otherwise (remote URLs, missing files, unknown file
    types, ...) the reference is left exactly as written.
    """

    def embed_local_image(m: re.Match) -> str:
        image_path = source_file.parent / m["href"]
        try:
            image_data = image_path.read_bytes()
            image_mime = mimetypes.guess_type(image_path)[0]
        except Exception:
            # Best effort: hrefs are arbitrary user text (URLs, bad paths, ...),
            # so any failure simply means "do not embed".
            return m[0]

        if image_mime is None:
            # Unknown file type: "data:None;base64,..." would be a broken URI,
            # so keep the original reference instead.
            return m[0]

        data = base64.b64encode(image_data).decode()
        return f"![{m['alt']}](data:{image_mime};base64,{data})"

    return re.sub(
        r"!\[\s*(?P<alt>.*?)\s*]\(\s*(?P<href>.+?)\s*\)",
        embed_local_image,
        docstring,
    )
    # TODO: Could probably do more here, e.g. support rST or raw HTML replacements.
def google(docstring: str) -> str:
def google(docstring: str) -> str:
    """
    Convert Google-style docstring sections into Markdown.

    A section is a capitalized heading line ending in a colon, followed by at
    least one empty or indented line. Each section found is rewritten by
    `_google_section`; all other text is returned unchanged.
    """
    section_pattern = re.compile(
        r"""
        ^(?P<name>[A-Z][A-Z a-z]+):\n
        (?P<contents>(
            \n        # empty lines
            |         # or
            [ \t]+.+  # lines with indentation
        )+)$
        """,
        flags=re.VERBOSE | re.MULTILINE,
    )
    return section_pattern.sub(_google_section, docstring)

Convert Google-style docstring sections into Markdown.

GOOGLE_LIST_SECTIONS = ['Args', 'Raises', 'Attributes']

Section headers listed in the official Google docstring style guide.

GOOGLE_LIST_SECTION_ALIASES = {'Parameters': 'Args', 'Params': 'Args', 'Arguments': 'Args'}

Alternative section headers that are not listed in the official Google docstring style guide but that we recognize as sections containing lists nevertheless.

def numpy(docstring: str) -> str:
def numpy(docstring: str) -> str:
    """Convert NumPy-style docstring sections into Markdown.

    A section is a capitalized heading line followed by a line of at least
    three dashes. Parameter-like sections are rendered with
    `_numpy_parameters`, "See Also" with `_numpy_seealso`, and every other
    section is emitted as dedented text below a level-6 heading.

    See <https://numpydoc.readthedocs.io/en/latest/format.html> for details.
    """
    sections = re.split(
        r"""
        ^([A-Z][A-Za-z ]+)\n  # a heading
        ---+\n+              # followed by a dashed line
        """,
        docstring,
        flags=re.VERBOSE | re.MULTILINE,
    )
    # re.split with one capturing group yields [preamble, heading, content, heading, content, ...].
    contents = sections[0]
    for heading, content in zip(sections[1::2], sections[2::2]):
        tail = ""
        if content.startswith(" "):
            # If the first line of section content is indented, we consider the section to be finished
            # on the first non-indented line. We take out the rest - the tail - here.
            # There may be no such line (the section runs to the end of the docstring),
            # in which case the split yields a single element and there is no tail.
            parts = re.split(r"\n(?![ \n])", content, maxsplit=1)
            if len(parts) == 2:
                content, tail = parts

        # The list helpers expect content that starts at column zero.
        content = dedent(content)

        if heading in (
            "Parameters",
            "Returns",
            "Yields",
            "Receives",
            "Other Parameters",
            "Raises",
            "Warns",
            "Attributes",
        ):
            contents += f"###### {heading}\n{_numpy_parameters(content)}"
        elif heading == "See Also":
            contents += f"###### {heading}\n{_numpy_seealso(content)}"
        else:
            contents += f"###### {heading}\n{content}"
        contents += tail
    return contents

Convert NumPy-style docstring sections into Markdown.

See https://numpydoc.readthedocs.io/en/latest/format.html for details.

def rst(contents: str, source_file: pathlib.Path | None) -> str:
def rst(contents: str, source_file: Path | None) -> str:
    """
    Convert reStructuredText elements to Markdown.
    We support the most common elements, but we do not aim to mirror the full complexity of the spec here.

    The passes run in a fixed order: admonitions/directives, hyperlinks,
    code references, inline math, footnotes, and finally field lists.
    """

    def code_reference(m):
        # Group 1 is the optional ":py" domain prefix, which is dropped.
        kind, name = m.group(2), m.group(3)
        call_suffix = "()" if kind in ("meth", "func") else ""
        return f"`{name}{call_suffix}`"

    contents = _rst_admonitions(contents, source_file)
    contents = _rst_links(contents)

    # Code References: :obj:`foo` -> `foo`
    contents = re.sub(
        r"(:py)?:(mod|func|data|const|class|meth|attr|exc|obj):`([^`]+)`",
        code_reference,
        contents,
    )

    # Math: :math:`foo` -> \\( foo \\)
    # We don't use $ as that's not enabled by MathJax by default.
    contents = re.sub(r":math:`(.+?)`", r"\\\\( \1 \\\\)", contents)

    contents = _rst_footnotes(contents)
    return _rst_fields(contents)

Convert reStructuredText elements to Markdown. We support the most common elements, but we do not aim to mirror the full complexity of the spec here.