This module handles the conversion of docstring flavors to Markdown.

The conversion from docstring flavors to Markdown is mostly done with regular expressions. This is not particularly beautiful, but good enough for our purposes. The alternative would be to depend on https://github.com/rr-/docstring_parser or a similar project, but that introduces more complexity than we are comfortable with.

If you miss a particular feature for your favorite flavor, contributions are welcome. That being said, please keep the complexity low and make sure that changes are accompanied by matching snapshot tests in test/testdata/.

 14from __future__ import annotations
 16import base64
 17from functools import cache
 18import inspect
 19import mimetypes
 20import os
 21from pathlib import Path
 22import re
 23from textwrap import dedent
 24from textwrap import indent
 25import warnings
 29def convert(docstring: str, docformat: str, source_file: Path | None) -> str:
 30    """
 31    Convert `docstring` from `docformat` to Markdown.
 32    """
 33    docformat = docformat.lower()
 35    if any(x in docformat for x in ["google", "numpy", "restructuredtext"]):
 36        docstring = rst(docstring, source_file)
 38    if "google" in docformat:
 39        docstring = google(docstring)
 41    if "numpy" in docformat:
 42        docstring = numpy(docstring)
 44    if source_file is not None and os.environ.get("PDOC_EMBED_IMAGES") != "0":
 45        docstring = embed_images(docstring, source_file)
 47    return docstring
 50def embed_images(docstring: str, source_file: Path) -> str:
 51    def embed_local_image(m: re.Match) -> str:
 52        image_path = source_file.parent / m["href"]
 53        try:
 54            image_data = image_path.read_bytes()
 55            image_mime = mimetypes.guess_type(image_path)[0]
 56        except Exception:
 57            return m[0]
 58        else:
 59            data = base64.b64encode(image_data).decode()
 60            return f"![{m['alt']}](data:{image_mime};base64,{data})"
 62    return re.sub(
 63        r"!\[\s*(?P<alt>.*?)\s*]\(\s*(?P<href>.+?)\s*\)",
 64        embed_local_image,
 65        docstring,
 66    )
 67    # TODO: Could probably do more here, e.g. support rST or raw HTML replacements.
 70def google(docstring: str) -> str:
 71    """Convert Google-style docstring sections into Markdown."""
 72    return re.sub(
 73        r"""
 74        ^(?P<name>[A-Z][A-Z a-z]+):\n
 75        (?P<contents>(
 76            \n        # empty lines
 77            |         # or
 78            [ \t]+.+  # lines with indentation
 79        )+)$
 80        """,
 81        _google_section,
 82        docstring,
 83        flags=re.VERBOSE | re.MULTILINE,
 84    )
# Sections with one of these names are rendered as Markdown bullet lists by
# `_google_section`; every other section is rendered as a block quote.
GOOGLE_LIST_SECTIONS = ["Args", "Raises", "Attributes"]
"""Section headers listed in the official Google docstring style guide."""
 91    "Parameters": "Args",
 92    "Params": "Args",
 93    "Arguments": "Args",
 96Alternative section headers that are not listed in the official Google
 97docstring style guide but that we recognize as sections containing lists
def _google_section(m: re.Match[str]) -> str:
    """
    Render a single Google-style section (a match of the pattern in `google`).

    List sections (see `GOOGLE_LIST_SECTIONS`) become a Markdown bullet list
    with the part up to the first ":" on an item's first line in bold.
    All other sections are rendered as a block quote.
    """
    name = m.group("name")
    contents = dedent(m.group("contents")).lstrip()

    # Map alternative headers (e.g. "Parameters") onto their canonical name;
    # names without an alias are kept unchanged.
    name = GOOGLE_LIST_SECTION_ALIASES.get(name, name)

    if name in GOOGLE_LIST_SECTIONS:
        entries: list[str] = []
        for item in _indented_list(contents):
            try:
                # first ":" on the first line
                _, attr, desc = re.split(r"^(.+?:)", item, maxsplit=1)
            except ValueError:
                # No ":" on the first line: emit the item as-is.
                entries.append(" - " + indent(item, "   ")[3:])
            else:
                entries.append(f" - **{attr}** " + indent(desc, "   ")[3:])
        contents = "".join(f"{entry}\n" for entry in entries)
    else:
        contents = indent(contents, "> ", lambda line: True)

    if name == "Args":
        name = "Arguments"

    return f"\n###### {name}:\n{contents}\n"
def _indented_list(contents: str) -> list[str]:
    """
    Split a list string into its individual (dedented) elements.

    A new element starts on every non-empty line that does not begin with a
    space; empty and space-indented lines belong to the preceding element.
    For example,

        foo:
            desc
        bar: int
            more desc
        baz:
            desc
                indented

    returns [
        "foo:\\ndesc",
        "bar: int\\nmore desc",
        "baz:\\ndesc\\n    indented",
    ]
    """
    # we expect this to be through cleandoc() already.
    assert not contents.startswith(" "), contents
    assert not contents.startswith("\n"), contents

    groups: list[list[str]] = []
    for raw in contents.splitlines(keepends=True):
        starts_element = bool(raw.strip()) and not raw.startswith(" ")
        if starts_element:
            groups.append([raw])
        else:
            # continuation of the current element
            groups[-1].append(raw)

    return [inspect.cleandoc("".join(group)) for group in groups]
def numpy(docstring: str) -> str:
    """Convert NumPy-style docstring sections into Markdown.

    A section is a capitalized heading underlined with at least three dashes.
    Parameter-like sections and "See Also" get dedicated list rendering, all
    other sections are emitted as dedented text below a heading.

    See <https://numpydoc.readthedocs.io/en/latest/format.html> for details.
    """
    sections = re.split(
        r"""
        ^([A-Z][A-Za-z ]+)\n  # a heading
        ---+\n+              # followed by a dashed line
        """,
        docstring,
        flags=re.VERBOSE | re.MULTILINE,
    )
    # re.split with one capturing group yields
    # [preamble, heading, content, heading, content, ...].
    contents = sections[0]
    for heading, content in zip(sections[1::2], sections[2::2]):
        tail = ""
        if content.startswith(" "):
            # If the first line of section content is indented, we consider the section to be finished
            # on the first non-indented line. We take out the rest - the tail - here.
            # There may be no such line, in which case the whole content belongs to the section.
            content, *rest = re.split(r"\n(?![ \n])", content, maxsplit=1)
            if rest:
                tail = rest[0]
            # The list helpers below expect content without leading indentation.
            content = dedent(content)

        if heading in (
            "Parameters",
            "Returns",
            "Yields",
            "Receives",
            "Other Parameters",
            "Raises",
            "Warns",
            "Attributes",
        ):
            contents += f"###### {heading}\n{_numpy_parameters(content)}"
        elif heading == "See Also":
            contents += f"###### {heading}\n{_numpy_seealso(content)}"
        else:
            contents += f"###### {heading}\n{dedent(content)}"
        contents += tail
    return contents
def _numpy_seealso(content: str) -> str:
    """
    Convert a NumPy-style "See Also" section into Markdown.

    Every entry becomes one line: the referenced names as inline code,
    separated by commas, followed by the description (if any). Names may be
    separated by commas and/or whitespace in the input.
    """
    lines: list[str] = []
    for item in _indented_list(content):
        funcstr, colon, desc = item.partition(":")
        desc = f": {desc}" if colon else ""

        # numpydoc lists several names as "func_a, func_b"; splitting on
        # commas as well keeps the commas out of the code spans.
        names = [name for name in re.split(r"[,\s]+", funcstr) if name]
        funcs = ", ".join(f"`{name}`" for name in names)
        # Two trailing spaces force a Markdown line break.
        lines.append(f"{funcs}{desc}  \n")
    return "".join(lines)
def _numpy_parameters(content: str) -> str:
    """
    Convert a NumPy-style parameter section into Markdown.

    Entries of the form `name : type` followed by a description are rendered
    as ` - **name** (type):` with the description indented below. Entries
    without a colon on their first line are rendered as ` - **name**: desc`,
    or just ` - **name**` if there is no description.
    """
    rendered: list[str] = []
    for item in _indented_list(content):
        # Split at the FIRST colon of the first line so that colons inside the
        # type (e.g. "bool, default: True") do not end up in the name.
        m = re.match(r"^(.+?):(.+)([\s\S]*)", item)
        if m:
            rendered.append(
                f" - **{m.group(1).strip()}** ({m.group(2).strip()}):\n"
                f"{indent(m.group(3).strip(), '   ')}\n"
            )
            continue

        name, _, desc = item.partition("\n")
        name = name.strip()
        desc = desc.strip()
        if desc:
            rendered.append(f" - **{name}**: {desc}\n")
        else:
            rendered.append(f" - **{name}**\n")
    return "".join(rendered) + "\n"
def rst(contents: str, source_file: Path | None) -> str:
    """
    Convert reStructuredText elements to Markdown.
    We support the most common elements, but we do not aim to mirror the full complexity of the spec here.

    Conversion order: admonitions/directives, hyperlinks, code references,
    inline math, footnotes, and finally field lists.
    """
    callable_kinds = ("meth", "func")

    def replace_reference(m):
        kind, name = m.group(2), m.group(3)
        # Callables get trailing parentheses.
        suffix = "()" if kind in callable_kinds else ""
        return f"`{name}{suffix}`"

    text = _rst_links(_rst_admonitions(contents, source_file))

    # Code References: :obj:`foo` -> `foo`
    text = re.sub(
        r"(:py)?:(mod|func|data|const|class|meth|attr|exc|obj):`([^`]+)`",
        replace_reference,
        text,
    )

    # Math: :math:`foo` -> \\( foo \\)
    # We don't use $ as that's not enabled by MathJax by default.
    text = re.sub(r":math:`(.+?)`", r"\\\\( \1 \\\\)", text)

    return _rst_fields(_rst_footnotes(text))
def _rst_footnotes(contents: str) -> str:
    """
    Convert reStructuredText footnotes.

    Footnote definitions (`.. [1] text`) become `[^1]: text` and references
    (`[1]_`) to a defined footnote become `[^1]`. Anonymous labels (`#`, `*`)
    are numbered `fn-1`, `fn-2`, ... in order of appearance, separately for
    definitions and references. References to undefined footnotes are kept.
    """
    known_ids: set[str] = set()
    next_auto = 1

    def resolve_id(raw: str) -> str:
        """Turn a raw label into the id used in the Markdown output."""
        nonlocal next_auto
        if raw in "*#":
            raw = f"fn-{next_auto}"
            next_auto += 1
        return raw.lstrip("#*")

    def register_footnote(m: re.Match[str]) -> str:
        fn_id = resolve_id(m.group("id"))
        known_ids.add(fn_id)
        text = indent(m.group("content"), "   ").lstrip()
        return f"{m.group('indent')}[^{fn_id}]: {text}"

    def replace_references(m: re.Match[str]) -> str:
        fn_id = resolve_id(m.group("id"))
        return f"[^{fn_id}]" if fn_id in known_ids else m.group(0)

    # First pass: rewrite and register footnote definitions.
    definition = re.compile(
        r"""
            ^(?P<indent>[ ]*)\.\.[ ]+\[(?P<id>\d+|[#*]\w*)](?P<content>.*
            (
                \n                 # empty lines
                |                  # or
                (?P=indent)[ ]+.+  # lines with indentation
            )*)$
            """,
        flags=re.MULTILINE | re.VERBOSE,
    )
    contents = definition.sub(register_footnote, contents)

    # Second pass: rewrite references, restarting the automatic numbering.
    next_auto = 1
    return re.sub(r"\[(?P<id>\d+|[#*]\w*)]_", replace_references, contents)
def _rst_links(contents: str) -> str:
    """
    Convert reStructuredText hyperlinks.

    Handles embedded URIs (`` `text <url>`_ ``), external hyperlink targets
    (`.. _name: http...`, which are removed from the output) and references
    to such targets, written either as a simple name (`name_`) or in
    backticks (`` `some name`_ ``). References without a matching target are
    left unchanged.
    """
    links: dict[str, str] = {}

    def register_link(m: re.Match[str]) -> str:
        # Reference names are matched case- and whitespace-insensitively.
        refid = re.sub(r"\s", "", m.group("id").lower())
        links[refid] = m.group("url")
        return ""

    def replace_link(m: re.Match[str]) -> str:
        text = m.group("id")
        refid = re.sub(r"[\s`]", "", text.lower())
        try:
            return f"[{text.strip('`')}]({links[refid]})"
        except KeyError:
            return m.group(0)

    # Embedded URIs
    contents = re.sub(
        r"`(?P<text>[^`]+)<(?P<url>.+?)>`_", r"[\g<text>](\g<url>)", contents
    )
    # External Hyperlink Targets
    # (the dots are escaped so that only a literal ".." introduces a target)
    contents = re.sub(
        r"^\s*\.\.\s+_(?P<id>[^\n:]+):\s*(?P<url>http\S+)",
        register_link,
        contents,
        flags=re.MULTILINE,
    )
    # References: a whole simple name (not just its last character) or a
    # backquoted phrase, followed by "_". The trailing \b keeps identifiers
    # such as "foo_bar" from being read as a reference to "foo".
    contents = re.sub(
        r"(?P<id>[A-Za-z0-9_\-.:+]+|`[^`]+`)_\b", replace_link, contents
    )
    return contents
def _rst_extract_options(contents: str) -> tuple[str, dict[str, str]]:
    """
    Extract options from the beginning of reStructuredText directives.

    Options are leading `:key: value` lines. Return the trimmed content
    (everything after the last option line) and a dict of options with
    whitespace-stripped values.
    """
    option_line = re.compile(r"^\s*:(.+?):(.*)([\s\S]*)")
    options = {}
    remaining = contents
    while True:
        found = option_line.match(remaining)
        if found is None:
            break
        options[found.group(1)] = found.group(2).strip()
        # Continue with whatever follows the option line just consumed.
        remaining = found.group(3)

    return remaining, options
def _rst_include_trim(contents: str, options: dict[str, str]) -> str:
    """
    Trim included file contents according to the include directive's options.

    Supports `start-line`/`end-line` (zero-based line indices into the
    original text) and `start-after`/`end-before` (marker texts). As in
    docutils, `end-before` only considers text after the `start-after` marker.

    Raises `ValueError` if a line option is not an integer or a marker text
    cannot be found.

    <https://docutils.sourceforge.io/docs/ref/rst/directives.html#include-options>
    """
    if "end-line" in options or "start-line" in options:
        lines = contents.splitlines()
        # Cut the end first so that start-line still indexes the original lines.
        if i := options.get("end-line"):
            lines = lines[: int(i)]
        if i := options.get("start-line"):
            lines = lines[int(i) :]
        contents = "\n".join(lines)
    # start-after must be applied before end-before: the end marker is only
    # searched in what follows the start marker.
    if x := options.get("start-after"):
        contents = contents[contents.index(x) + len(x) :]
    if x := options.get("end-before"):
        contents = contents[: contents.index(x)]
    return contents
def _rst_admonitions(contents: str, source_file: Path | None) -> str:
    """
    Convert reStructuredText admonitions - a bit tricky because they may already be indented themselves.

    Besides the alert-style admonitions this also handles the `math`,
    `include`, `code-block` and version directives. Included files are
    resolved relative to the directory of `source_file` and are processed
    recursively.

    <https://www.sphinx-doc.org/en/master/usage/restructuredtext/directives.html>
    """
    version_phrases = {
        "versionadded": "New in version",
        "versionchanged": "Changed in version",
        "deprecated": "Deprecated since version",
    }

    def _rst_admonition(m: re.Match[str]) -> str:
        ind = m.group("indent")
        kind = m.group("type")
        val = m.group("val").strip()
        body, options = _rst_extract_options(dedent(m.group("contents")).strip())

        if kind == "include":
            base = source_file or Path(".")
            target = base.parent / val
            try:
                included = target.read_text("utf8", "replace")
            except OSError as e:
                warnings.warn(f"Cannot include {val!r}: {e}")
                included = "\n"
            try:
                included = _rst_include_trim(included, options) + "\n"
            except ValueError as e:
                # Fall back to the untrimmed text.
                warnings.warn(f"Failed to process include options for {val!r}: {e}")
            included = embed_images(_rst_admonitions(included, target), target)
            return indent(included, ind)

        if kind == "math":
            return f"{ind}$${val}{body}$$\n"

        if kind in ("note", "warning", "danger"):
            heading = f"{ind}###### {val}\n" if val else ""
            return (
                f'{ind}<div class="alert {kind}" markdown="1">\n'
                f"{heading}"
                f"{indent(body, ind)}\n"
                f"{ind}</div>\n"
            )

        if kind == "code-block":
            return f"{ind}```{val}\n{body}\n```\n"

        # Everything else is rendered as an emphasized label.
        if kind in version_phrases:
            label = f"{version_phrases[kind]} {val}"
        else:
            label = f"{kind} {val}".strip()

        if body:
            return f"{ind}*{label}:*\n{indent(body, ind)}\n\n"
        return f"{ind}*{label}.*\n"

    admonition = "note|warning|danger|versionadded|versionchanged|deprecated|seealso|math|include|code-block"
    directive = re.compile(
        rf"""
            ^(?P<indent>[ ]*)\.\.[ ]+(?P<type>{admonition})::(?P<val>.*)
            (?P<contents>(
                \n                 # empty lines
                |                  # or
                (?P=indent)[ ]+.+  # lines with indentation
            )*)$
        """,
        flags=re.MULTILINE | re.VERBOSE,
    )
    return directive.sub(_rst_admonition, contents)
def _rst_fields(contents: str) -> str:
    """
    Convert reStructuredText fields to Markdown.

    `:param:` and `:raises:` fields become list items below a "Parameters" /
    "Raises" heading (emitted once, before the first such field), `:return:`
    becomes a block quote below a "Returns" heading, and `:type:` / `:rtype:`
    fields are dropped.

    <https://www.sphinx-doc.org/en/master/usage/restructuredtext/basics.html#rst-field-lists>
    """
    # Headings that have already been written; each is emitted only once.
    emitted_headings: set[str] = set()

    def _list_item(heading: str, name: str, body: str) -> str:
        """Render one list entry, prefixed by `heading` on first use."""
        text = f" - {name}{body}"
        if heading not in emitted_headings:
            emitted_headings.add(heading)
            text = f"\n###### {heading}\n" + text
        return text

    def _rst_field(m: re.Match[str]) -> str:
        kind = m["type"]
        body = m["body"]
        name = f"**{m['name'].strip()}**: " if m["name"] else ""

        if kind == "param":
            return _list_item("Parameters", name, body)
        if kind == "raises":
            return _list_item("Raises", name, body)
        if kind == "return":
            quoted = indent(body, "> ", lambda line: True)
            return f"\n###### Returns\n{quoted}"
        if kind in ("type", "rtype"):
            return ""  # we expect users to use modern type annotations.
        raise AssertionError("unreachable")  # pragma: no cover

    field = "param|type|return|rtype|raises"
    # The field name is matched non-greedily so that it ends at the first
    # colon; colons in the description stay in the body.
    return re.sub(
        rf"""
            ^:(?P<type>{field})(?:[ ]+(?P<name>.+?))?:
            (?P<body>.*(
                (?:\n[ ]*)*  # maybe some empty lines followed by
                [ ]+.+       # lines with indentation
            )*(?:\n|$))
        """,
        _rst_field,
        contents,
        flags=re.MULTILINE | re.VERBOSE,
    )
