pdoc.docstrings
This module handles the conversion of docstring flavors to Markdown.
The conversion from docstring flavors to Markdown is mostly done with regular expressions. This is not particularly beautiful, but good enough for our purposes. The alternative would be to depend on https://github.com/rr-/docstring_parser or a similar project, but that introduces more complexity than we are comfortable with.
If you miss a particular feature for your favorite flavor, contributions are welcome.
That being said, please keep the complexity low and make sure that changes are accompanied by matching snapshot tests in `test/testdata/`.
"""
This module handles the conversion of docstring flavors to Markdown.

The conversion from docstring flavors to Markdown is mostly done with regular expressions.
This is not particularly beautiful, but good enough for our purposes.
The alternative would be to depend on <https://github.com/rr-/docstring_parser> or a similar project,
but that introduces more complexity than we are comfortable with.

If you miss a particular feature for your favorite flavor, contributions are welcome.
That being said, please keep the complexity low and make sure that changes are
accompanied by matching snapshot tests in `test/testdata/`.
"""

from __future__ import annotations

import base64
from functools import cache
import inspect
import mimetypes
import os
from pathlib import Path
import re
from textwrap import dedent
from textwrap import indent
import warnings

AnyException = (SystemExit, GeneratorExit, Exception)
"""BaseException, but excluding KeyboardInterrupt.

Modules may raise SystemExit on import (which we want to catch),
but we don't want to catch a user's KeyboardInterrupt.
"""


@cache
def convert(docstring: str, docformat: str, source_file: Path | None) -> str:
    """
    Convert `docstring` from `docformat` to Markdown.

    `docformat` is matched case-insensitively by substring: "google", "numpy"
    and "restructuredtext" all enable the reStructuredText pass, and "google" /
    "numpy" additionally enable their own section conversion. Local images are
    embedded afterwards when `source_file` is given, unless the environment
    variable `PDOC_EMBED_IMAGES` is set to "0".

    Raises:
        RuntimeError: if any conversion step fails. The original exception is
            chained as the cause.
    """
    docformat = docformat.lower()

    try:
        if any(x in docformat for x in ["google", "numpy", "restructuredtext"]):
            docstring = rst(docstring, source_file)

        if "google" in docformat:
            docstring = google(docstring)

        if "numpy" in docformat:
            docstring = numpy(docstring)

        if source_file is not None and os.environ.get("PDOC_EMBED_IMAGES") != "0":
            docstring = embed_images(docstring, source_file)

    except AnyException as e:
        raise RuntimeError(
            'Docstring processing failed for docstring=\n"""\n'
            + docstring
            + f'\n"""\n{source_file=}\n{docformat=}'
        ) from e

    return docstring


def embed_images(docstring: str, source_file: Path) -> str:
    """
    Replace references to local images in `docstring` with base64 `data:` URIs.

    Both Markdown images (`![alt](href)`) and `src="href"` attributes are
    handled. `href` is resolved relative to the directory of `source_file`.
    References that cannot be read (missing files, remote URLs, ...) are left
    untouched.
    """

    def local_image_to_data_uri(href: str) -> str:
        image_path = source_file.parent / href
        image_data = image_path.read_bytes()
        # guess_type() yields None for unknown extensions; fall back to a valid
        # generic media type instead of emitting a literal "data:None;...".
        image_mime = mimetypes.guess_type(image_path)[0] or "application/octet-stream"
        image_data_b64 = base64.b64encode(image_data).decode()
        return f"data:{image_mime};base64,{image_data_b64}"

    def embed_local_image(m: re.Match) -> str:
        try:
            href = local_image_to_data_uri(m["href"])
        except Exception:
            # Deliberately best-effort: keep the original reference as-is.
            return m[0]
        else:
            return m["before"] + href + m["after"]

    # TODO: Could probably do more here, e.g. support rST replacements.
    for regex in [
        r"(?P<before>!\[\s*.*?\s*]\(\s*)(?P<href>.+?)(?P<after>\s*\))",
        r"""(?P<before>src=['"])(?P<href>.+?)(?P<after>['"])""",
    ]:
        docstring = re.sub(regex, embed_local_image, docstring)
    return docstring


def google(docstring: str) -> str:
    """Convert Google-style docstring sections into Markdown."""
    return re.sub(
        r"""
        ^(?P<name>[A-Z][A-Z a-z]+):\n
        (?P<contents>(
            \n                  # empty lines
            |                   # or
            [ \t]+.+            # lines with indentation
        )+)$
        """,
        _google_section,
        docstring,
        flags=re.VERBOSE | re.MULTILINE,
    )


GOOGLE_LIST_SECTIONS = ["Args", "Raises", "Attributes"]
"""Section headers listed in the official Google docstring style guide."""

GOOGLE_LIST_SECTION_ALIASES = {
    "Parameters": "Args",
    "Params": "Args",
    "Arguments": "Args",
}
"""
Alternative section headers that are not listed in the official Google
docstring style guide but that we recognize as sections containing lists
nevertheless.
"""


def _google_section(m: re.Match[str]) -> str:
    """
    Render one matched Google-style section (groups `name` and `contents`).

    List sections (see `GOOGLE_LIST_SECTIONS` and its aliases) become a Markdown
    list with the part up to the first ":" of each item in bold; every other
    section is rendered as a blockquote.
    """
    name = m.group("name")
    contents = dedent(m.group("contents")).lstrip()

    if name in GOOGLE_LIST_SECTION_ALIASES:
        name = GOOGLE_LIST_SECTION_ALIASES[name]

    if name in GOOGLE_LIST_SECTIONS:
        items = _indented_list(contents)
        contents = ""
        for item in items:
            # The text is indented by three characters and the prefix is then
            # sliced off the first line again, so that only continuation lines
            # end up indented. The prefix width must match the `[3:]` slice.
            try:
                # first ":" on the first line
                _, attr, desc = re.split(r"^(.+?:)", item, maxsplit=1)
            except ValueError:
                contents += " - " + indent(item, "   ")[3:]
            else:
                contents += f" - **{attr}** " + indent(desc, "   ")[3:]
            contents += "\n"
    else:
        contents = indent(contents, "> ", lambda line: True)

    if name == "Args":
        name = "Arguments"

    return f"\n###### {name}:\n{contents}\n"


def _indented_list(contents: str) -> list[str]:
    r"""
    Convert a list string into individual (dedented) elements. For example,

        foo:
            desc
        bar: int
            more desc
        baz:
            desc
                indented

    returns [
        "foo:\ndesc",
        "bar: int\nmore desc",
        "baz:\ndesc\n    indented",
    ]
    """
    # we expect this to be through cleandoc() already.
    assert not contents.startswith(" "), contents
    assert not contents.startswith("\n"), contents

    ret: list[str] = []
    for line in contents.splitlines(keepends=True):
        empty = not line.strip()
        indented = line.startswith(" ")
        if not (empty or indented):
            # new section
            ret.append(line)
        else:
            # append to current section
            ret[-1] += line

    return [inspect.cleandoc(x) for x in ret]


def numpy(docstring: str) -> str:
    """Convert NumPy-style docstring sections into Markdown.

    See <https://numpydoc.readthedocs.io/en/latest/format.html> for details.
    """
    sections = re.split(
        r"""
        ^([A-Z][A-Za-z ]+)\n  # a heading
        ---+\n+              # followed by a dashed line
        """,
        docstring,
        flags=re.VERBOSE | re.MULTILINE,
    )
    # re.split() with one capture group alternates text and headings:
    # [preamble, heading1, content1, heading2, content2, ...]
    contents = sections[0]
    for heading, content in zip(sections[1::2], sections[2::2]):
        if content.startswith(" ") and re.search(r"\n(?![ \n])", content):
            # If the first line of section content is indented, we consider the section to be finished
            # on the first non-indented line. We take out the rest - the tail - here.
            content, tail = re.split(r"\n(?![ \n])", content, maxsplit=1)
        else:
            tail = ""

        content = dedent(content)

        if heading in (
            "Parameters",
            "Returns",
            "Yields",
            "Receives",
            "Other Parameters",
            "Raises",
            "Warns",
            "Attributes",
        ):
            contents += f"###### {heading}\n{_numpy_parameters(content)}"
        elif heading == "See Also":
            contents += f"###### {heading}\n{_numpy_seealso(content)}"
        else:
            contents += f"###### {heading}\n{content}"
        contents += tail
    return contents


def _numpy_seealso(content: str) -> str:
    """Convert a NumPy-style "See Also" section into Markdown"""
    contents = ""
    for item in _indented_list(content):
        if ":" in item:
            funcstr, desc = item.split(":", maxsplit=1)
            desc = f": {desc}"
        else:
            funcstr, desc = item, ""

        funclist = [f.strip() for f in funcstr.split(" ")]
        funcs = ", ".join(f"`{f}`" for f in funclist if f)
        contents += f"{funcs}{desc} \n"
    return contents


def _numpy_parameters(content: str) -> str:
    """Convert a NumPy-style parameter section into Markdown"""
    contents = ""
    for item in _indented_list(content):
        # "name : type" on the first line, description on the following lines.
        m = re.match(r"^(.+):(.+)([\s\S]*)", item)
        if m:
            contents += (
                f" - **{m.group(1).strip()}** ({m.group(2).strip()}):\n"
                f"{indent(m.group(3).strip(), ' ')}\n"
            )
        else:
            if "\n" in item:
                name, desc = item.split("\n", maxsplit=1)
                name = name.strip()
                desc = desc.strip()
            else:
                name, desc = item.strip(), ""

            if desc:
                contents += f" - **{name}**: {desc}\n"
            else:
                contents += f" - **{name}**\n"
    return f"{contents}\n"


def rst(contents: str, source_file: Path | None) -> str:
    """
    Convert reStructuredText elements to Markdown.
    We support the most common elements, but we do not aim to mirror the full complexity of the spec here.
    """
    contents = _rst_admonitions(contents, source_file)
    contents = _rst_links(contents)

    def replace_reference(m: re.Match[str]) -> str:
        _, kind, name = m.groups()
        if kind in ("meth", "func"):
            return f"`{name}()`"
        else:
            return f"`{name}`"

    # Code References: :obj:`foo` -> `foo`
    contents = re.sub(
        r"(:py)?:(mod|func|data|const|class|meth|attr|exc|obj):`([^`]+)`",
        replace_reference,
        contents,
    )

    # Math: :math:`foo` -> \\( foo \\)
    # We don't use $ as that's not enabled by MathJax by default.
    contents = re.sub(r":math:`(.+?)`", r"\\\\( \1 \\\\)", contents)

    contents = _rst_footnotes(contents)

    contents = _rst_fields(contents)

    return contents


def _rst_footnotes(contents: str) -> str:
    """Convert reStructuredText footnotes"""
    footnotes: set[str] = set()
    autonum = 1

    def register_footnote(m: re.Match[str]) -> str:
        nonlocal autonum
        fn_id = m.group("id")
        if fn_id in "*#":
            # anonymous auto-numbered/auto-symbol footnote
            fn_id = f"fn-{autonum}"
            autonum += 1
        fn_id = fn_id.lstrip("#*")
        footnotes.add(fn_id)
        content = indent(m.group("content"), " ").lstrip()
        return f"{m.group('indent')}[^{fn_id}]: {content}"

    # Register footnotes
    contents = re.sub(
        r"""
        ^(?P<indent>[ ]*)\.\.[ ]+\[(?P<id>\d+|[#*]\w*)](?P<content>.*
        (
            \n                  # empty lines
            |                   # or
            (?P=indent)[ ]+.+   # lines with indentation
        )*)$
        """,
        register_footnote,
        contents,
        flags=re.MULTILINE | re.VERBOSE,
    )

    def replace_references(m: re.Match[str]) -> str:
        nonlocal autonum
        fn_id = m.group("id")
        if fn_id in "*#":
            fn_id = f"fn-{autonum}"
            autonum += 1
        fn_id = fn_id.lstrip("#*")
        if fn_id in footnotes:
            return f"[^{fn_id}]"
        else:
            return m.group(0)

    # Restart numbering so that anonymous references pair up with the
    # anonymous footnotes registered above, in document order.
    autonum = 1
    contents = re.sub(r"\[(?P<id>\d+|[#*]\w*)]_", replace_references, contents)
    return contents


def _rst_links(contents: str) -> str:
    """Convert reStructuredText hyperlinks"""
    links: dict[str, str] = {}

    def register_link(m: re.Match[str]) -> str:
        refid = re.sub(r"\s", "", m.group("id").lower())
        links[refid] = m.group("url")
        return ""

    def replace_link(m: re.Match[str]) -> str:
        text = m.group("id")
        refid = re.sub(r"[\s`]", "", text.lower())
        try:
            return f"[{text.strip('`')}]({links[refid]})"
        except KeyError:
            return m.group(0)

    # Embedded URIs
    contents = re.sub(
        r"`(?P<text>[^`]+)<(?P<url>.+?)>`_", r"[\g<text>](\g<url>)", contents
    )
    # External Hyperlink Targets
    # The leading ".." must be matched literally: unescaped dots would accept any
    # two characters and silently delete ordinary lines such as "ab _x: http://...".
    contents = re.sub(
        r"^\s*\.\.\s+_(?P<id>[^\n:]+):\s*(?P<url>http\S+)",
        register_link,
        contents,
        flags=re.MULTILINE,
    )
    contents = re.sub(r"(?P<id>[A-Za-z0-9_\-.:+]|`[^`]+`)_", replace_link, contents)
    return contents


def _rst_extract_options(contents: str) -> tuple[str, dict[str, str]]:
    """
    Extract options from the beginning of reStructuredText directives.

    Return the trimmed content and a dict of options.
    """
    options: dict[str, str] = {}
    while match := re.match(r"^\s*:(.+?):(.*)([\s\S]*)", contents):
        key, value, contents = match.groups()
        options[key] = value.strip()

    return contents, options


def _rst_include_trim(contents: str, options: dict[str, str]) -> str:
    """
    Trim included file contents according to the `include` directive options
    `start-line`, `end-line`, `start-after` and `end-before`.

    <https://docutils.sourceforge.io/docs/ref/rst/directives.html#include-options>

    Raises:
        ValueError: if a line option is not an integer, or if a
            `start-after`/`end-before` marker does not occur in `contents`.
    """
    if "end-line" in options or "start-line" in options:
        lines = contents.splitlines()
        # Cut the end first so that the start index still refers to the original lines.
        if i := options.get("end-line"):
            lines = lines[: int(i)]
        if i := options.get("start-line"):
            lines = lines[int(i) :]
        contents = "\n".join(lines)
    if x := options.get("end-before"):
        contents = contents[: contents.index(x)]
    if x := options.get("start-after"):
        contents = contents[contents.index(x) + len(x) :]
    return contents


def _rst_admonitions(contents: str, source_file: Path | None) -> str:
    """
    Convert reStructuredText admonitions - a bit tricky because they may already be indented themselves.
    <https://www.sphinx-doc.org/en/master/usage/restructuredtext/directives.html>
    """

    def _rst_admonition(m: re.Match[str]) -> str:
        ind = m.group("indent")
        kind = m.group("type")
        val = m.group("val").strip()
        contents = dedent(m.group("contents")).strip()
        contents, options = _rst_extract_options(contents)

        if kind == "include":
            loc = source_file or Path(".")
            try:
                included = (loc.parent / val).read_text("utf8", "replace")
            except OSError as e:
                warnings.warn(f"Cannot include {val!r}: {e}")
                included = "\n"
            try:
                included = _rst_include_trim(included, options) + "\n"
            except ValueError as e:
                warnings.warn(f"Failed to process include options for {val!r}: {e}")
            # Included files may contain directives and images themselves;
            # resolve those relative to the included file.
            included = _rst_admonitions(included, loc.parent / val)
            included = embed_images(included, loc.parent / val)
            return indent(included, ind)
        if kind == "math":
            return f"{ind}$${val}{contents}$$\n"
        if kind in ("note", "warning", "danger"):
            if val:
                heading = f"{ind}###### {val}\n"
            else:
                heading = ""
            return (
                f'{ind}<div class="alert {kind}" markdown="1">\n'
                f"{heading}"
                f"{indent(contents, ind)}\n"
                f"{ind}</div>\n"
            )
        if kind == "code-block":
            return f"{ind}```{val}\n{contents}\n```\n"
        if kind == "versionadded":
            text = f"New in version {val}"
        elif kind == "versionchanged":
            text = f"Changed in version {val}"
        elif kind == "deprecated":
            text = f"Deprecated since version {val}"
        else:
            text = f"{kind} {val}".strip()

        if contents:
            text = f"{ind}*{text}:*\n{indent(contents, ind)}\n\n"
        else:
            text = f"{ind}*{text}.*\n"

        return text

    admonition = "note|warning|danger|versionadded|versionchanged|deprecated|seealso|math|include|code-block"
    return re.sub(
        rf"""
        ^(?P<indent>[ ]*)\.\.[ ]+(?P<type>{admonition})::(?P<val>.*)
        (?P<contents>(
            \n                  # empty lines
            |                   # or
            (?P=indent)[ ]+.+   # lines with indentation
        )*)$
        """,
        _rst_admonition,
        contents,
        flags=re.MULTILINE | re.VERBOSE,
    )


def _rst_fields(contents: str) -> str:
    """
    Convert reStructuredText fields to Markdown.
    <https://www.sphinx-doc.org/en/master/usage/restructuredtext/basics.html#rst-field-lists>
    """

    # A heading is emitted only in front of the first field of each kind.
    _has_parameter_section = False
    _has_raises_section = False

    def _rst_field(m: re.Match[str]) -> str:
        nonlocal _has_parameter_section, _has_raises_section
        kind = m["type"]
        body = m["body"]

        if m["name"]:
            name = f"**{m['name'].strip()}**: "
        else:
            name = ""

        if kind == "param":
            text = f" - {name}{body}"
            if not _has_parameter_section:
                _has_parameter_section = True
                text = "\n###### Parameters\n" + text
            return text
        elif kind == "type":
            return ""  # we expect users to use modern type annotations.
        elif kind == "return":
            body = indent(body, "> ", lambda line: True)
            return f"\n###### Returns\n{body}"
        elif kind == "rtype":
            return ""  # we expect users to use modern type annotations.
        elif kind == "raises":
            text = f" - {name}{body}"
            if not _has_raises_section:
                _has_raises_section = True
                text = "\n###### Raises\n" + text
            return text
        else:  # pragma: no cover
            raise AssertionError("unreachable")

    field = "param|type|return|rtype|raises"
    return re.sub(
        rf"""
        ^:(?P<type>{field})(?:[ ]+(?P<name>.+))?:
        (?P<body>.*(
            (?:\n[ ]*)*   # maybe some empty lines followed by
            [ ]+.+        # lines with indentation
        )*(?:\n|$))
        """,
        _rst_field,
        contents,
        flags=re.MULTILINE | re.VERBOSE,
    )
BaseException, but excluding KeyboardInterrupt.
Modules may raise SystemExit on import (which we want to catch), but we don't want to catch a user's KeyboardInterrupt.
@cache
def convert(docstring: str, docformat: str, source_file: Path | None) -> str:
    """
    Translate `docstring`, written in the flavor named by `docformat`, into Markdown.

    The flavor name is compared case-insensitively and by substring. Any of
    "google", "numpy" or "restructuredtext" triggers the reStructuredText pass;
    "google" and "numpy" then run their own section conversion on top. When a
    `source_file` is known and the `PDOC_EMBED_IMAGES` environment variable is
    not "0", local images are embedded as the final step.

    Raises:
        RuntimeError: if any step fails; the underlying exception is chained.
    """
    docformat = docformat.lower()
    rst_based_flavors = ("google", "numpy", "restructuredtext")

    try:
        if any(flavor in docformat for flavor in rst_based_flavors):
            docstring = rst(docstring, source_file)

        if "google" in docformat:
            docstring = google(docstring)

        if "numpy" in docformat:
            docstring = numpy(docstring)

        if source_file is not None:
            if os.environ.get("PDOC_EMBED_IMAGES") != "0":
                docstring = embed_images(docstring, source_file)

    except AnyException as e:
        message = (
            f'Docstring processing failed for docstring=\n"""\n{docstring}'
            f'\n"""\n{source_file=}\n{docformat=}'
        )
        raise RuntimeError(message) from e

    return docstring
Convert `docstring` from `docformat` to Markdown.
def embed_images(docstring: str, source_file: Path) -> str:
    """
    Replace references to local images in `docstring` with base64 `data:` URIs.

    Both Markdown images (`![alt](href)`) and `src="href"` attributes are
    handled. `href` is resolved relative to the directory of `source_file`.
    References that cannot be read (missing files, remote URLs, ...) are left
    untouched.
    """

    def local_image_to_data_uri(href: str) -> str:
        image_path = source_file.parent / href
        image_data = image_path.read_bytes()
        # guess_type() yields None for unknown extensions; fall back to a valid
        # generic media type instead of emitting a literal "data:None;...".
        image_mime = mimetypes.guess_type(image_path)[0] or "application/octet-stream"
        image_data_b64 = base64.b64encode(image_data).decode()
        return f"data:{image_mime};base64,{image_data_b64}"

    def embed_local_image(m: re.Match) -> str:
        try:
            href = local_image_to_data_uri(m["href"])
        except Exception:
            # Deliberately best-effort: keep the original reference as-is.
            return m[0]
        else:
            return m["before"] + href + m["after"]

    # TODO: Could probably do more here, e.g. support rST replacements.
    for regex in [
        r"(?P<before>!\[\s*.*?\s*]\(\s*)(?P<href>.+?)(?P<after>\s*\))",
        r"""(?P<before>src=['"])(?P<href>.+?)(?P<after>['"])""",
    ]:
        docstring = re.sub(regex, embed_local_image, docstring)
    return docstring
91def google(docstring: str) -> str: 92 """Convert Google-style docstring sections into Markdown.""" 93 return re.sub( 94 r""" 95 ^(?P<name>[A-Z][A-Z a-z]+):\n 96 (?P<contents>( 97 \n # empty lines 98 | # or 99 [ \t]+.+ # lines with indentation 100 )+)$ 101 """, 102 _google_section, 103 docstring, 104 flags=re.VERBOSE | re.MULTILINE, 105 )
Convert Google-style docstring sections into Markdown.
Section headers listed in the official Google docstring style guide.
Alternative section headers that are not listed in the official Google docstring style guide but that we recognize as sections containing lists nevertheless.
def numpy(docstring: str) -> str:
    """Rewrite NumPy-style docstring sections as Markdown.

    A section is a capitalized heading line followed by a line of at least
    three dashes. Parameter-like sections and "See Also" are rendered by
    dedicated helpers; every other section keeps its (dedented) text.

    See <https://numpydoc.readthedocs.io/en/latest/format.html> for details.
    """
    parameter_like_headings = (
        "Parameters",
        "Returns",
        "Yields",
        "Receives",
        "Other Parameters",
        "Raises",
        "Warns",
        "Attributes",
    )
    parts = re.split(
        r"""
        ^([A-Z][A-Za-z ]+)\n  # a heading
        ---+\n+              # followed by a dashed line
        """,
        docstring,
        flags=re.VERBOSE | re.MULTILINE,
    )
    # With one capture group, the split result alternates:
    # [preamble, heading1, body1, heading2, body2, ...]
    rendered = parts[0]
    for heading, body in zip(parts[1::2], parts[2::2]):
        tail = ""
        if body.startswith(" "):
            # An indented section ends at the first non-indented line;
            # everything after that is passed through verbatim as the tail.
            pieces = re.split(r"\n(?![ \n])", body, maxsplit=1)
            if len(pieces) == 2:
                body, tail = pieces

        body = dedent(body)

        if heading in parameter_like_headings:
            section_text = _numpy_parameters(body)
        elif heading == "See Also":
            section_text = _numpy_seealso(body)
        else:
            section_text = body

        rendered += f"###### {heading}\n{section_text}"
        rendered += tail
    return rendered
Convert NumPy-style docstring sections into Markdown.
See https://numpydoc.readthedocs.io/en/latest/format.html for details.
def rst(contents: str, source_file: Path | None) -> str:
    """
    Translate reStructuredText constructs in `contents` into Markdown.

    Only the most common elements are handled; mirroring the full complexity
    of the spec is a non-goal. The passes run in a fixed order: admonitions,
    hyperlinks, code references, inline math, footnotes, and finally fields.
    `source_file` is handed to the admonition pass.
    """

    def replace_reference(m: re.Match[str]) -> str:
        kind = m.group(2)
        name = m.group(3)
        # Callables get a trailing "()" so they read as calls.
        call_suffix = "()" if kind in ("meth", "func") else ""
        return f"`{name}{call_suffix}`"

    contents = _rst_links(_rst_admonitions(contents, source_file))

    # Code References: :obj:`foo` -> `foo`
    code_reference = r"(:py)?:(mod|func|data|const|class|meth|attr|exc|obj):`([^`]+)`"
    contents = re.sub(code_reference, replace_reference, contents)

    # Math: :math:`foo` -> \\( foo \\)
    # We don't use $ as that's not enabled by MathJax by default.
    contents = re.sub(r":math:`(.+?)`", r"\\\\( \1 \\\\)", contents)

    return _rst_fields(_rst_footnotes(contents))
Convert reStructuredText elements to Markdown. We support the most common elements, but we do not aim to mirror the full complexity of the spec here.