pdoc.docstrings
This module handles the conversion of docstring flavors to Markdown.
The conversion from docstring flavors to Markdown is mostly done with regular expressions. This is not particularly beautiful, but good enough for our purposes. The alternative would be to depend on https://github.com/rr-/docstring_parser or a similar project, but that introduces more complexity than we are comfortable with.
If you miss a particular feature for your favorite flavor, contributions are welcome.
That being said, please keep the complexity low and make sure that changes are
accompanied by matching snapshot tests in `test/testdata/`.
1""" 2This module handles the conversion of docstring flavors to Markdown. 3 4The conversion from docstring flavors to Markdown is mostly done with regular expressions. 5This is not particularly beautiful, but good enough for our purposes. 6The alternative would be to depend on <https://github.com/rr-/docstring_parser> or a similar project, 7but that introduces more complexity than we are comfortable with. 8 9If you miss a particular feature for your favorite flavor, contributions are welcome. 10That being said, please keep the complexity low and make sure that changes are 11accompanied by matching snapshot tests in `test/testdata/`. 12""" 13 14from __future__ import annotations 15 16import base64 17from functools import cache 18import inspect 19import mimetypes 20import os 21from pathlib import Path 22import re 23from textwrap import dedent 24from textwrap import indent 25import warnings 26 27AnyException = (SystemExit, GeneratorExit, Exception) 28"""BaseException, but excluding KeyboardInterrupt. 29 30Modules may raise SystemExit on import (which we want to catch), 31but we don't want to catch a user's KeyboardInterrupt. 32""" 33 34 35@cache 36def convert(docstring: str, docformat: str, source_file: Path | None) -> str: 37 """ 38 Convert `docstring` from `docformat` to Markdown. 
39 """ 40 docformat = docformat.lower() 41 42 try: 43 if any(x in docformat for x in ["google", "numpy", "restructuredtext"]): 44 docstring = rst(docstring, source_file) 45 46 if "google" in docformat: 47 docstring = google(docstring) 48 49 if "numpy" in docformat: 50 docstring = numpy(docstring) 51 52 if source_file is not None and os.environ.get("PDOC_EMBED_IMAGES") != "0": 53 docstring = embed_images(docstring, source_file) 54 55 except AnyException as e: 56 raise RuntimeError( 57 'Docstring processing failed for docstring=\n"""\n' 58 + docstring 59 + f'\n"""\n{source_file=}\n{docformat=}' 60 ) from e 61 62 return docstring 63 64 65def embed_images(docstring: str, source_file: Path) -> str: 66 def local_image_to_data_uri(href: str) -> str: 67 image_path = source_file.parent / href 68 image_data = image_path.read_bytes() 69 image_mime = mimetypes.guess_type(image_path)[0] 70 image_data_b64 = base64.b64encode(image_data).decode() 71 return f"data:{image_mime};base64,{image_data_b64}" 72 73 def embed_local_image(m: re.Match) -> str: 74 try: 75 href = local_image_to_data_uri(m["href"]) 76 except Exception: 77 return m[0] 78 else: 79 return m["before"] + href + m["after"] 80 81 # TODO: Could probably do more here, e.g. support rST replacements. 
82 for regex in [ 83 r"(?P<before>!\[\s*.*?\s*]\(\s*)(?P<href>.+?)(?P<after>\s*\))", 84 r"""(?P<before>src=['"])(?P<href>.+?)(?P<after>['"])""", 85 ]: 86 docstring = re.sub(regex, embed_local_image, docstring) 87 return docstring 88 89 90def google(docstring: str) -> str: 91 """Convert Google-style docstring sections into Markdown.""" 92 return re.sub( 93 r""" 94 ^(?P<name>[A-Z][A-Z a-z]+):\n 95 (?P<contents>( 96 \n # empty lines 97 | # or 98 [ \t]+.+ # lines with indentation 99 )+)$ 100 """, 101 _google_section, 102 docstring, 103 flags=re.VERBOSE | re.MULTILINE, 104 ) 105 106 107GOOGLE_LIST_SECTIONS = ["Args", "Raises", "Attributes", "Keyword Args"] 108"""Section headers listed in the official Google docstring style guide.""" 109 110GOOGLE_LIST_SECTION_ALIASES = { 111 "Parameters": "Args", 112 "Params": "Args", 113 "Arguments": "Args", 114 "Raise": "Raises", 115 "Keyword Arguments": "Keyword Args", 116} 117""" 118Alternative section headers that are not listed in the official Google 119docstring style guide but that we recognize as sections containing lists 120nevertheless. 
121""" 122 123 124def _google_section(m: re.Match[str]) -> str: 125 name = m.group("name") 126 contents = dedent(m.group("contents")).lstrip() 127 128 if name in GOOGLE_LIST_SECTION_ALIASES: 129 name = GOOGLE_LIST_SECTION_ALIASES[name] 130 131 if name in GOOGLE_LIST_SECTIONS: 132 items = _indented_list(contents) 133 contents = "" 134 for item in items: 135 try: 136 # first ":" on the first line 137 _, attr, desc = re.split(r"^(.+?:)", item, maxsplit=1) 138 except ValueError: 139 contents += " - " + indent(item, " ")[3:] 140 else: 141 contents += f" - **{attr}** " + indent(desc, " ")[3:] 142 contents += "\n" 143 else: 144 contents = indent(contents, "> ", lambda line: True) 145 146 if name == "Args": 147 name = "Arguments" 148 149 return f"\n###### {name}:\n{contents}\n" 150 151 152def _indented_list(contents: str) -> list[str]: 153 """ 154 Convert a list string into individual (dedented) elements. For example, 155 156 foo: 157 desc 158 bar: int 159 more desc 160 baz: 161 desc 162 indented 163 164 returns [ 165 "foo:\ndesc", 166 "bar: int\nmore desc", 167 "baz:\ndesc\n indented", 168 ] 169 """ 170 # we expect this to be through cleandoc() already. 171 assert not contents.startswith(" "), contents 172 assert not contents.startswith("\n"), contents 173 174 ret: list[str] = [] 175 for line in contents.splitlines(keepends=True): 176 empty = not line.strip() 177 indented = line.startswith(" ") 178 if not (empty or indented): 179 # new section 180 ret.append(line) 181 else: 182 # append to current section 183 ret[-1] += line 184 185 return [inspect.cleandoc(x) for x in ret] 186 187 188def numpy(docstring: str) -> str: 189 """Convert NumPy-style docstring sections into Markdown. 190 191 See <https://numpydoc.readthedocs.io/en/latest/format.html> for details. 
192 """ 193 sections = re.split( 194 r""" 195 ^([A-Z][A-Za-z ]+)\n # a heading 196 ---+\n+ # followed by a dashed line 197 """, 198 docstring, 199 flags=re.VERBOSE | re.MULTILINE, 200 ) 201 contents = sections[0] 202 for heading, content in zip(sections[1::2], sections[2::2]): 203 if content.startswith(" ") and re.search(r"\n(?![ \n])", content): 204 # If the first line of section content is indented, we consider the section to be finished 205 # on the first non-indented line. We take out the rest - the tail - here. 206 content, tail = re.split(r"\n(?![ \n])", content, maxsplit=1) 207 else: 208 tail = "" 209 210 content = dedent(content) 211 212 if heading in ( 213 "Parameters", 214 "Returns", 215 "Yields", 216 "Receives", 217 "Other Parameters", 218 "Raises", 219 "Warns", 220 "Attributes", 221 ): 222 contents += f"###### {heading}\n{_numpy_parameters(content)}" 223 elif heading == "See Also": 224 contents += f"###### {heading}\n{_numpy_seealso(content)}" 225 else: 226 contents += f"###### {heading}\n{content}" 227 contents += tail 228 return contents 229 230 231def _numpy_seealso(content: str) -> str: 232 """Convert a NumPy-style "See Also" section into Markdown""" 233 contents = "" 234 for item in _indented_list(content): 235 if ":" in item: 236 funcstr, desc = item.split(":", maxsplit=1) 237 desc = f": {desc}" 238 else: 239 funcstr, desc = item, "" 240 241 funclist = [f.strip() for f in funcstr.split(" ")] 242 funcs = ", ".join(f"`{f}`" for f in funclist if f) 243 contents += f"{funcs}{desc} \n" 244 return contents 245 246 247def _numpy_parameters(content: str) -> str: 248 """Convert a NumPy-style parameter section into Markdown""" 249 contents = "" 250 for item in _indented_list(content): 251 m = re.match(r"^(.+):(.+)([\s\S]*)", item) 252 if m: 253 contents += ( 254 f" - **{m.group(1).strip()}** ({m.group(2).strip()}):\n" 255 f"{indent(m.group(3).strip(), ' ')}\n" 256 ) 257 else: 258 if "\n" in item: 259 name, desc = item.split("\n", maxsplit=1) 260 name = 
name.strip() 261 desc = desc.strip() 262 else: 263 name, desc = item.strip(), "" 264 265 if desc: 266 contents += f" - **{name}**: {desc}\n" 267 else: 268 contents += f" - **{name}**\n" 269 return f"{contents}\n" 270 271 272def rst(contents: str, source_file: Path | None) -> str: 273 """ 274 Convert reStructuredText elements to Markdown. 275 We support the most common elements, but we do not aim to mirror the full complexity of the spec here. 276 """ 277 contents = _rst_admonitions(contents, source_file) 278 contents = _rst_links(contents) 279 280 def replace_reference(m): 281 _, kind, name = m.groups() 282 if kind in ("meth", "func"): 283 return f"`{name}()`" 284 else: 285 return f"`{name}`" 286 287 # Code References: :obj:`foo` -> `foo` 288 contents = re.sub( 289 r"(:py)?:(mod|func|data|const|class|meth|attr|exc|obj):`([^`]+)`", 290 replace_reference, 291 contents, 292 ) 293 294 # Math: :math:`foo` -> \\( foo \\) 295 # We don't use $ as that's not enabled by MathJax by default. 296 contents = re.sub(r":math:`(.+?)`", r"\\\\( \1 \\\\)", contents) 297 298 contents = _rst_footnotes(contents) 299 300 contents = _rst_fields(contents) 301 302 return contents 303 304 305def _rst_footnotes(contents: str) -> str: 306 """Convert reStructuredText footnotes""" 307 footnotes: set[str] = set() 308 autonum: int 309 310 def register_footnote(m: re.Match[str]) -> str: 311 nonlocal autonum 312 fn_id = m.group("id") 313 if fn_id in "*#": 314 fn_id = f"fn-{autonum}" 315 autonum += 1 316 fn_id = fn_id.lstrip("#*") 317 footnotes.add(fn_id) 318 content = indent(m.group("content"), " ").lstrip() 319 return f"{m.group('indent')}[^{fn_id}]: {content}" 320 321 # Register footnotes 322 autonum = 1 323 contents = re.sub( 324 r""" 325 ^(?P<indent>[ ]*)\.\.[ ]+\[(?P<id>\d+|[#*]\w*)](?P<content>.* 326 ( 327 \n # empty lines 328 | # or 329 (?P=indent)[ ]+.+ # lines with indentation 330 )*)$ 331 """, 332 register_footnote, 333 contents, 334 flags=re.MULTILINE | re.VERBOSE, 335 ) 336 337 def 
replace_references(m: re.Match[str]) -> str: 338 nonlocal autonum 339 fn_id = m.group("id") 340 if fn_id in "*#": 341 fn_id = f"fn-{autonum}" 342 autonum += 1 343 fn_id = fn_id.lstrip("#*") 344 if fn_id in footnotes: 345 return f"[^{fn_id}]" 346 else: 347 return m.group(0) 348 349 autonum = 1 350 contents = re.sub(r"\[(?P<id>\d+|[#*]\w*)]_", replace_references, contents) 351 return contents 352 353 354def _rst_links(contents: str) -> str: 355 """Convert reStructuredText hyperlinks""" 356 links = {} 357 358 def register_link(m: re.Match[str]) -> str: 359 refid = re.sub(r"\s", "", m.group("id").lower()) 360 links[refid] = m.group("url") 361 return "" 362 363 def replace_link(m: re.Match[str]) -> str: 364 text = m.group("id") 365 refid = re.sub(r"[\s`]", "", text.lower()) 366 try: 367 return f"[{text.strip('`')}]({links[refid]})" 368 except KeyError: 369 return m.group(0) 370 371 # Embedded URIs 372 contents = re.sub( 373 r"`(?P<text>[^`]+)<(?P<url>.+?)>`_", r"[\g<text>](\g<url>)", contents 374 ) 375 # External Hyperlink Targets 376 contents = re.sub( 377 r"^\s*..\s+_(?P<id>[^\n:]+):\s*(?P<url>http\S+)", 378 register_link, 379 contents, 380 flags=re.MULTILINE, 381 ) 382 contents = re.sub(r"(?P<id>[A-Za-z0-9_\-.:+]|`[^`]+`)_", replace_link, contents) 383 return contents 384 385 386def _rst_extract_options(contents: str) -> tuple[str, dict[str, str]]: 387 """ 388 Extract options from the beginning of reStructuredText directives. 389 390 Return the trimmed content and a dict of options. 
391 """ 392 options = {} 393 while match := re.match(r"^\s*:(.+?):(.*)([\s\S]*)", contents): 394 key, value, contents = match.groups() 395 options[key] = value.strip() 396 397 return contents, options 398 399 400def _rst_include_trim(contents: str, options: dict[str, str]) -> str: 401 """ 402 <https://docutils.sourceforge.io/docs/ref/rst/directives.html#include-options> 403 """ 404 if "end-line" in options or "start-line" in options: 405 lines = contents.splitlines() 406 if i := options.get("end-line"): 407 lines = lines[: int(i)] 408 if i := options.get("start-line"): 409 lines = lines[int(i) :] 410 contents = "\n".join(lines) 411 if x := options.get("end-before"): 412 contents = contents[: contents.index(x)] 413 if x := options.get("start-after"): 414 contents = contents[contents.index(x) + len(x) :] 415 return contents 416 417 418def _rst_admonitions(contents: str, source_file: Path | None) -> str: 419 """ 420 Convert reStructuredText admonitions - a bit tricky because they may already be indented themselves. 
421 <https://www.sphinx-doc.org/en/master/usage/restructuredtext/directives.html> 422 """ 423 424 def _rst_admonition(m: re.Match[str]) -> str: 425 ind = m.group("indent") 426 type = m.group("type") 427 val = m.group("val").strip() 428 contents = dedent(m.group("contents")).strip() 429 contents, options = _rst_extract_options(contents) 430 431 if type == "include": 432 loc = source_file or Path(".") 433 try: 434 included = (loc.parent / val).read_text("utf8", "replace") 435 except OSError as e: 436 warnings.warn(f"Cannot include {val!r}: {e}") 437 included = "\n" 438 try: 439 included = _rst_include_trim(included, options) + "\n" 440 except ValueError as e: 441 warnings.warn(f"Failed to process include options for {val!r}: {e}") 442 included = _rst_admonitions(included, loc.parent / val) 443 included = embed_images(included, loc.parent / val) 444 return indent(included, ind) 445 if type == "math": 446 return f"{ind}$${val}{contents}$$\n" 447 if type in ("note", "warning", "danger"): 448 if val: 449 heading = f"{ind}###### {val}\n" 450 else: 451 heading = "" 452 return ( 453 f'{ind}<div class="alert {type}" markdown="1">\n' 454 f"{heading}" 455 f"{indent(contents, ind)}\n" 456 f"{ind}</div>\n" 457 ) 458 if type == "code-block": 459 return f"{ind}```{val}\n{contents}\n```\n" 460 if type == "versionadded": 461 text = f"New in version {val}" 462 elif type == "versionchanged": 463 text = f"Changed in version {val}" 464 elif type == "deprecated": 465 text = f"Deprecated since version {val}" 466 else: 467 text = f"{type} {val}".strip() 468 469 if contents: 470 text = f"{ind}*{text}:*\n{indent(contents, ind)}\n\n" 471 else: 472 text = f"{ind}*{text}.*\n" 473 474 return text 475 476 admonition = "note|warning|danger|versionadded|versionchanged|deprecated|seealso|math|include|code-block" 477 return re.sub( 478 rf""" 479 ^(?P<indent>[ ]*)\.\.[ ]+(?P<type>{admonition})::(?P<val>.*) 480 (?P<contents>( 481 \n # empty lines 482 | # or 483 (?P=indent)[ ]+.+ # lines with 
indentation 484 )*)$ 485 """, 486 _rst_admonition, 487 contents, 488 flags=re.MULTILINE | re.VERBOSE, 489 ) 490 491 492def _rst_fields(contents: str) -> str: 493 """ 494 Convert reStructuredText fields to Markdown. 495 <https://www.sphinx-doc.org/en/master/usage/restructuredtext/basics.html#rst-field-lists> 496 """ 497 498 _has_parameter_section = False 499 _has_raises_section = False 500 501 def _rst_field(m: re.Match[str]) -> str: 502 type = m["type"] 503 body = m["body"] 504 505 if m["name"]: 506 name = f"**{m['name'].strip()}**: " 507 else: 508 name = "" 509 510 if type == "param": 511 nonlocal _has_parameter_section 512 text = f" - {name}{body}" 513 if not _has_parameter_section: 514 _has_parameter_section = True 515 text = "\n###### Parameters\n" + text 516 return text 517 elif type == "type": 518 return "" # we expect users to use modern type annotations. 519 elif type == "return": 520 body = indent(body, "> ", lambda line: True) 521 return f"\n###### Returns\n{body}" 522 elif type == "rtype": 523 return "" # we expect users to use modern type annotations. 524 elif type == "raises": 525 nonlocal _has_raises_section 526 text = f" - {name}{body}" 527 if not _has_raises_section: 528 _has_raises_section = True 529 text = "\n###### Raises\n" + text 530 return text 531 else: # pragma: no cover 532 raise AssertionError("unreachable") 533 534 field = "param|type|return|rtype|raises" 535 return re.sub( 536 rf""" 537 ^:(?P<type>{field})(?:[ ]+(?P<name>.+))?: 538 (?P<body>.*( 539 (?:\n[ ]*)* # maybe some empty lines followed by 540 [ ]+.+ # lines with indentation 541 )*(?:\n|$)) 542 """, 543 _rst_field, 544 contents, 545 flags=re.MULTILINE | re.VERBOSE, 546 )
BaseException, but excluding KeyboardInterrupt.
Modules may raise SystemExit on import (which we want to catch), but we don't want to catch a user's KeyboardInterrupt.
@cache
def convert(docstring: str, docformat: str, source_file: Path | None) -> str:
    """
    Convert `docstring` from `docformat` to Markdown.

    The flavor is detected by case-insensitive substring match on `docformat`.
    Any failure during conversion is re-raised as a `RuntimeError` that carries
    the docstring, the source file and the docformat in its message.
    """
    # Keep the lowercased value under this name: it is echoed by `{docformat=}` below.
    docformat = docformat.lower()
    is_google = "google" in docformat
    is_numpy = "numpy" in docformat

    try:
        # Both Google and NumPy flavors may contain reStructuredText elements.
        if is_google or is_numpy or "restructuredtext" in docformat:
            docstring = rst(docstring, source_file)
        if is_google:
            docstring = google(docstring)
        if is_numpy:
            docstring = numpy(docstring)

        if source_file is not None:
            embedding_enabled = os.environ.get("PDOC_EMBED_IMAGES") != "0"
            if embedding_enabled:
                docstring = embed_images(docstring, source_file)
    except AnyException as exc:
        message = (
            f'Docstring processing failed for docstring=\n"""\n{docstring}'
            f'\n"""\n{source_file=}\n{docformat=}'
        )
        raise RuntimeError(message) from exc

    return docstring
Convert `docstring` from `docformat` to Markdown.
def embed_images(docstring: str, source_file: Path) -> str:
    """
    Replace references to local images in `docstring` with inline `data:` URIs.

    Both Markdown images (`![alt](href)`) and `src="href"` attributes are handled.
    `href` is resolved relative to the directory containing `source_file`.
    A reference is left untouched if the file cannot be read or if its MIME type
    cannot be guessed from the file name.
    """

    def local_image_to_data_uri(href: str) -> str:
        image_path = source_file.parent / href
        image_mime = mimetypes.guess_type(image_path)[0]
        if image_mime is None:
            # Without a MIME type we would emit the invalid URI "data:None;base64,...".
            raise ValueError(f"cannot determine MIME type of {href!r}")
        image_data = image_path.read_bytes()
        image_data_b64 = base64.b64encode(image_data).decode()
        return f"data:{image_mime};base64,{image_data_b64}"

    def embed_local_image(m: re.Match) -> str:
        try:
            href = local_image_to_data_uri(m["href"])
        except Exception:
            # Deliberately best-effort: remote URLs, missing files, etc. stay as they are.
            return m[0]
        else:
            return m["before"] + href + m["after"]

    # TODO: Could probably do more here, e.g. support rST replacements.
    for regex in [
        r"(?P<before>!\[\s*.*?\s*]\(\s*)(?P<href>.+?)(?P<after>\s*\))",
        r"""(?P<before>src=['"])(?P<href>.+?)(?P<after>['"])""",
    ]:
        docstring = re.sub(regex, embed_local_image, docstring)
    return docstring
def google(docstring: str) -> str:
    """Convert Google-style docstring sections into Markdown.

    A section is a capitalized header line ending in a colon, followed by one
    or more blank or indented lines. Each section found is rewritten by
    `_google_section`; everything else is returned unchanged.
    """
    section_pattern = re.compile(
        r"""
        ^(?P<name>[A-Z][A-Z a-z]+):\n
        (?P<contents>(
            \n        # empty lines
            |         # or
            [ \t]+.+  # lines with indentation
        )+)$
        """,
        re.VERBOSE | re.MULTILINE,
    )
    return section_pattern.sub(_google_section, docstring)
Convert Google-style docstring sections into Markdown.
Section headers listed in the official Google docstring style guide.
Alternative section headers that are not listed in the official Google docstring style guide but that we recognize as sections containing lists nevertheless.
def numpy(docstring: str) -> str:
    """Convert NumPy-style docstring sections into Markdown.

    A section starts with a capitalized heading line followed by a line of at
    least three dashes. Parameter-like sections and "See Also" are rendered as
    lists by the respective helpers; every other section only gets a Markdown
    heading and has its content dedented.

    See <https://numpydoc.readthedocs.io/en/latest/format.html> for details.
    """
    sections = re.split(
        r"""
        ^([A-Z][A-Za-z ]+)\n  # a heading
        ---+\n+               # followed by a dashed line
        """,
        docstring,
        flags=re.VERBOSE | re.MULTILINE,
    )
    # re.split with one capture group yields [preamble, heading, content, heading, content, ...].
    contents = sections[0]
    for heading, content in zip(sections[1::2], sections[2::2]):
        tail = ""
        terminator = ""
        section_end = (
            re.search(r"\n(?![ \n])", content) if content.startswith(" ") else None
        )
        if section_end is not None:
            # If the first line of section content is indented, we consider the section to be finished
            # on the first non-indented line. We take out the rest - the tail - here.
            tail = content[section_end.end() :]
            content = content[: section_end.start()]
            # The newline that ended the section was consumed by the match above.
            terminator = "\n"

        content = dedent(content)

        if heading in (
            "Parameters",
            "Returns",
            "Yields",
            "Receives",
            "Other Parameters",
            "Raises",
            "Warns",
            "Attributes",
        ):
            contents += f"###### {heading}\n{_numpy_parameters(content)}"
        elif heading == "See Also":
            contents += f"###### {heading}\n{_numpy_seealso(content)}"
        else:
            # Put the consumed newline back so that the last line of the section
            # is not fused with the first line of the tail.
            contents += f"###### {heading}\n{content}{terminator}"
        contents += tail
    return contents
Convert NumPy-style docstring sections into Markdown.
See https://numpydoc.readthedocs.io/en/latest/format.html for details.
def rst(contents: str, source_file: Path | None) -> str:
    """
    Convert reStructuredText elements to Markdown.
    We support the most common elements, but we do not aim to mirror the full complexity of the spec here.

    The passes run in a fixed order: admonitions/directives, hyperlinks, code
    references, inline math, footnotes, and finally field lists.
    """

    def replace_reference(m: re.Match[str]) -> str:
        # Groups: 1 = optional ":py" domain prefix, 2 = role, 3 = referenced name.
        role, target = m.group(2), m.group(3)
        call_suffix = "()" if role in ("meth", "func") else ""
        return f"`{target}{call_suffix}`"

    contents = _rst_links(_rst_admonitions(contents, source_file))

    # Code References: :obj:`foo` -> `foo`
    code_reference = r"(:py)?:(mod|func|data|const|class|meth|attr|exc|obj):`([^`]+)`"
    contents = re.sub(code_reference, replace_reference, contents)

    # Math: :math:`foo` -> \\( foo \\)
    # We don't use $ as that's not enabled by MathJax by default.
    contents = re.sub(r":math:`(.+?)`", r"\\\\( \1 \\\\)", contents)

    return _rst_fields(_rst_footnotes(contents))
Convert reStructuredText elements to Markdown. We support the most common elements, but we do not aim to mirror the full complexity of the spec here.