pdoc.docstrings
This module handles the conversion of docstring flavors to Markdown.
The conversion from docstring flavors to Markdown is mostly done with regular expressions. This is not particularly beautiful, but good enough for our purposes. The alternative would be to depend on https://github.com/rr-/docstring_parser or a similar project, but that introduces more complexity than we are comfortable with.
If you miss a particular feature for your favorite flavor, contributions are welcome.
That being said, please keep the complexity low and make sure that changes are accompanied by matching snapshot tests in `test/testdata/`.
"""
This module handles the conversion of docstring flavors to Markdown.

The conversion from docstring flavors to Markdown is mostly done with regular expressions.
This is not particularly beautiful, but good enough for our purposes.
The alternative would be to depend on <https://github.com/rr-/docstring_parser> or a similar project,
but that introduces more complexity than we are comfortable with.

If you miss a particular feature for your favorite flavor, contributions are welcome.
That being said, please keep the complexity low and make sure that changes are
accompanied by matching snapshot tests in `test/testdata/`.
"""

from __future__ import annotations

import base64
from functools import cache
import inspect
import mimetypes
import os
from pathlib import Path
import re
from textwrap import dedent
from textwrap import indent
import warnings


@cache
def convert(docstring: str, docformat: str, source_file: Path | None) -> str:
    """
    Convert `docstring` from `docformat` to Markdown.

    `docformat` is matched case-insensitively by substring ("google", "numpy",
    "restructuredtext"). reStructuredText conversion is applied for all three.
    If `source_file` is given and the `PDOC_EMBED_IMAGES` environment variable
    is not `"0"`, local images are embedded as data URIs.
    """
    docformat = docformat.lower()

    if any(x in docformat for x in ["google", "numpy", "restructuredtext"]):
        docstring = rst(docstring, source_file)

    if "google" in docformat:
        docstring = google(docstring)

    if "numpy" in docformat:
        docstring = numpy(docstring)

    if source_file is not None and os.environ.get("PDOC_EMBED_IMAGES") != "0":
        docstring = embed_images(docstring, source_file)

    return docstring


def embed_images(docstring: str, source_file: Path) -> str:
    """
    Replace Markdown image references to local files with base64 data URIs.

    Image paths are resolved relative to the directory of `source_file`.
    References that cannot be read are left unchanged.
    """

    def embed_local_image(m: re.Match) -> str:
        image_path = source_file.parent / m["href"]
        try:
            image_data = image_path.read_bytes()
            image_mime = mimetypes.guess_type(image_path)[0]
        except Exception:
            # Best effort: keep the original reference if anything goes wrong.
            return m[0]
        else:
            data = base64.b64encode(image_data).decode()
            return f"![{m['alt']}](data:{image_mime};base64,{data})"

    return re.sub(
        r"!\[\s*(?P<alt>.*?)\s*]\(\s*(?P<href>.+?)\s*\)",
        embed_local_image,
        docstring,
    )
    # TODO: Could probably do more here, e.g. support rST or raw HTML replacements.


def google(docstring: str) -> str:
    """Convert Google-style docstring sections into Markdown."""
    return re.sub(
        r"""
        ^(?P<name>[A-Z][A-Z a-z]+):\n
        (?P<contents>(
            \n        # empty lines
            |         # or
            [ \t]+.+  # lines with indentation
        )+)$
        """,
        _google_section,
        docstring,
        flags=re.VERBOSE | re.MULTILINE,
    )


GOOGLE_LIST_SECTIONS = ["Args", "Raises", "Attributes"]
"""Section headers listed in the official Google docstring style guide."""

GOOGLE_LIST_SECTION_ALIASES = {
    "Parameters": "Args",
    "Params": "Args",
    "Arguments": "Args",
}
"""
Alternative section headers that are not listed in the official Google
docstring style guide but that we recognize as sections containing lists
nevertheless.
"""


def _google_section(m: re.Match[str]) -> str:
    """
    Render a single Google-style section matched by `google()`.

    List sections (see `GOOGLE_LIST_SECTIONS` and its aliases) become Markdown lists,
    all other sections become blockquotes.
    """
    name = m.group("name")
    contents = dedent(m.group("contents")).lstrip()

    if name in GOOGLE_LIST_SECTION_ALIASES:
        name = GOOGLE_LIST_SECTION_ALIASES[name]

    if name in GOOGLE_LIST_SECTIONS:
        items = _indented_list(contents)
        contents = ""
        for item in items:
            try:
                # first ":" on the first line
                _, attr, desc = re.split(r"^(.+?:)", item, maxsplit=1)
            except ValueError:
                # No colon on the first line: emit the item without a bold name.
                contents += " - " + indent(item, "   ")[3:]
            else:
                contents += f" - **{attr}** " + indent(desc, "   ")[3:]
            contents += "\n"
    else:
        contents = indent(contents, "> ", lambda line: True)

    if name == "Args":
        name = "Arguments"

    return f"\n###### {name}:\n{contents}\n"


def _indented_list(contents: str) -> list[str]:
    r"""
    Convert a list string into individual (dedented) elements. For example,

    foo:
        desc
    bar: int
        more desc
    baz:
        desc
            indented

    returns [
        "foo:\ndesc",
        "bar: int\nmore desc",
        "baz:\ndesc\n    indented",
    ]
    """
    # we expect this to be through cleandoc() already.
    assert not contents.startswith(" "), contents
    assert not contents.startswith("\n"), contents

    ret: list[str] = []
    for line in contents.splitlines(keepends=True):
        empty = not line.strip()
        indented = line.startswith(" ")
        if not (empty or indented):
            # new section
            ret.append(line)
        else:
            # append to current section
            ret[-1] += line

    return [inspect.cleandoc(x) for x in ret]


def numpy(docstring: str) -> str:
    """Convert NumPy-style docstring sections into Markdown.

    See <https://numpydoc.readthedocs.io/en/latest/format.html> for details.
    """
    sections = re.split(
        r"""
        ^([A-Z][A-Za-z ]+)\n  # a heading
        ---+\n+               # followed by a dashed line
        """,
        docstring,
        flags=re.VERBOSE | re.MULTILINE,
    )
    contents = sections[0]
    for heading, content in zip(sections[1::2], sections[2::2]):
        tail = ""
        if content.startswith(" "):
            # If the first line of section content is indented, we consider the section to be finished
            # on the first non-indented line. We take out the rest - the tail - here.
            # The section may also run until the end of the docstring, in which case there is no tail.
            parts = re.split(r"\n(?![ \n])", content, maxsplit=1)
            content = parts[0]
            if len(parts) > 1:
                tail = parts[1]

        if heading in (
            "Parameters",
            "Returns",
            "Yields",
            "Receives",
            "Other Parameters",
            "Raises",
            "Warns",
            "Attributes",
        ):
            contents += f"###### {heading}\n{_numpy_parameters(content)}"
        elif heading == "See Also":
            contents += f"###### {heading}\n{_numpy_seealso(content)}"
        else:
            contents += f"###### {heading}\n{dedent(content)}"
        contents += tail
    return contents


def _numpy_seealso(content: str) -> str:
    """Convert a NumPy-style "See Also" section into Markdown"""
    contents = ""
    for item in _indented_list(content):
        if ":" in item:
            funcstr, desc = item.split(":", maxsplit=1)
            desc = f": {desc}"
        else:
            funcstr, desc = item, ""

        funclist = [f.strip() for f in funcstr.split(" ")]
        funcs = ", ".join(f"`{f}`" for f in funclist if f)
        contents += f"{funcs}{desc}  \n"
    return contents


def _numpy_parameters(content: str) -> str:
    """Convert a NumPy-style parameter section into Markdown"""
    contents = ""
    for item in _indented_list(content):
        m = re.match(r"^(.+):(.+)([\s\S]*)", item)
        if m:
            # "name : type" on the first line, description below.
            contents += (
                f" - **{m.group(1).strip()}** ({m.group(2).strip()}):\n"
                f"{indent(m.group(3).strip(), '   ')}\n"
            )
        else:
            if "\n" in item:
                name, desc = item.split("\n", maxsplit=1)
                name = name.strip()
                desc = desc.strip()
            else:
                name, desc = item.strip(), ""

            if desc:
                contents += f" - **{name}**: {desc}\n"
            else:
                contents += f" - **{name}**\n"
    return f"{contents}\n"


def rst(contents: str, source_file: Path | None) -> str:
    """
    Convert reStructuredText elements to Markdown.
    We support the most common elements, but we do not aim to mirror the full complexity of the spec here.
    """
    contents = _rst_admonitions(contents, source_file)
    contents = _rst_links(contents)

    def replace_reference(m: re.Match[str]) -> str:
        _, kind, name = m.groups()
        if kind in ("meth", "func"):
            return f"`{name}()`"
        else:
            return f"`{name}`"

    # Code References: :obj:`foo` -> `foo`
    contents = re.sub(
        r"(:py)?:(mod|func|data|const|class|meth|attr|exc|obj):`([^`]+)`",
        replace_reference,
        contents,
    )

    # Math: :math:`foo` -> \\( foo \\)
    # We don't use $ as that's not enabled by MathJax by default.
    contents = re.sub(r":math:`(.+?)`", r"\\\\( \1 \\\\)", contents)

    contents = _rst_footnotes(contents)

    contents = _rst_fields(contents)

    return contents


def _rst_footnotes(contents: str) -> str:
    """Convert reStructuredText footnotes"""
    footnotes: set[str] = set()
    autonum: int

    def register_footnote(m: re.Match[str]) -> str:
        nonlocal autonum
        fn_id = m.group("id")
        if fn_id in "*#":
            # Anonymous auto-numbered/auto-symbol footnote: assign a sequential id.
            fn_id = f"fn-{autonum}"
            autonum += 1
        fn_id = fn_id.lstrip("#*")
        footnotes.add(fn_id)
        content = indent(m.group("content"), "   ").lstrip()
        return f"{m.group('indent')}[^{fn_id}]: {content}"

    # Register footnotes
    autonum = 1
    contents = re.sub(
        r"""
        ^(?P<indent>[ ]*)\.\.[ ]+\[(?P<id>\d+|[#*]\w*)](?P<content>.*
        (
            \n                 # empty lines
            |                  # or
            (?P=indent)[ ]+.+  # lines with indentation
        )*)$
        """,
        register_footnote,
        contents,
        flags=re.MULTILINE | re.VERBOSE,
    )

    def replace_references(m: re.Match[str]) -> str:
        nonlocal autonum
        fn_id = m.group("id")
        if fn_id in "*#":
            fn_id = f"fn-{autonum}"
            autonum += 1
        fn_id = fn_id.lstrip("#*")
        if fn_id in footnotes:
            return f"[^{fn_id}]"
        else:
            # Not a known footnote: leave the text untouched.
            return m.group(0)

    # Restart numbering so that anonymous references line up with anonymous footnotes.
    autonum = 1
    contents = re.sub(r"\[(?P<id>\d+|[#*]\w*)]_", replace_references, contents)
    return contents


def _rst_links(contents: str) -> str:
    """Convert reStructuredText hyperlinks"""
    links: dict[str, str] = {}

    def register_link(m: re.Match[str]) -> str:
        refid = re.sub(r"\s", "", m.group("id").lower())
        links[refid] = m.group("url")
        return ""

    def replace_link(m: re.Match[str]) -> str:
        text = m.group("id")
        refid = re.sub(r"[\s`]", "", text.lower())
        try:
            return f"[{text.strip('`')}]({links[refid]})"
        except KeyError:
            return m.group(0)

    # Embedded URIs
    contents = re.sub(
        r"`(?P<text>[^`]+)<(?P<url>.+?)>`_", r"[\g<text>](\g<url>)", contents
    )
    # External Hyperlink Targets
    contents = re.sub(
        r"^\s*..\s+_(?P<id>[^\n:]+):\s*(?P<url>http\S+)",
        register_link,
        contents,
        flags=re.MULTILINE,
    )
    contents = re.sub(r"(?P<id>[A-Za-z0-9_\-.:+]|`[^`]+`)_", replace_link, contents)
    return contents


def _rst_extract_options(contents: str) -> tuple[str, dict[str, str]]:
    """
    Extract options from the beginning of reStructuredText directives.

    Return the trimmed content and a dict of options.
    """
    options = {}
    while match := re.match(r"^\s*:(.+?):(.*)([\s\S]*)", contents):
        key, value, contents = match.groups()
        options[key] = value.strip()

    return contents, options


def _rst_include_trim(contents: str, options: dict[str, str]) -> str:
    """
    Apply the `start-line`, `end-line`, `start-after` and `end-before` include options.

    Raises `ValueError` if a line option is not an integer or a marker text is not found.

    <https://docutils.sourceforge.io/docs/ref/rst/directives.html#include-options>
    """
    if "end-line" in options or "start-line" in options:
        lines = contents.splitlines()
        if i := options.get("end-line"):
            lines = lines[: int(i)]
        if i := options.get("start-line"):
            lines = lines[int(i) :]
        contents = "\n".join(lines)
    if x := options.get("end-before"):
        contents = contents[: contents.index(x)]
    if x := options.get("start-after"):
        contents = contents[contents.index(x) + len(x) :]
    return contents


def _rst_admonitions(contents: str, source_file: Path | None) -> str:
    """
    Convert reStructuredText admonitions - a bit tricky because they may already be indented themselves.
    <https://www.sphinx-doc.org/en/master/usage/restructuredtext/directives.html>
    """

    def _rst_admonition(m: re.Match[str]) -> str:
        ind = m.group("indent")
        kind = m.group("type")
        val = m.group("val").strip()
        contents = dedent(m.group("contents")).strip()
        contents, options = _rst_extract_options(contents)

        if kind == "include":
            loc = source_file or Path(".")
            try:
                included = (loc.parent / val).read_text("utf8", "replace")
            except OSError as e:
                warnings.warn(f"Cannot include {val!r}: {e}")
                included = "\n"
            try:
                included = _rst_include_trim(included, options) + "\n"
            except ValueError as e:
                warnings.warn(f"Failed to process include options for {val!r}: {e}")
            # Included files may contain admonitions and images relative to their own location.
            included = _rst_admonitions(included, loc.parent / val)
            included = embed_images(included, loc.parent / val)
            return indent(included, ind)
        if kind == "math":
            return f"{ind}$${val}{contents}$$\n"
        if kind in ("note", "warning", "danger"):
            if val:
                heading = f"{ind}###### {val}\n"
            else:
                heading = ""
            return (
                f'{ind}<div class="alert {kind}" markdown="1">\n'
                f"{heading}"
                f"{indent(contents, ind)}\n"
                f"{ind}</div>\n"
            )
        if kind == "code-block":
            return f"{ind}```{val}\n{contents}\n```\n"
        if kind == "versionadded":
            text = f"New in version {val}"
        elif kind == "versionchanged":
            text = f"Changed in version {val}"
        elif kind == "deprecated":
            text = f"Deprecated since version {val}"
        else:
            text = f"{kind} {val}".strip()

        if contents:
            text = f"{ind}*{text}:*\n{indent(contents, ind)}\n\n"
        else:
            text = f"{ind}*{text}.*\n"

        return text

    admonition = "note|warning|danger|versionadded|versionchanged|deprecated|seealso|math|include|code-block"
    return re.sub(
        rf"""
        ^(?P<indent>[ ]*)\.\.[ ]+(?P<type>{admonition})::(?P<val>.*)
        (?P<contents>(
            \n                 # empty lines
            |                  # or
            (?P=indent)[ ]+.+  # lines with indentation
        )*)$
        """,
        _rst_admonition,
        contents,
        flags=re.MULTILINE | re.VERBOSE,
    )


def _rst_fields(contents: str) -> str:
    """
    Convert reStructuredText fields to Markdown.
    <https://www.sphinx-doc.org/en/master/usage/restructuredtext/basics.html#rst-field-lists>
    """

    # Section headings are emitted once, in front of the first field of each kind.
    _has_parameter_section = False
    _has_raises_section = False

    def _rst_field(m: re.Match[str]) -> str:
        kind = m["type"]
        body = m["body"]

        if m["name"]:
            name = f"**{m['name'].strip()}**: "
        else:
            name = ""

        if kind == "param":
            nonlocal _has_parameter_section
            text = f" - {name}{body}"
            if not _has_parameter_section:
                _has_parameter_section = True
                text = "\n###### Parameters\n" + text
            return text
        elif kind == "type":
            return ""  # we expect users to use modern type annotations.
        elif kind == "return":
            body = indent(body, "> ", lambda line: True)
            return f"\n###### Returns\n{body}"
        elif kind == "rtype":
            return ""  # we expect users to use modern type annotations.
        elif kind == "raises":
            nonlocal _has_raises_section
            text = f" - {name}{body}"
            if not _has_raises_section:
                _has_raises_section = True
                text = "\n###### Raises\n" + text
            return text
        else:  # pragma: no cover
            raise AssertionError("unreachable")

    field = "param|type|return|rtype|raises"
    return re.sub(
        rf"""
        ^:(?P<type>{field})(?:[ ]+(?P<name>.+))?:
        (?P<body>.*(
            (?:\n[ ]*)*  # maybe some empty lines followed by
            [ ]+.+       # lines with indentation
        )*(?:\n|$))
        """,
        _rst_field,
        contents,
        flags=re.MULTILINE | re.VERBOSE,
    )
@cache
def convert(docstring: str, docformat: str, source_file: Path | None) -> str:
    """
    Render `docstring`, written in the flavor named by `docformat`, as Markdown.

    `docformat` is lower-cased and matched by substring. The reStructuredText
    pass runs first for the "google", "numpy" and "restructuredtext" flavors,
    followed by the flavor-specific pass. Local images are embedded afterwards
    unless `source_file` is `None` or the `PDOC_EMBED_IMAGES` environment
    variable is set to `"0"`.
    """
    flavor = docformat.lower()
    markdown = docstring

    if "google" in flavor or "numpy" in flavor or "restructuredtext" in flavor:
        markdown = rst(markdown, source_file)
    if "google" in flavor:
        markdown = google(markdown)
    if "numpy" in flavor:
        markdown = numpy(markdown)

    embedding_enabled = os.environ.get("PDOC_EMBED_IMAGES") != "0"
    if embedding_enabled and source_file is not None:
        markdown = embed_images(markdown, source_file)

    return markdown
Convert `docstring` from `docformat` to Markdown.
def embed_images(docstring: str, source_file: Path) -> str:
    """
    Inline local images referenced with Markdown `![alt](href)` syntax as base64 `data:` URIs.

    `href` is resolved relative to the directory containing `source_file`.
    A reference is left exactly as written if reading the file or guessing
    its MIME type raises an exception.
    """
    image_reference = r"!\[\s*(?P<alt>.*?)\s*]\(\s*(?P<href>.+?)\s*\)"

    def inline_image(match: re.Match) -> str:
        target = source_file.parent / match["href"]
        try:
            raw = target.read_bytes()
            mime = mimetypes.guess_type(target)[0]
        except Exception:
            # Best effort: anything that cannot be loaded stays untouched.
            return match[0]
        encoded = base64.b64encode(raw).decode()
        return f"![{match['alt']}](data:{mime};base64,{encoded})"

    # TODO: Could probably do more here, e.g. support rST or raw HTML replacements.
    return re.sub(image_reference, inline_image, docstring)
def google(docstring: str) -> str:
    """
    Turn the Google-style sections of `docstring` into Markdown.

    A section is a capitalized heading line ending in a colon, followed by one
    or more empty or indented lines. Every match is rewritten by `_google_section`.
    """
    section_pattern = re.compile(
        r"""
        ^(?P<name>[A-Z][A-Z a-z]+):\n
        (?P<contents>(
            \n        # empty lines
            |         # or
            [ \t]+.+  # lines with indentation
        )+)$
        """,
        re.VERBOSE | re.MULTILINE,
    )
    return section_pattern.sub(_google_section, docstring)
Convert Google-style docstring sections into Markdown.
Section headers listed in the official Google docstring style guide.
Alternative section headers that are not listed in the official Google docstring style guide but that we recognize as sections containing lists nevertheless.
def numpy(docstring: str) -> str:
    """Convert NumPy-style docstring sections into Markdown.

    A section is a capitalized heading line followed by a line of at least
    three dashes. Parameter-like sections and "See Also" are rendered as
    lists by their dedicated helpers; all other sections are dedented and
    placed under a Markdown heading. Text before the first section is kept
    as-is.

    See <https://numpydoc.readthedocs.io/en/latest/format.html> for details.
    """
    sections = re.split(
        r"""
        ^([A-Z][A-Za-z ]+)\n  # a heading
        ---+\n+               # followed by a dashed line
        """,
        docstring,
        flags=re.VERBOSE | re.MULTILINE,
    )
    # re.split with one capture group yields [preamble, heading, content, heading, content, ...].
    contents = sections[0]
    for heading, content in zip(sections[1::2], sections[2::2]):
        tail = ""
        if content.startswith(" "):
            # If the first line of section content is indented, we consider the section to be finished
            # on the first non-indented line. We take out the rest - the tail - here.
            # An indented section may also run until the end of the docstring; there is
            # no tail then, and a two-name unpacking of the split result would fail.
            parts = re.split(r"\n(?![ \n])", content, maxsplit=1)
            content = parts[0]
            if len(parts) > 1:
                tail = parts[1]

        if heading in (
            "Parameters",
            "Returns",
            "Yields",
            "Receives",
            "Other Parameters",
            "Raises",
            "Warns",
            "Attributes",
        ):
            contents += f"###### {heading}\n{_numpy_parameters(content)}"
        elif heading == "See Also":
            contents += f"###### {heading}\n{_numpy_seealso(content)}"
        else:
            contents += f"###### {heading}\n{dedent(content)}"
        contents += tail
    return contents
Convert NumPy-style docstring sections into Markdown.
See https://numpydoc.readthedocs.io/en/latest/format.html for details.
def rst(contents: str, source_file: Path | None) -> str:
    """
    Translate common reStructuredText constructs in `contents` into Markdown.

    Only the most frequently used elements are handled; mirroring the full
    complexity of the spec is a non-goal. The passes run in this order:
    admonitions/directives, hyperlinks, code references, inline math,
    footnotes, and field lists.
    """

    def render_reference(match):
        role, target = match.group(2), match.group(3)
        # Callables get trailing parentheses.
        call_suffix = "()" if role in ("meth", "func") else ""
        return f"`{target}{call_suffix}`"

    contents = _rst_links(_rst_admonitions(contents, source_file))

    # Code References: :obj:`foo` -> `foo`
    code_reference = r"(:py)?:(mod|func|data|const|class|meth|attr|exc|obj):`([^`]+)`"
    contents = re.sub(code_reference, render_reference, contents)

    # Math: :math:`foo` -> \\( foo \\)
    # We don't use $ as that's not enabled by MathJax by default.
    contents = re.sub(r":math:`(.+?)`", r"\\\\( \1 \\\\)", contents)

    return _rst_fields(_rst_footnotes(contents))
Convert reStructuredText elements to Markdown. We support the most common elements, but we do not aim to mirror the full complexity of the spec here.