Edit on GitHub

pdoc.extract

This module handles the interaction with Python's module system; that is, it loads the correct module based on whatever the user specified, and provides the rest of pdoc with some additional module metadata.

  1"""
  2This module handles the interaction with Python's module system,
  3that is it loads the correct module based on whatever the user specified,
  4and provides the rest of pdoc with some additional module metadata.
  5"""
  6from __future__ import annotations
  7
  8from collections.abc import Iterable
  9from collections.abc import Iterator
 10from collections.abc import Sequence
 11from contextlib import contextmanager
 12import importlib.util
 13import io
 14import linecache
 15import os
 16from pathlib import Path
 17import pkgutil
 18import platform
 19import re
 20import shutil
 21import subprocess
 22import sys
 23import traceback
 24import types
 25from unittest.mock import patch
 26import warnings
 27
 28import pdoc.doc_ast
 29import pdoc.docstrings
 30
 31
 32def walk_specs(specs: Sequence[Path | str]) -> list[str]:
 33    """
 34    This function processes a list of module specifications and returns a collection of module names, including all
 35    submodules, that should be processed by pdoc.
 36
 37    A module specification can either be the name of an installed module, or the path to a specific file or package.
 38    For example, the following strings are valid module specifications:
 39
 40     - `typing`
 41     - `collections.abc`
 42     - `./test/testdata/demo_long.py`
 43     - `./test/testdata/demopackage`
 44
 45    *This function has side effects:* See `parse_spec`.
 46    """
 47    all_modules: dict[str, None] = {}
 48    for spec in specs:
 49        if isinstance(spec, str) and spec.startswith("!"):
 50            ignore_pattern = re.compile(spec[1:])
 51            all_modules = {
 52                k: v for k, v in all_modules.items() if not ignore_pattern.match(k)
 53            }
 54            continue
 55
 56        modname = parse_spec(spec)
 57
 58        try:
 59            with mock_some_common_side_effects():
 60                modspec = importlib.util.find_spec(modname)
 61                if modspec is None:
 62                    raise ModuleNotFoundError(modname)
 63        except AnyException:
 64            warnings.warn(
 65                f"Cannot find spec for {modname} (from {spec}):\n{traceback.format_exc()}",
 66                stacklevel=2,
 67            )
 68        else:
 69            mod_info = pkgutil.ModuleInfo(
 70                None,  # type: ignore
 71                name=modname,
 72                ispkg=bool(modspec.submodule_search_locations),
 73            )
 74            for m in walk_packages2([mod_info]):
 75                if m.name in all_modules:
 76                    warnings.warn(
 77                        f"The module specification {spec!r} adds a module named {m.name}, but a module with this name "
 78                        f"has already been added. You may have accidentally repeated a module spec, or you are trying "
 79                        f"to document two modules with the same filename from two different directories, which does "
 80                        f"not work. Only one documentation page will be generated."
 81                    )
 82                all_modules[m.name] = None
 83
 84    if not all_modules:
 85        raise ValueError(
 86            f"No modules found matching spec: {', '.join(str(x) for x in specs)}"
 87        )
 88
 89    return list(all_modules)
 90
 91
 92def parse_spec(spec: Path | str) -> str:
 93    """
 94    This functions parses a user's module specification into a module identifier that can be imported.
 95    If both a local file/directory and an importable module with the same name exist, a warning will be printed.
 96
 97    *This function has side effects:* `sys.path` will be amended if the specification is a path.
 98    If this side effect is undesired, pass a module name instead.
 99    """
100    pspec = Path(spec)
101    if isinstance(spec, str) and (os.sep in spec or (os.altsep and os.altsep in spec)):
102        # We have a path separator, so it's definitely a filepath.
103        spec = pspec
104
105    if isinstance(spec, str) and (pspec.is_file() or (pspec / "__init__.py").is_file()):
106        # We have a local file with this name, but is there also a module with the same name?
107        try:
108            with mock_some_common_side_effects():
109                modspec = importlib.util.find_spec(spec)
110                if modspec is None:
111                    raise ModuleNotFoundError
112        except AnyException:
113            # Module does not exist, use local file.
114            spec = pspec
115        else:
116            # Module does exist. We now check if the local file/directory is the same (e.g. after pip install -e),
117            # and emit a warning if that's not the case.
118            origin = (
119                Path(modspec.origin).absolute() if modspec.origin else Path("unknown")
120            )
121            local_dir = Path(spec).absolute()
122            if local_dir not in (origin, origin.parent):
123                warnings.warn(
124                    f"{spec!r} may refer to either the installed Python module or the local file/directory with the "
125                    f"same name. pdoc will document the installed module, prepend './' to force documentation of the "
126                    f"local file/directory.\n"
127                    f" - Module location: {origin}\n"
128                    f" - Local file/directory: {local_dir}",
129                    RuntimeWarning,
130                )
131
132    if isinstance(spec, Path):
133        if spec.name == "__init__.py":
134            spec = spec.parent
135        if (spec.parent / "__init__.py").exists():
136            return parse_spec(spec.resolve().parent) + f".{spec.stem}"
137        parent_dir = str(spec.parent)
138        sys.path = [parent_dir] + [x for x in sys.path if x != parent_dir]
139        if spec.stem in sys.modules and sys.modules[spec.stem].__file__:
140            local_dir = spec.resolve()
141            file = sys.modules[spec.stem].__file__
142            assert file is not None  # make mypy happy
143            origin = Path(file).resolve()
144            if local_dir not in (origin, origin.parent, origin.with_suffix("")):
145                warnings.warn(
146                    f"pdoc cannot load {spec.stem!r} because a module with the same name is already imported in pdoc's "
147                    f"Python process. pdoc will document the loaded module from {origin} instead.",
148                    RuntimeWarning,
149                )
150        return spec.stem
151    else:
152        return spec
153
154
def _noop(*args, **kwargs):
    """Accept any arguments, do nothing, and return `None`."""
    return None
157
158
class _PdocDefusedPopen(subprocess.Popen):
    """
    A small wrapper around subprocess.Popen that converts most executions into no-ops.

    Unless the invoked program is on a small allow list or the `PDOC_ALLOW_EXEC` environment variable
    is set to a non-empty value, a warning is emitted and the `executable` is replaced with `echo`,
    so the original command never runs.
    """

    if platform.system() == "Windows":  # pragma: no cover
        _noop_exe = "echo.exe"
    else:  # pragma: no cover
        _noop_exe = "echo"

    def __init__(self, *args, **kwargs):  # pragma: no cover
        # The command may be passed positionally or via the `args` keyword.
        command = args[0] if args else kwargs.get("args")
        # Only non-empty argument lists are matched against the allow list (by their first element).
        # Strings and path-like commands are never considered allowed.
        program = (
            command[0] if isinstance(command, (list, tuple)) and command else None
        )
        # `shutil.which` may return None, so a missing program must never match the allow list.
        command_allowed = program is not None and program in (
            # these invocations may all come from https://github.com/python/cpython/blob/main/Lib/ctypes/util.py,
            # which we want to keep working.
            "/sbin/ldconfig",
            "ld",
            shutil.which("gcc") or shutil.which("cc"),
            shutil.which("objdump"),
            # https://github.com/mitmproxy/pdoc/issues/430: GitPython invokes git commands, which is also fine.
            "git",
        )
        if not command_allowed and os.environ.get("PDOC_ALLOW_EXEC", "") == "":
            # sys.stderr is patched, so we need to unpatch it for printing a warning.
            with patch("sys.stderr", new=sys.__stderr__):
                warnings.warn(
                    f"Suppressed execution of {command!r} during import. "
                    f"Set PDOC_ALLOW_EXEC=1 as an environment variable to allow subprocess execution.",
                    stacklevel=2,
                )
            kwargs["executable"] = self._noop_exe
        super().__init__(*args, **kwargs)
193
194
@contextmanager
def mock_some_common_side_effects():
    """
    Context manager applied while importing modules. It suppresses some common side effects of module import:
    `subprocess.Popen` is replaced with `_PdocDefusedPopen`, `os.startfile` with a no-op, and
    `sys.stdout`, `sys.stderr` and `sys.stdin` with fresh in-memory streams.
    For example, `import antigravity` normally causes a web browser to open, which we want to suppress.

    Note that this function must not be used for security purposes, it's easily bypassable.
    """
    with patch("subprocess.Popen", new=_PdocDefusedPopen):
        # create=True lets the patch succeed even if `os` has no `startfile` attribute.
        with patch("os.startfile", new=_noop, create=True):
            with patch("sys.stdout", new=io.StringIO()):
                with patch("sys.stderr", new=io.StringIO()):
                    with patch("sys.stdin", new=io.StringIO()):
                        yield
211
212
@mock_some_common_side_effects()
def load_module(module: str) -> types.ModuleType:
    """Import `module` by name and return the module object.

    Any `AnyException` raised during import is re-raised as a `RuntimeError`
    with the original exception attached as its cause."""
    try:
        loaded = importlib.import_module(module)
    except AnyException as exc:
        raise RuntimeError(f"Error importing {module}") from exc
    return loaded
222
223
# Tuple of exception classes for use in `except AnyException:` clauses around import machinery.
# KeyboardInterrupt is deliberately left out so that Ctrl+C still propagates.
AnyException = (SystemExit, GeneratorExit, Exception)
"""BaseException, but excluding KeyboardInterrupt.

Modules may raise SystemExit on import (which we want to catch),
but we don't want to catch a user's KeyboardInterrupt.
"""
230
231
def walk_packages2(
    modules: Iterable[pkgutil.ModuleInfo],
) -> Iterator[pkgutil.ModuleInfo]:
    """
    Yield every given module, followed recursively by all of its submodules.

    This function is similar to `pkgutil.walk_packages`, but respects a package's `__all__` attribute if specified.
    If `__all__` is defined, submodules not listed in `__all__` are excluded.
    `__main__` submodules are always skipped. Packages that fail to load produce a warning
    and are not descended into.
    """
    # Path entries already traversed during this call; each one is only scanned once.
    visited = set()

    for mod in modules:
        yield mod

        if not mod.ispkg:
            continue

        try:
            module = load_module(mod.name)
        except RuntimeError:
            warnings.warn(f"Error loading {mod.name}:\n{traceback.format_exc()}")
            continue

        exported = getattr(module, "__all__", None)

        search_path = []
        for entry in getattr(module, "__path__", None) or []:
            if entry in visited:
                continue
            visited.add(entry)
            search_path.append(entry)

        children = []
        for child in pkgutil.iter_modules(search_path, f"{mod.name}."):
            _, _, short_name = child.name.rpartition(".")
            if short_name == "__main__":
                continue  # https://github.com/mitmproxy/pdoc/issues/438
            if exported is not None and short_name not in exported:
                continue
            children.append(child)

        yield from walk_packages2(children)
271
272
def module_mtime(modulename: str) -> float | None:
    """Returns the time the specified module file was last modified, or `None` if this cannot be determined.
    The primary use of this is live-reloading modules on modification.

    `None` is returned if the module spec cannot be found, if it has no origin,
    or if the origin cannot be stat'ed."""
    try:
        with mock_some_common_side_effects():
            spec = importlib.util.find_spec(modulename)
    except AnyException:
        return None
    if spec is None or spec.origin is None:
        return None
    try:
        return Path(spec.origin).stat().st_mtime
    except OSError:
        # The origin is not necessarily a file on disk (e.g. "built-in" for built-in modules),
        # in which case there is no modification time to report.
        return None
285
286
def invalidate_caches(module_name: str) -> None:
    """
    Invalidate module cache to allow live-reloading of modules.

    Does nothing if `module_name` is not currently imported, or if it belongs to one of pdoc's own
    rendering dependencies. Otherwise the import, line and pdoc caches are cleared, and the module and
    all of its loaded submodules are reloaded; reload failures only produce a warning.
    """
    # Getting this right is tricky: reloading modules causes a bunch of surprising side effects.
    # The current best effort is to `importlib.reload` every loaded module at or below module_name.
    if module_name not in sys.modules:
        return

    # pdoc's own dependencies are excluded, as reloading them causes errors.
    # (A more extreme alternative would be to also skip everything located in the
    # stdlib/platstdlib directories reported by sysconfig.)
    top_level_package = module_name.partition(".")[0]
    if top_level_package in ("jinja2", "markupsafe", "markdown2", "pygments"):
        return

    importlib.invalidate_caches()
    linecache.clearcache()
    pdoc.doc.Module.from_name.cache_clear()
    pdoc.doc_ast._get_source.cache_clear()
    pdoc.docstrings.convert.cache_clear()

    targets = [
        name
        for name in sys.modules
        if name == module_name or name.startswith(f"{module_name}.")
    ]
    targets.sort()
    for modname in targets:
        if modname == "pdoc.render":
            # pdoc.render is stateful after configure(), so we don't want to reload it.
            continue
        try:
            candidate = sys.modules[modname]
            # Some entries are not real modules - one example is typing.io, which is a class.
            if isinstance(candidate, types.ModuleType):
                with mock_some_common_side_effects():
                    importlib.reload(candidate)
        except AnyException:
            warnings.warn(
                f"Error reloading {modname}:\n{traceback.format_exc()}",
                stacklevel=2,
            )
def walk_specs(specs: collections.abc.Sequence[pathlib.Path | str]) -> list[str]:
def walk_specs(specs: Sequence[Path | str]) -> list[str]:
    """
    Expand a list of module specifications into the names of all modules (including submodules)
    that pdoc should process.

    A module specification is either the name of an installed module or the path to a file or package,
    for example:

     - `typing`
     - `collections.abc`
     - `./test/testdata/demo_long.py`
     - `./test/testdata/demopackage`

    A string specification starting with `!` is treated as a regular expression; every module name
    collected *so far* that matches it (anchored at the start) is dropped again.

    Specifications that cannot be resolved only produce a warning. A `ValueError` is raised
    if no module is left at the end.

    *This function has side effects:* See `parse_spec`.
    """
    # A dict serves as an insertion-ordered set of module names.
    collected: dict[str, None] = {}
    for spec in specs:
        if isinstance(spec, str) and spec.startswith("!"):
            exclude = re.compile(spec[1:])
            for dropped in [name for name in collected if exclude.match(name)]:
                del collected[dropped]
            continue

        modname = parse_spec(spec)

        try:
            with mock_some_common_side_effects():
                modspec = importlib.util.find_spec(modname)
                if modspec is None:
                    raise ModuleNotFoundError(modname)
        except AnyException:
            warnings.warn(
                f"Cannot find spec for {modname} (from {spec}):\n{traceback.format_exc()}",
                stacklevel=2,
            )
            continue

        root = pkgutil.ModuleInfo(
            None,  # type: ignore
            name=modname,
            ispkg=bool(modspec.submodule_search_locations),
        )
        for found in walk_packages2([root]):
            if found.name in collected:
                warnings.warn(
                    f"The module specification {spec!r} adds a module named {found.name}, but a module with this name "
                    f"has already been added. You may have accidentally repeated a module spec, or you are trying "
                    f"to document two modules with the same filename from two different directories, which does "
                    f"not work. Only one documentation page will be generated."
                )
            collected[found.name] = None

    if not collected:
        spec_list = ", ".join(map(str, specs))
        raise ValueError(f"No modules found matching spec: {spec_list}")

    return list(collected)

This function processes a list of module specifications and returns a collection of module names, including all submodules, that should be processed by pdoc.

A module specification can either be the name of an installed module, or the path to a specific file or package. For example, the following strings are valid module specifications:

  • typing
  • collections.abc
  • ./test/testdata/demo_long.py
  • ./test/testdata/demopackage

This function has side effects: See parse_spec.

def parse_spec(spec: pathlib.Path | str) -> str:
 93def parse_spec(spec: Path | str) -> str:
 94    """
 95    This functions parses a user's module specification into a module identifier that can be imported.
 96    If both a local file/directory and an importable module with the same name exist, a warning will be printed.
 97
 98    *This function has side effects:* `sys.path` will be amended if the specification is a path.
 99    If this side effect is undesired, pass a module name instead.
100    """
101    pspec = Path(spec)
102    if isinstance(spec, str) and (os.sep in spec or (os.altsep and os.altsep in spec)):
103        # We have a path separator, so it's definitely a filepath.
104        spec = pspec
105
106    if isinstance(spec, str) and (pspec.is_file() or (pspec / "__init__.py").is_file()):
107        # We have a local file with this name, but is there also a module with the same name?
108        try:
109            with mock_some_common_side_effects():
110                modspec = importlib.util.find_spec(spec)
111                if modspec is None:
112                    raise ModuleNotFoundError
113        except AnyException:
114            # Module does not exist, use local file.
115            spec = pspec
116        else:
117            # Module does exist. We now check if the local file/directory is the same (e.g. after pip install -e),
118            # and emit a warning if that's not the case.
119            origin = (
120                Path(modspec.origin).absolute() if modspec.origin else Path("unknown")
121            )
122            local_dir = Path(spec).absolute()
123            if local_dir not in (origin, origin.parent):
124                warnings.warn(
125                    f"{spec!r} may refer to either the installed Python module or the local file/directory with the "
126                    f"same name. pdoc will document the installed module, prepend './' to force documentation of the "
127                    f"local file/directory.\n"
128                    f" - Module location: {origin}\n"
129                    f" - Local file/directory: {local_dir}",
130                    RuntimeWarning,
131                )
132
133    if isinstance(spec, Path):
134        if spec.name == "__init__.py":
135            spec = spec.parent
136        if (spec.parent / "__init__.py").exists():
137            return parse_spec(spec.resolve().parent) + f".{spec.stem}"
138        parent_dir = str(spec.parent)
139        sys.path = [parent_dir] + [x for x in sys.path if x != parent_dir]
140        if spec.stem in sys.modules and sys.modules[spec.stem].__file__:
141            local_dir = spec.resolve()
142            file = sys.modules[spec.stem].__file__
143            assert file is not None  # make mypy happy
144            origin = Path(file).resolve()
145            if local_dir not in (origin, origin.parent, origin.with_suffix("")):
146                warnings.warn(
147                    f"pdoc cannot load {spec.stem!r} because a module with the same name is already imported in pdoc's "
148                    f"Python process. pdoc will document the loaded module from {origin} instead.",
149                    RuntimeWarning,
150                )
151        return spec.stem
152    else:
153        return spec

This function parses a user's module specification into a module identifier that can be imported. If both a local file/directory and an importable module with the same name exist, a warning will be printed.

This function has side effects: sys.path will be amended if the specification is a path. If this side effect is undesired, pass a module name instead.

@contextmanager
def mock_some_common_side_effects():
@contextmanager
def mock_some_common_side_effects():
    """
    Context manager applied while importing modules. It suppresses some common side effects of module import:
    `subprocess.Popen` is replaced with `_PdocDefusedPopen`, `os.startfile` with a no-op, and
    `sys.stdout`, `sys.stderr` and `sys.stdin` with fresh in-memory streams.
    For example, `import antigravity` normally causes a web browser to open, which we want to suppress.

    Note that this function must not be used for security purposes, it's easily bypassable.
    """
    with patch("subprocess.Popen", new=_PdocDefusedPopen):
        # create=True lets the patch succeed even if `os` has no `startfile` attribute.
        with patch("os.startfile", new=_noop, create=True):
            with patch("sys.stdout", new=io.StringIO()):
                with patch("sys.stderr", new=io.StringIO()):
                    with patch("sys.stdin", new=io.StringIO()):
                        yield

This context manager is applied when importing modules. It mocks some common side effects that may happen upon module import. For example, import antigravity normally causes a web browser to open, which we want to suppress.

Note that this function must not be used for security purposes, it's easily bypassable.

@mock_some_common_side_effects()
def load_module(module: str) -> module:
@mock_some_common_side_effects()
def load_module(module: str) -> types.ModuleType:
    """Import `module` by name and return the module object.

    Any `AnyException` raised during import is re-raised as a `RuntimeError`
    with the original exception attached as its cause."""
    try:
        loaded = importlib.import_module(module)
    except AnyException as exc:
        raise RuntimeError(f"Error importing {module}") from exc
    return loaded

Try to import a module. If import fails, a RuntimeError is raised.

Returns the imported module.

AnyException = (<class 'SystemExit'>, <class 'GeneratorExit'>, <class 'Exception'>)

BaseException, but excluding KeyboardInterrupt.

Modules may raise SystemExit on import (which we want to catch), but we don't want to catch a user's KeyboardInterrupt.

def walk_packages2( modules: collections.abc.Iterable[pkgutil.ModuleInfo]) -> collections.abc.Iterator[pkgutil.ModuleInfo]:
def walk_packages2(
    modules: Iterable[pkgutil.ModuleInfo],
) -> Iterator[pkgutil.ModuleInfo]:
    """
    Yield every given module, followed recursively by all of its submodules.

    This function is similar to `pkgutil.walk_packages`, but respects a package's `__all__` attribute if specified.
    If `__all__` is defined, submodules not listed in `__all__` are excluded.
    `__main__` submodules are always skipped. Packages that fail to load produce a warning
    and are not descended into.
    """
    # Path entries already traversed during this call; each one is only scanned once.
    visited = set()

    for mod in modules:
        yield mod

        if not mod.ispkg:
            continue

        try:
            module = load_module(mod.name)
        except RuntimeError:
            warnings.warn(f"Error loading {mod.name}:\n{traceback.format_exc()}")
            continue

        exported = getattr(module, "__all__", None)

        search_path = []
        for entry in getattr(module, "__path__", None) or []:
            if entry in visited:
                continue
            visited.add(entry)
            search_path.append(entry)

        children = []
        for child in pkgutil.iter_modules(search_path, f"{mod.name}."):
            _, _, short_name = child.name.rpartition(".")
            if short_name == "__main__":
                continue  # https://github.com/mitmproxy/pdoc/issues/438
            if exported is not None and short_name not in exported:
                continue
            children.append(child)

        yield from walk_packages2(children)

For a given list of modules, recursively yield their names and all their submodules' names.

This function is similar to pkgutil.walk_packages, but respects a package's __all__ attribute if specified. If __all__ is defined, submodules not listed in __all__ are excluded.

def module_mtime(modulename: str) -> float | None:
def module_mtime(modulename: str) -> float | None:
    """Returns the time the specified module file was last modified, or `None` if this cannot be determined.
    The primary use of this is live-reloading modules on modification.

    `None` is returned if the module spec cannot be found, if it has no origin,
    or if the origin cannot be stat'ed."""
    try:
        with mock_some_common_side_effects():
            spec = importlib.util.find_spec(modulename)
    except AnyException:
        return None
    if spec is None or spec.origin is None:
        return None
    try:
        return Path(spec.origin).stat().st_mtime
    except OSError:
        # The origin is not necessarily a file on disk (e.g. "built-in" for built-in modules),
        # in which case there is no modification time to report.
        return None

Returns the time the specified module file was last modified, or None if this cannot be determined. The primary use of this is live-reloading modules on modification.

def invalidate_caches(module_name: str) -> None:
def invalidate_caches(module_name: str) -> None:
    """
    Invalidate module cache to allow live-reloading of modules.

    Does nothing if `module_name` is not currently imported, or if it belongs to one of pdoc's own
    rendering dependencies. Otherwise the import, line and pdoc caches are cleared, and the module and
    all of its loaded submodules are reloaded; reload failures only produce a warning.
    """
    # Getting this right is tricky: reloading modules causes a bunch of surprising side effects.
    # The current best effort is to `importlib.reload` every loaded module at or below module_name.
    if module_name not in sys.modules:
        return

    # pdoc's own dependencies are excluded, as reloading them causes errors.
    # (A more extreme alternative would be to also skip everything located in the
    # stdlib/platstdlib directories reported by sysconfig.)
    top_level_package = module_name.partition(".")[0]
    if top_level_package in ("jinja2", "markupsafe", "markdown2", "pygments"):
        return

    importlib.invalidate_caches()
    linecache.clearcache()
    pdoc.doc.Module.from_name.cache_clear()
    pdoc.doc_ast._get_source.cache_clear()
    pdoc.docstrings.convert.cache_clear()

    targets = [
        name
        for name in sys.modules
        if name == module_name or name.startswith(f"{module_name}.")
    ]
    targets.sort()
    for modname in targets:
        if modname == "pdoc.render":
            # pdoc.render is stateful after configure(), so we don't want to reload it.
            continue
        try:
            candidate = sys.modules[modname]
            # Some entries are not real modules - one example is typing.io, which is a class.
            if isinstance(candidate, types.ModuleType):
                with mock_some_common_side_effects():
                    importlib.reload(candidate)
        except AnyException:
            warnings.warn(
                f"Error reloading {modname}:\n{traceback.format_exc()}",
                stacklevel=2,
            )

Invalidate module cache to allow live-reloading of modules.