Edit on GitHub

pdoc.extract

This module handles the interaction with Python's module system; that is, it loads the correct module based on whatever the user specified, and provides the rest of pdoc with some additional module metadata.

  1"""
  2This module handles the interaction with Python's module system,
  3that is it loads the correct module based on whatever the user specified,
  4and provides the rest of pdoc with some additional module metadata.
  5"""
  6from __future__ import annotations
  7
  8import importlib.util
  9import io
 10import linecache
 11import os
 12import pkgutil
 13import platform
 14import re
 15import shutil
 16import subprocess
 17import sys
 18import traceback
 19import types
 20import warnings
 21from collections.abc import Iterable
 22from collections.abc import Iterator
 23from collections.abc import Sequence
 24from contextlib import contextmanager
 25from pathlib import Path
 26from unittest.mock import patch
 27
 28import pdoc.doc_ast
 29import pdoc.docstrings
 30
 31
 32def walk_specs(specs: Sequence[Path | str]) -> list[str]:
 33    """
 34    This function processes a list of module specifications and returns a collection of module names, including all
 35    submodules, that should be processed by pdoc.
 36
 37    A module specification can either be the name of an installed module, or the path to a specific file or package.
 38    For example, the following strings are valid module specifications:
 39
 40     - `typing`
 41     - `collections.abc`
 42     - `./test/testdata/demo_long.py`
 43     - `./test/testdata/demopackage`
 44
 45    *This function has side effects:* See `parse_spec`.
 46    """
 47    all_modules: dict[str, None] = {}
 48    for spec in specs:
 49        if isinstance(spec, str) and spec.startswith("!"):
 50            ignore_pattern = re.compile(spec[1:])
 51            all_modules = {
 52                k: v for k, v in all_modules.items() if not ignore_pattern.match(k)
 53            }
 54            continue
 55
 56        modname = parse_spec(spec)
 57
 58        try:
 59            with mock_some_common_side_effects():
 60                modspec = importlib.util.find_spec(modname)
 61                if modspec is None:
 62                    raise ModuleNotFoundError(modname)
 63        except AnyException:
 64            warnings.warn(
 65                f"Cannot find spec for {modname} (from {spec}):\n{traceback.format_exc()}",
 66                stacklevel=2,
 67            )
 68        else:
 69            mod_info = pkgutil.ModuleInfo(
 70                None,  # type: ignore
 71                name=modname,
 72                ispkg=bool(modspec.submodule_search_locations),
 73            )
 74            for m in walk_packages2([mod_info]):
 75                if m.name in all_modules:
 76                    warnings.warn(
 77                        f"The module specification {spec!r} adds a module named {m.name}, but a module with this name "
 78                        f"has already been added. You may have accidentally repeated a module spec, or you are trying "
 79                        f"to document two modules with the same filename from two different directories, which does "
 80                        f"not work. Only one documentation page will be generated."
 81                    )
 82                all_modules[m.name] = None
 83
 84    if not all_modules:
 85        raise ValueError(
 86            f"No modules found matching spec: {', '.join(str(x) for x in specs)}"
 87        )
 88
 89    return list(all_modules)
 90
 91
 92def parse_spec(spec: Path | str) -> str:
 93    """
 94    This functions parses a user's module specification into a module identifier that can be imported.
 95    If both a local file/directory and an importable module with the same name exist, a warning will be printed.
 96
 97    *This function has side effects:* `sys.path` will be amended if the specification is a path.
 98    If this side effect is undesired, pass a module name instead.
 99    """
100    pspec = Path(spec)
101    if isinstance(spec, str) and (os.sep in spec or (os.altsep and os.altsep in spec)):
102        # We have a path separator, so it's definitely a filepath.
103        spec = pspec
104
105    if isinstance(spec, str) and (pspec.is_file() or (pspec / "__init__.py").is_file()):
106        # We have a local file with this name, but is there also a module with the same name?
107        try:
108            with mock_some_common_side_effects():
109                modspec = importlib.util.find_spec(spec)
110                if modspec is None:
111                    raise ModuleNotFoundError
112        except AnyException:
113            # Module does not exist, use local file.
114            spec = pspec
115        else:
116            # Module does exist. We now check if the local file/directory is the same (e.g. after pip install -e),
117            # and emit a warning if that's not the case.
118            origin = (
119                Path(modspec.origin).absolute() if modspec.origin else Path("unknown")
120            )
121            local_dir = Path(spec).absolute()
122            if local_dir not in (origin, origin.parent):
123                warnings.warn(
124                    f"{spec!r} may refer to either the installed Python module or the local file/directory with the "
125                    f"same name. pdoc will document the installed module, prepend './' to force documentation of the "
126                    f"local file/directory.\n"
127                    f" - Module location: {origin}\n"
128                    f" - Local file/directory: {local_dir}",
129                    RuntimeWarning,
130                )
131
132    if isinstance(spec, Path):
133        if (spec.parent / "__init__.py").exists():
134            return parse_spec(spec.resolve().parent) + f".{spec.stem}"
135        parent_dir = str(spec.parent)
136        sys.path = [parent_dir] + [x for x in sys.path if x != parent_dir]
137        if spec.stem in sys.modules and sys.modules[spec.stem].__file__:
138            local_dir = spec.resolve()
139            file = sys.modules[spec.stem].__file__
140            assert file is not None  # make mypy happy
141            origin = Path(file).resolve()
142            if local_dir not in (origin, origin.parent, origin.with_suffix("")):
143                warnings.warn(
144                    f"pdoc cannot load {spec.stem!r} because a module with the same name is already imported in pdoc's "
145                    f"Python process. pdoc will document the loaded module from {origin} instead.",
146                    RuntimeWarning,
147                )
148        return spec.stem
149    else:
150        return spec
151
152
153def _noop(*args, **kwargs):
154    pass
155
156
157class _PdocDefusedPopen(subprocess.Popen):
158    """A small wrapper around subprocess.Popen that converts most executions into no-ops."""
159
160    if platform.system() == "Windows":  # pragma: no cover
161        _noop_exe = "echo.exe"
162    else:  # pragma: no cover
163        _noop_exe = "echo"
164
165    def __init__(self, *args, **kwargs):  # pragma: no cover
166        command_allowed = (
167            args
168            and args[0]
169            and args[0][0]
170            in (
171                # these invocations may all come from https://github.com/python/cpython/blob/main/Lib/ctypes/util.py,
172                # which we want to keep working.
173                "/sbin/ldconfig",
174                "ld",
175                shutil.which("gcc") or shutil.which("cc"),
176                shutil.which("objdump"),
177                # https://github.com/mitmproxy/pdoc/issues/430: GitPython invokes git commands, which is also fine.
178                "git",
179            )
180        )
181        if not command_allowed and os.environ.get("PDOC_ALLOW_EXEC", "") == "":
182            # sys.stderr is patched, so we need to unpatch it for printing a warning.
183            with patch("sys.stderr", new=sys.__stderr__):
184                warnings.warn(
185                    f"Suppressed execution of {args[0]!r} during import. "
186                    f"Set PDOC_ALLOW_EXEC=1 as an environment variable to allow subprocess execution.",
187                    stacklevel=2,
188                )
189            kwargs["executable"] = self._noop_exe
190        super().__init__(*args, **kwargs)
191
192
@contextmanager
def mock_some_common_side_effects():
    """
    Context manager that is active while modules are imported. It replaces a handful of things that commonly
    cause side effects on import: `subprocess.Popen`, `os.startfile`, and the three standard streams.
    For example, `import antigravity` normally causes a web browser to open, which we want to suppress.

    Note that this function must not be used for security purposes, it's easily bypassable.
    """
    defused_popen = patch("subprocess.Popen", new=_PdocDefusedPopen)
    # os.startfile does not exist on every platform, hence create=True.
    no_startfile = patch("os.startfile", new=_noop, create=True)
    fake_stdout = patch("sys.stdout", new=io.StringIO())
    fake_stderr = patch("sys.stderr", new=io.StringIO())
    fake_stdin = patch("sys.stdin", new=io.StringIO())

    with defused_popen, no_startfile, fake_stdout, fake_stderr, fake_stdin:
        yield
209
210
@mock_some_common_side_effects()
def load_module(module: str) -> types.ModuleType:
    """Import `module` by name and return the resulting module object.

    Any failure during import (see `AnyException`) is re-raised as a `RuntimeError`
    that is chained to the original exception."""
    try:
        loaded = importlib.import_module(module)
    except AnyException as exc:
        raise RuntimeError(f"Error importing {module}") from exc
    return loaded
220
221
# A tuple of exception classes, so that it can be used directly as `except AnyException:`.
AnyException = (SystemExit, GeneratorExit, Exception)
"""BaseException, but excluding KeyboardInterrupt.

Modules may raise SystemExit on import (which we want to catch),
but we don't want to catch a user's KeyboardInterrupt.
"""
228
229
def walk_packages2(
    modules: Iterable[pkgutil.ModuleInfo],
) -> Iterator[pkgutil.ModuleInfo]:
    """
    Yield every given module, and for packages, recursively all of their submodules.

    This function is similar to `pkgutil.walk_packages`, but respects a package's `__all__` attribute if specified.
    If `__all__` is defined, submodules not listed in `__all__` are excluded. `__main__` submodules are always
    skipped. Packages that fail to import produce a warning and are not descended into.
    """
    # Path entries already traversed by this invocation (recursive calls keep their own set).
    visited_paths = set()

    for info in modules:
        yield info

        if not info.ispkg:
            continue

        try:
            package = load_module(info.name)
        except RuntimeError:
            warnings.warn(f"Error loading {info.name}:\n{traceback.format_exc()}")
            continue

        exported = getattr(package, "__all__", None)

        # don't traverse path items we've seen before
        fresh_paths = []
        for entry in getattr(package, "__path__", None) or []:
            if entry in visited_paths:
                continue
            visited_paths.add(entry)
            fresh_paths.append(entry)

        children = []
        for child in pkgutil.iter_modules(fresh_paths, f"{info.name}."):
            leaf = child.name.rsplit(".", 1)[-1]
            # __main__ is skipped, see https://github.com/mitmproxy/pdoc/issues/438
            if leaf != "__main__" and (exported is None or leaf in exported):
                children.append(child)

        yield from walk_packages2(children)
269
270
def module_mtime(modulename: str) -> float | None:
    """Returns the time the specified module file was last modified, or `None` if this cannot be determined.
    The primary use of this is live-reloading modules on modification.

    `None` is returned if the module spec cannot be found, if the spec has no origin,
    or if the origin cannot be stat'ed."""
    try:
        with mock_some_common_side_effects():
            spec = importlib.util.find_spec(modulename)
    except AnyException:
        return None
    if spec is None or spec.origin is None:
        return None
    try:
        return Path(spec.origin).stat().st_mtime
    except OSError:
        # The origin is not necessarily a file on disk, e.g. "built-in" for `sys`,
        # or the file may have disappeared in the meantime.
        return None
283
284
def invalidate_caches(module_name: str) -> None:
    """
    Invalidate module cache to allow live-reloading of modules.

    Does nothing if `module_name` has not been imported, or if it belongs to one of pdoc's own
    dependencies. Otherwise pdoc's internal caches are cleared and `module_name` plus all of its
    already imported submodules are reloaded. Reload failures only produce a warning.
    """
    # Getting this right is tricky – reloading modules causes a bunch of surprising side effects.
    # Our current best effort is to call `importlib.reload` on all modules that start with module_name.
    if module_name not in sys.modules:
        return

    # We also exclude our own dependencies, which cause fun errors otherwise.
    # (A more extreme alternative would be to skip everything that lives in the standard library.)
    top_level_package = module_name.partition(".")[0]
    if top_level_package in ("jinja2", "markupsafe", "markdown2", "pygments"):
        return

    importlib.invalidate_caches()
    linecache.clearcache()
    pdoc.doc.Module.from_name.cache_clear()
    pdoc.doc_ast._get_source.cache_clear()
    pdoc.docstrings.convert.cache_clear()

    submodule_prefix = f"{module_name}."
    candidates = [
        name
        for name in sys.modules
        if name == module_name or name.startswith(submodule_prefix)
    ]
    candidates.sort()

    for modname in candidates:
        if modname == "pdoc.render":
            # pdoc.render is stateful after configure(), so we don't want to reload it.
            continue
        try:
            target = sys.modules[modname]
            if isinstance(target, types.ModuleType):
                with mock_some_common_side_effects():
                    importlib.reload(target)
            # else: some funky stuff going on - one example is typing.io, which is a class.
        except AnyException:
            warnings.warn(
                f"Error reloading {modname}:\n{traceback.format_exc()}",
                stacklevel=2,
            )
def walk_specs(specs: collections.abc.Sequence[pathlib.Path | str]) -> list[str]:
33def walk_specs(specs: Sequence[Path | str]) -> list[str]:
34    """
35    This function processes a list of module specifications and returns a collection of module names, including all
36    submodules, that should be processed by pdoc.
37
38    A module specification can either be the name of an installed module, or the path to a specific file or package.
39    For example, the following strings are valid module specifications:
40
41     - `typing`
42     - `collections.abc`
43     - `./test/testdata/demo_long.py`
44     - `./test/testdata/demopackage`
45
46    *This function has side effects:* See `parse_spec`.
47    """
48    all_modules: dict[str, None] = {}
49    for spec in specs:
50        if isinstance(spec, str) and spec.startswith("!"):
51            ignore_pattern = re.compile(spec[1:])
52            all_modules = {
53                k: v for k, v in all_modules.items() if not ignore_pattern.match(k)
54            }
55            continue
56
57        modname = parse_spec(spec)
58
59        try:
60            with mock_some_common_side_effects():
61                modspec = importlib.util.find_spec(modname)
62                if modspec is None:
63                    raise ModuleNotFoundError(modname)
64        except AnyException:
65            warnings.warn(
66                f"Cannot find spec for {modname} (from {spec}):\n{traceback.format_exc()}",
67                stacklevel=2,
68            )
69        else:
70            mod_info = pkgutil.ModuleInfo(
71                None,  # type: ignore
72                name=modname,
73                ispkg=bool(modspec.submodule_search_locations),
74            )
75            for m in walk_packages2([mod_info]):
76                if m.name in all_modules:
77                    warnings.warn(
78                        f"The module specification {spec!r} adds a module named {m.name}, but a module with this name "
79                        f"has already been added. You may have accidentally repeated a module spec, or you are trying "
80                        f"to document two modules with the same filename from two different directories, which does "
81                        f"not work. Only one documentation page will be generated."
82                    )
83                all_modules[m.name] = None
84
85    if not all_modules:
86        raise ValueError(
87            f"No modules found matching spec: {', '.join(str(x) for x in specs)}"
88        )
89
90    return list(all_modules)

This function processes a list of module specifications and returns a collection of module names, including all submodules, that should be processed by pdoc.

A module specification can either be the name of an installed module, or the path to a specific file or package. For example, the following strings are valid module specifications:

  • typing
  • collections.abc
  • ./test/testdata/demo_long.py
  • ./test/testdata/demopackage

This function has side effects: See parse_spec.

def parse_spec(spec: pathlib.Path | str) -> str:
 93def parse_spec(spec: Path | str) -> str:
 94    """
 95    This functions parses a user's module specification into a module identifier that can be imported.
 96    If both a local file/directory and an importable module with the same name exist, a warning will be printed.
 97
 98    *This function has side effects:* `sys.path` will be amended if the specification is a path.
 99    If this side effect is undesired, pass a module name instead.
100    """
101    pspec = Path(spec)
102    if isinstance(spec, str) and (os.sep in spec or (os.altsep and os.altsep in spec)):
103        # We have a path separator, so it's definitely a filepath.
104        spec = pspec
105
106    if isinstance(spec, str) and (pspec.is_file() or (pspec / "__init__.py").is_file()):
107        # We have a local file with this name, but is there also a module with the same name?
108        try:
109            with mock_some_common_side_effects():
110                modspec = importlib.util.find_spec(spec)
111                if modspec is None:
112                    raise ModuleNotFoundError
113        except AnyException:
114            # Module does not exist, use local file.
115            spec = pspec
116        else:
117            # Module does exist. We now check if the local file/directory is the same (e.g. after pip install -e),
118            # and emit a warning if that's not the case.
119            origin = (
120                Path(modspec.origin).absolute() if modspec.origin else Path("unknown")
121            )
122            local_dir = Path(spec).absolute()
123            if local_dir not in (origin, origin.parent):
124                warnings.warn(
125                    f"{spec!r} may refer to either the installed Python module or the local file/directory with the "
126                    f"same name. pdoc will document the installed module, prepend './' to force documentation of the "
127                    f"local file/directory.\n"
128                    f" - Module location: {origin}\n"
129                    f" - Local file/directory: {local_dir}",
130                    RuntimeWarning,
131                )
132
133    if isinstance(spec, Path):
134        if (spec.parent / "__init__.py").exists():
135            return parse_spec(spec.resolve().parent) + f".{spec.stem}"
136        parent_dir = str(spec.parent)
137        sys.path = [parent_dir] + [x for x in sys.path if x != parent_dir]
138        if spec.stem in sys.modules and sys.modules[spec.stem].__file__:
139            local_dir = spec.resolve()
140            file = sys.modules[spec.stem].__file__
141            assert file is not None  # make mypy happy
142            origin = Path(file).resolve()
143            if local_dir not in (origin, origin.parent, origin.with_suffix("")):
144                warnings.warn(
145                    f"pdoc cannot load {spec.stem!r} because a module with the same name is already imported in pdoc's "
146                    f"Python process. pdoc will document the loaded module from {origin} instead.",
147                    RuntimeWarning,
148                )
149        return spec.stem
150    else:
151        return spec

This function parses a user's module specification into a module identifier that can be imported. If both a local file/directory and an importable module with the same name exist, a warning will be printed.

This function has side effects: sys.path will be amended if the specification is a path. If this side effect is undesired, pass a module name instead.

@contextmanager
def mock_some_common_side_effects():
194@contextmanager
195def mock_some_common_side_effects():
196    """
197    This context manager is applied when importing modules. It mocks some common side effects that may happen upon
198    module import. For example, `import antigravity` normally causes a web browser to open, which we want to suppress.
199
200    Note that this function must not be used for security purposes, it's easily bypassable.
201    """
202    with patch("subprocess.Popen", new=_PdocDefusedPopen), patch(
203        "os.startfile", new=_noop, create=True
204    ), patch("sys.stdout", new=io.StringIO()), patch(
205        "sys.stderr", new=io.StringIO()
206    ), patch(
207        "sys.stdin", new=io.StringIO()
208    ):
209        yield

This context manager is applied when importing modules. It mocks some common side effects that may happen upon module import. For example, import antigravity normally causes a web browser to open, which we want to suppress.

Note that this function must not be used for security purposes; it's easily bypassable.

@mock_some_common_side_effects()
def load_module(module: str) -> module:
212@mock_some_common_side_effects()
213def load_module(module: str) -> types.ModuleType:
214    """Try to import a module. If import fails, a RuntimeError is raised.
215
216    Returns the imported module."""
217    try:
218        return importlib.import_module(module)
219    except AnyException as e:
220        raise RuntimeError(f"Error importing {module}") from e

Try to import a module. If import fails, a RuntimeError is raised.

Returns the imported module.

AnyException = (<class 'SystemExit'>, <class 'GeneratorExit'>, <class 'Exception'>)

BaseException, but excluding KeyboardInterrupt.

Modules may raise SystemExit on import (which we want to catch), but we don't want to catch a user's KeyboardInterrupt.

def walk_packages2( modules: collections.abc.Iterable[pkgutil.ModuleInfo]) -> collections.abc.Iterator[pkgutil.ModuleInfo]:
231def walk_packages2(
232    modules: Iterable[pkgutil.ModuleInfo],
233) -> Iterator[pkgutil.ModuleInfo]:
234    """
235    For a given list of modules, recursively yield their names and all their submodules' names.
236
237    This function is similar to `pkgutil.walk_packages`, but respects a package's `__all__` attribute if specified.
238    If `__all__` is defined, submodules not listed in `__all__` are excluded.
239    """
240
241    # noinspection PyDefaultArgument
242    def seen(p, m={}):  # pragma: no cover
243        if p in m:
244            return True
245        m[p] = True
246
247    for mod in modules:
248        yield mod
249
250        if mod.ispkg:
251            try:
252                module = load_module(mod.name)
253            except RuntimeError:
254                warnings.warn(f"Error loading {mod.name}:\n{traceback.format_exc()}")
255                continue
256
257            mod_all = getattr(module, "__all__", None)
258            # don't traverse path items we've seen before
259            path = [p for p in (getattr(module, "__path__", None) or []) if not seen(p)]
260
261            submodules = []
262            for submodule in pkgutil.iter_modules(path, f"{mod.name}."):
263                name = submodule.name.rpartition(".")[2]
264                if name == "__main__":
265                    continue  # https://github.com/mitmproxy/pdoc/issues/438
266                if mod_all is None or name in mod_all:
267                    submodules.append(submodule)
268
269            yield from walk_packages2(submodules)

For a given list of modules, recursively yield their names and all their submodules' names.

This function is similar to pkgutil.walk_packages, but respects a package's __all__ attribute if specified. If __all__ is defined, submodules not listed in __all__ are excluded.

def module_mtime(modulename: str) -> float | None:
272def module_mtime(modulename: str) -> float | None:
273    """Returns the time the specified module file was last modified, or `None` if this cannot be determined.
274    The primary use of this is live-reloading modules on modification."""
275    try:
276        with mock_some_common_side_effects():
277            spec = importlib.util.find_spec(modulename)
278    except AnyException:
279        pass
280    else:
281        if spec is not None and spec.origin is not None:
282            return Path(spec.origin).stat().st_mtime
283    return None

Returns the time the specified module file was last modified, or None if this cannot be determined. The primary use of this is live-reloading modules on modification.

def invalidate_caches(module_name: str) -> None:
286def invalidate_caches(module_name: str) -> None:
287    """
288    Invalidate module cache to allow live-reloading of modules.
289    """
290    # Getting this right is tricky – reloading modules causes a bunch of surprising side effects.
291    # Our current best effort is to call `importlib.reload` on all modules that start with module_name.
292    # We also exclude our own dependencies, which cause fun errors otherwise.
293    if module_name not in sys.modules:
294        return
295    if any(
296        module_name.startswith(f"{x}.") or x == module_name
297        for x in ("jinja2", "markupsafe", "markdown2", "pygments")
298    ):
299        return
300
301    # a more extreme alternative:
302    # filename = sys.modules[module_name].__file__
303    # if (
304    #    filename.startswith(sysconfig.get_path("platstdlib"))
305    #    or filename.startswith(sysconfig.get_path("stdlib"))
306    # ):
307    #     return
308
309    importlib.invalidate_caches()
310    linecache.clearcache()
311    pdoc.doc.Module.from_name.cache_clear()
312    pdoc.doc_ast._get_source.cache_clear()
313    pdoc.docstrings.convert.cache_clear()
314
315    prefix = f"{module_name}."
316    mods = sorted(
317        mod for mod in sys.modules if module_name == mod or mod.startswith(prefix)
318    )
319    for modname in mods:
320        if modname == "pdoc.render":
321            # pdoc.render is stateful after configure(), so we don't want to reload it.
322            continue
323        try:
324            if not isinstance(sys.modules[modname], types.ModuleType):
325                continue  # some funky stuff going on - one example is typing.io, which is a class.
326            with mock_some_common_side_effects():
327                importlib.reload(sys.modules[modname])
328        except AnyException:
329            warnings.warn(
330                f"Error reloading {modname}:\n{traceback.format_exc()}",
331                stacklevel=2,
332            )

Invalidate module cache to allow live-reloading of modules.