Coverage for src/auto_intersphinx/catalog.py: 93%
248 statements
« prev ^ index » next coverage.py v7.4.3, created at 2024-04-22 14:48 +0200
« prev ^ index » next coverage.py v7.4.3, created at 2024-04-22 14:48 +0200
1# SPDX-FileCopyrightText: Copyright © 2022 Idiap Research Institute <contact@idiap.ch>
2#
3# SPDX-License-Identifier: BSD-3-Clause
4"""This module contains instructions for documentation lookup."""
6from __future__ import annotations # not required for Python >= 3.10
8import collections.abc
9import importlib.metadata
10import importlib.resources
11import json
12import pathlib
13import re
14import shutil
15import typing
17import lxml.html
18import packaging.version
19import requests
21from sphinx.util import logging
# Module-level logger, obtained through Sphinx's logging utilities.
logger = logging.getLogger(__name__)


# One catalog entry: maps a section name to a {key: value} dictionary of
# strings.
PackageDictionaryType = dict[str, dict[str, str]]
"""Type for the internal values of :py:class:`Catalog`"""


# Resource handle for the ``catalog.json`` file located inside this package.
BUILTIN_CATALOG = importlib.resources.files(__package__).joinpath("catalog.json")
"""Base name for the catalog file distributed with this package."""


# packaging's VERSION_PATTERN, anchored at both ends and tolerating surrounding
# whitespace; compiled in verbose, case-insensitive mode.
PEP440_RE = re.compile(
    r"^\s*" + packaging.version.VERSION_PATTERN + r"\s*$",
    re.VERBOSE | re.IGNORECASE,
)
"""Regular expression for matching PEP-440 version numbers."""
41def _ensure_webdir(addr: str) -> str:
42 """Ensures the web-address ends in a /, and contains ``objects.inv``"""
43 if addr.endswith(".html"):
44 addr = addr[: addr.rfind("/")]
45 if not addr.endswith("/"):
46 addr += "/"
48 # objects = addr + "/" + "objects.inv"
49 # if requests.head(objects).ok:
50 # logger.error("Cannot find {objects}...")
51 # return None
53 return addr
56def _reorder_versions(vdict: dict[str, str]) -> dict[str, str]:
57 """Re-orders version dictionary by decreasing version."""
58 # nota bene: new dicts preserve insertion order
59 retval: dict[str, str] = {}
61 # these keys come always first, if available
62 protected = ("latest", "main", "master", "stable")
63 for key in protected:
64 if key in vdict:
65 retval[key] = vdict[key]
67 # next, are releases in reverse order
68 version_map = {
69 packaging.version.Version(k): k
70 for k in vdict.keys()
71 if (k not in protected) and PEP440_RE.match(k)
72 }
73 for version in sorted(version_map.keys(), reverse=True):
74 retval[version_map[version]] = vdict[version_map[version]]
76 # now, everything else
77 retval.update({k: v for k, v in vdict.items() if k not in retval})
79 return retval
def docurls_from_environment(package: str) -> dict[str, str]:
    """Checks installed package metadata for documentation URLs.

    The ``Project-URL`` entries of the installed distribution are scanned for
    a ``Documentation`` (or ``documentation``) label.  The first such address
    that serves a Sphinx inventory (``objects.inv``) is returned.


    Arguments:

        package: Name of the package you want to check


    Returns:

        A dictionary, that maps the version of the installed package to the
        URL of its documentation.  The key ``latest`` is used if the metadata
        carries no version.  The dictionary is empty if the package is not
        installed, declares no documentation URL, or if no inventory could be
        reached at the declared address.
    """
    try:
        md = importlib.metadata.metadata(package)
    except importlib.metadata.PackageNotFoundError:
        return {}

    for entry in md.get_all("Project-URL") or ():
        if not entry.startswith(("documentation, ", "Documentation, ")):
            continue

        addr = _ensure_webdir(entry.split(",", 1)[1].strip())

        try:
            # addr already ends with "/": do not insert a second separator
            found = requests.head(addr + "objects.inv").ok
        except requests.exceptions.RequestException:
            # an unreachable server is equivalent to "no inventory there"
            found = False

        if not found:
            continue

        # a missing field may surface either as a KeyError or as None
        try:
            version = md["version"]
        except KeyError:
            version = None
        return {version or "latest": addr}

    return {}
def docurls_from_rtd(package: str) -> dict[str, str]:
    """Checks readthedocs.org for documentation pointers for the package.

    Arguments:

        package: Name of the package to check on rtd.org - this must be the
            name it is known by at rtd.org and not necessarily the package
            name.  Some packages do have different names on rtd.org.


    Returns:

        A dictionary, which contains all versions of documentation available
        for the given package on RTD.  If the package's documentation is not
        available on RTD (or RTD cannot be reached), returns an empty
        dictionary.
    """
    url = f"https://readthedocs.org/projects/{package}/versions/"
    logger.debug(f"Reaching for `{url}'...")

    try:
        r = requests.get(url)
    except requests.exceptions.RequestException:
        return {}

    if not r.ok:
        return {}

    tree = lxml.html.fromstring(r.text)

    versions: dict[str, str] = {}
    for anchor in tree.xpath("//a[contains(@class, 'module-item-title')]"):
        href = anchor.get("href", "")
        # ignore anchors that carry no label, or no absolute address
        if anchor.text is None or not href.startswith("http"):
            continue
        versions[anchor.text] = _ensure_webdir(href)

    return versions
def _get_json(url: str) -> dict | None:
    """Downloads and decodes a JSON document.

    Arguments:

        url: The address to download the JSON document from.


    Returns:

        The decoded document, or ``None`` if the request failed, the server
        replied with an error status, or the reply is not valid JSON.
    """
    logger.debug(f"Reaching for `{url}'...")

    try:
        r = requests.get(url)
        if r.ok:
            return r.json()
    except (requests.exceptions.RequestException, ValueError):
        # ValueError covers JSON decoding errors that are not reported as a
        # requests exception
        pass

    return None
def _docurl_from_pypi_info(data: dict) -> tuple[str, str] | None:
    """Extracts a verified ``(version, URL)`` pair from one PyPI JSON reply."""
    info = data["info"]

    # PyPI reports ``null`` for releases that declare no project URL at all
    urls = info.get("project_urls") or {}
    addr = urls.get("Documentation") or urls.get("documentation")
    if not addr:
        return None

    addr = _ensure_webdir(addr)
    try:
        # addr already ends with "/": do not insert a second separator
        if requests.head(addr + "objects.inv").ok:
            return info["version"], addr
    except requests.exceptions.RequestException:
        # an unreachable server is equivalent to "no inventory there"
        pass

    return None


def docurls_from_pypi(package: str, max_entries: int) -> dict[str, str]:
    """Checks PyPI for documentation pointers for a given package.

    This procedure first looks up the main repo JSON entry, and then figures
    out all available versions of the package.  In a second step, and depending
    on the value of ``max_entries``, this function will retrieve the latest
    ``max_entries`` available on that particular package.  Only documentation
    addresses that serve a Sphinx inventory (``objects.inv``) are retained.


    Arguments:

        package: Name of the PyPI package you want to check

        max_entries: The maximum number of entries to lookup in PyPI.  A value
            of zero will download only the main package information and will
            hit PyPI only once.  A value bigger than zero will download at most
            the information from the last ``max_entries`` releases.  Finally, a
            negative value will imply the download of all available releases.


    Returns:

        A dictionary, that maps the version of the documentation found on PyPI
        to the URL.
    """
    versions: dict[str, str] = {}

    data = _get_json(f"https://pypi.org/pypi/{package}/json")
    if data is None:
        return versions

    found = _docurl_from_pypi_info(data)
    if found is not None:
        versions[found[0]] = found[1]

    # download further versions, if requested by user
    version_map = {
        packaging.version.Version(k): k
        for k in data["releases"]
        if PEP440_RE.match(k)
    }
    versions_to_probe = sorted(version_map, reverse=True)

    if max_entries >= 0:
        versions_to_probe = versions_to_probe[:max_entries]

    for k in versions_to_probe:
        data = _get_json(f"https://pypi.org/pypi/{package}/{version_map[k]}/json")
        if data is None:
            continue

        found = _docurl_from_pypi_info(data)
        if found is not None:
            versions[found[0]] = found[1]

    return versions
class Catalog(collections.abc.MutableMapping):
    """A type that can lookup and store information about Sphinx documents.

    The object is organised as a dictionary (mutable mapping type) with extra
    methods to handle information update from various sources.  Information is
    organised as dictionary mapping Python package names to another dictionary
    containing the following entries:

    * ``versions``: A dictionary mapping version numbers to URLs.  The keys
      have free form, albeit are mostly PEP440 version numbers.  Keywords such
      as ``stable``, ``latest``, ``master``, or ``main`` are typically found as
      well.
    * ``sources``: A dictionary mapping information sources for this particular
      entry.  Keys are one of ``pypi``, ``readthedocs`` or ``environment``.
      Values correspond to specific names used for the lookup of the
      information on those sources.


    Attributes:

        _data: Internal dictionary containing the mapping between package names
            the user can refer to, versions and eventual sources of such
            information.
    """

    _data: dict[str, PackageDictionaryType]

    def __init__(self) -> None:
        self.reset()

    def load(self, path: pathlib.Path) -> None:
        """Loads and replaces contents with those from the file."""
        # JSON files are UTF-8: do not depend on the locale's default encoding
        with path.open("rt", encoding="utf-8") as f:
            logger.debug(f"Loading package catalog from {str(path)}...")
            self._data = json.load(f)
            logger.debug(f"Loaded {len(self)} entries from {str(path)}")

    def loads(self, contents: str) -> None:
        """Loads and replaces contents with those from the string."""
        self._data = json.loads(contents)
        logger.debug(f"Loaded {len(self)} entries from string")

    def dump(self, path: pathlib.Path) -> None:
        """Saves the contents to a file, in JSON format.

        If ``path`` already exists, it is first copied to a backup file named
        after it, with a ``~`` appended.
        """
        if path.exists():
            backup = path.with_suffix(path.suffix + "~")
            logger.debug(f"Backing up: {str(path)} -> {str(backup)}...")
            shutil.copy(path, backup)  # backup

        with path.open("wt", encoding="utf-8") as f:
            logger.debug(
                f"Saving package catalog with {len(self)} entries at {str(path)}..."
            )
            json.dump(self._data, f, indent=2)
            f.write("\n")  # avoids pre-commit/self-update conflicting changes

    def dumps(self) -> str:
        """Returns the contents as a JSON-formatted string."""
        return json.dumps(self._data, indent=2)

    def reset(self) -> None:
        """Fully resets the internal catalog."""
        self._data = {}

    # mutable mapping operations, so this looks like a dictionary
    def __getitem__(self, key: str) -> PackageDictionaryType:
        return self._data[key]

    def __setitem__(self, key: str, value: PackageDictionaryType) -> None:
        self._data[key] = value

    def __delitem__(self, key: str) -> None:
        del self._data[key]

    def __len__(self) -> int:
        return len(self._data)

    def __iter__(self) -> typing.Iterator[str]:
        return iter(self._data)

    def __repr__(self) -> str:
        return repr(self._data)

    def _ensure_defaults(self, pkg: str) -> None:
        """Ensures a standardised setup for a package entry."""
        self.setdefault(pkg, {"versions": {}, "sources": {}})
        self[pkg].setdefault("versions", {})
        self[pkg].setdefault("sources", {})

    def _merge_versions(
        self, pkg: str, source: str, name: str, versions: dict[str, str]
    ) -> bool:
        """Merges ``versions`` found at ``source`` into the entry of ``pkg``.

        Returns ``True`` if there was anything to merge, ``False`` otherwise.
        """
        if not versions:
            return False

        self[pkg]["versions"].update(versions)
        self[pkg]["versions"] = _reorder_versions(self[pkg]["versions"])
        self[pkg]["sources"][source] = name
        return True

    def update_versions_from_environment(self, pkg: str, name: str | None) -> bool:
        """Replaces package documentation URLs using information from current
        Python environment.

        Arguments:

            pkg: Name of the package as one would find in pypi.org.  This
               name can be different than that of the Python package
               itself.

            name: This is the name of the package as installed on the current
                environment.  Sometimes, this name can be different than that
                of the Python package itself.  If this value is set to
                ``None``, then we just use ``pkg`` as the name to lookup.


        Returns:

            ``True``, if the update was successful (found versions), or
            ``False``, otherwise.
        """
        self._ensure_defaults(pkg)

        name = name or pkg

        logger.debug(f"{pkg}: checking current Python environment for {name}...")

        versions = docurls_from_environment(name)
        logger.debug(
            f"{pkg}: Found {len(versions)} doc URL(s) at current Python environment"
        )

        return self._merge_versions(pkg, "environment", name, versions)

    def update_versions_from_rtd(self, pkg: str, name: str | None) -> bool:
        """Replaces package documentation URLs using information from
        readthedocs.org.

        Arguments:

            pkg: Name of the Python package to update versions for.

            name: This is the name of the package on readthedocs.org.  Often,
                this name is different than that of the Python package itself.
                If this value is set to ``None``, then we just use ``pkg`` as
                the name to lookup.


        Returns:

            ``True``, if the update was successful (found versions), or
            ``False``, otherwise.
        """
        self._ensure_defaults(pkg)

        name = name or pkg

        logger.debug(f"{pkg}: checking readthedocs.org for {name}...")

        versions = docurls_from_rtd(name)
        logger.debug(f"{pkg}: Found {len(versions)} doc URL(s) at readthedocs.org")

        return self._merge_versions(pkg, "readthedocs", name, versions)

    def update_versions_from_pypi(
        self, pkg: str, name: str | None, max_entries: int
    ) -> bool:
        """Replaces package documentation URLs using information from pypi.org.

        Arguments:

            pkg: Name of the package as one would find in pypi.org.  This
               name can be different than that of the Python package
               itself.

            name: This is the name of the package on pypi.org.  Sometimes, this
                name can be different than that of the Python package itself.
                If this value is set to ``None``, then we just use ``pkg`` as
                the name to lookup.

            max_entries: The maximum number of entries to lookup in PyPI.  A
                value of zero will download only the main package information
                and will hit PyPI only once.  A value bigger than zero will
                download at most the information from the last ``max_entries``
                releases.  Finally, a negative value will imply the download of
                all available releases.


        Returns:

            ``True``, if the update was successful (found versions), or
            ``False``, otherwise.
        """
        self._ensure_defaults(pkg)

        name = name or pkg

        logger.debug(f"{pkg}: checking pypi.org for {name}...")

        versions = docurls_from_pypi(name, max_entries)
        logger.debug(f"{pkg}: Found {len(versions)} doc URL(s) at pypi.org")

        return self._merge_versions(pkg, "pypi", name, versions)

    def update_versions(
        self,
        pkgs: typing.Iterable[str],
        order: typing.Iterable[str] = ("environment", "readthedocs", "pypi"),
        names: dict[str, dict[str, str | None]] | None = None,
        pypi_max_entries: int = 0,
        keep_going: bool = False,
    ) -> None:
        """Updates versions for a list of packages in this catalog.

        This method will add a list of packages defined by ``pkgs`` (list of
        names) into its own catalog.  The order of look-ups by default is set
        by the ``order``, and it is the following:

        1. Current Python environment (``environment``)
        2. readthedocs.org (``readthedocs``)
        3. PyPI (``pypi``)


        Arguments:

            pkgs: List of packages that will have their versions updated

            order: A list, containing the order in which lookup will happen.
                There are only 3 possible keys that can be used here:
                ``environment``, which stands for finding package metadata from
                the currently installed Python environment, ``readthedocs``,
                which will trigger readthedocs.org lookups, and ``pypi``, which
                will trigger pypi.org lookups from uploaded packages.

            names: A dictionary, that eventually maps source names (as in
                ``order``) to another dictionary that maps package names to
                their supposed names on readthedocs.org, pypi.org or the
                current environment.  If keys for various packages are not
                available, then their package names are used.  If the keys
                exist, but are set to ``None``, then lookup for that particular
                source is skipped.  Defaults to no mapping at all.

            pypi_max_entries: The maximum number of entries to lookup in PyPI.
                A value of zero will download only the main package information
                and will hit PyPI only once.  A value bigger than zero will
                download at most the information from the last ``max_entries``
                releases.  Finally, a negative value will imply the download of
                all available releases.

            keep_going: By default, the method stops adding a package when a
                hit is found (in either of these sources of information).  If
                the flag ``keep_going`` is set to ``True`` (defaults to
                ``False``), then it merges information from all sources.  Note
                that some of this information may be repetitive.


        Raises:

            RuntimeError: If an entry of ``order`` is not one of the supported
                sources, and that entry is reached during the lookup.
        """
        names = names or {}

        # ``order`` is traversed once per package: make it re-iterable
        order = tuple(order)

        updaters: dict[str, typing.Callable[[str, str], bool]] = {
            "environment": self.update_versions_from_environment,
            "readthedocs": self.update_versions_from_rtd,
            "pypi": lambda pkg, name: self.update_versions_from_pypi(
                pkg, name, pypi_max_entries
            ),
        }

        for pkg in pkgs:
            for action in order:
                updater = updaters.get(action)
                if updater is None:
                    raise RuntimeError(f"Unrecognized source: {action}")

                name = names.get(action, {}).get(pkg, pkg)
                if name is None:
                    continue  # lookup explicitly disabled for this source

                if updater(pkg, name) and not keep_going:
                    break

    def self_update(self) -> None:
        """Runs a self-update procedure, by re-looking up known sources.

        Only the sources recorded for each package are consulted again: a
        source that is not listed in the ``sources`` of an entry is skipped
        for that entry.
        """
        # organises the names as expected by update_versions()
        names: dict[str, dict[str, str | None]] = {
            "environment": {},
            "readthedocs": {},
            "pypi": {},
        }
        for pkg, info in self.items():
            sources = info.get("sources", {})
            for src in names:
                names[src][pkg] = sources.get(src)

        # iterate over a snapshot, as entries are modified along the way
        self.update_versions(pkgs=list(self.keys()), names=names)
def _string2version(v: str) -> packaging.version.Version | None:
    """Parses a catalog version string into a version object.

    Every ``.x`` placeholder is removed from the input before parsing, so that:

    * ``1.2.3`` is parsed as is
    * ``1.2.x`` is parsed like ``1.2``
    * ``1.x.x`` is parsed like ``1``
    * anything that is still not a valid version is discarded


    Arguments:

        v: a string containing the version number to be parsed, like the ones
            in the catalog


    Returns:

        The parsed version object, or ``None`` if the string (without its
        placeholders) is not a valid version number.
    """
    without_placeholders = v.replace(".x", "")

    try:
        parsed = packaging.version.Version(without_placeholders)
    except packaging.version.InvalidVersion:
        return None

    return parsed
552def _prepare_versions(versions: dict[str, str]) -> dict[str, str]:
553 """Prepares a dictionary of versions for structured lookups.
555 This procedure:
557 1. Ensures there is one ``latest`` and ``stable`` entries in the input
558 dictionary
559 2. Augment the version dictionary with PEP-440 version numbers (e.g.
560 annotates ``v2.2.0`` -> ``2.2.0``, or ``1.x`` -> ``1``)
563 Arguments:
565 versions: A dictionary that maps release version (and aliases such as
566 ``stable`` or ``latest`` to URLs that contain Sphinx-generated
567 documentation.
570 Returns:
572 A dictionary with keys that correspond to parsed versions and aliases.
573 """
574 if not versions:
575 return versions
577 # see what each valid number means
578 version_map = {_string2version(k): k for k in versions.keys()}
579 sorted_versions = sorted([k for k in version_map.keys() if k is not None])
581 retval: dict[str, str] = {}
582 if sorted_versions:
583 # there is at least 1 (valid) version number
584 latest = sorted_versions[-1]
585 retval["latest"] = versions.get("latest", versions[version_map[latest]])
587 stable_versions = [
588 k for k in sorted_versions if not (k.is_prerelease or k.is_devrelease)
589 ]
590 if stable_versions:
591 stable = stable_versions[-1]
592 else:
593 stable = latest
594 retval["stable"] = versions.get("stable", versions[version_map[stable]])
596 # fill-in the remainder of the versions, leave latest on top
597 for k in reversed(sorted_versions):
598 retval[version_map[k]] = versions[version_map[k]]
599 if ".x" in version_map[k]:
600 # copy to a shortened version number as well
601 retval[version_map[k].replace(".x", "")] = versions[version_map[k]]
602 elif k.public != version_map[k]:
603 # copy a standardised version number as well
604 retval[k.public] = versions[version_map[k]]
606 else:
607 # there is either nothing, or just aliases such as stable/latest
608 retval["latest"] = (
609 versions.get("latest")
610 or versions.get("stable")
611 or versions.get("master")
612 or versions.get("main")
613 or ""
614 )
615 retval["stable"] = (
616 versions.get("stable")
617 or versions.get("latest")
618 or versions.get("master")
619 or versions.get("main")
620 or ""
621 )
623 return retval
class LookupCatalog:
    """A catalog that guarantees standardised version lookups.

    Arguments:

        catalog: The catalog to use as base for the lookup.
    """

    def __init__(self, catalog: Catalog):
        self._catalog = catalog
        self.reset()

    def reset(self) -> None:
        """Internally creates all possible aliases for package names and
        versions.

        This method will expand the catalog package names and version
        numbers so that the user can refer to these using environment,
        readthedocs.org or pypi.org names for packages, and PEP-440
        compatible strings for version names during the lookup.

        The catalog associated to this lookup is not modified in this
        process.  All augmentations are built-into the object instance.
        """
        self._version_map: dict[str, dict[str, str]] = {}
        self._package_map: dict[str, str] = {}

        for pkg in self._catalog.keys():
            self._version_map[pkg] = _prepare_versions(self._catalog[pkg]["versions"])

            # translations from Python, rtd.org or pypi.org names
            for alias in self._catalog[pkg]["sources"].values():
                self._package_map[alias] = pkg

        # catalog names are registered last, so that the alias of one package
        # can never hide another package of the catalog
        for pkg in self._catalog.keys():
            self._package_map[pkg] = pkg

    def get(self, pkg: str, version: str | None, default: typing.Any = None):
        """Accesses one single ``pkg/version`` documentation URL.

        Arguments:

            pkg: The package name, as available on the catalog or through one
                of its environment, readthedocs.org or pypi.org names.

            version: The version of the package to search for.  This must be
                either an identifier from readthedocs.org or pypi.org, or a
                valid PEP-440 version number as a string.

            default: The default value to return in case we do not find a
                match.


        Returns:

            If a match is found, returns the URL for the documentation.
            Otherwise, returns the ``default`` value.
        """
        if pkg not in self._package_map:
            return default

        # versions are indexed by catalog name: translate aliases first
        known_versions = self._version_map[self._package_map[pkg]]
        if version not in known_versions:
            return default

        return known_versions[version]