Coverage for src/auto_intersphinx/catalog.py: 93%

248 statements  

« prev     ^ index     » next       coverage.py v7.4.3, created at 2024-04-22 14:48 +0200

1# SPDX-FileCopyrightText: Copyright © 2022 Idiap Research Institute <contact@idiap.ch> 

2# 

3# SPDX-License-Identifier: BSD-3-Clause 

4"""This module contains instructions for documentation lookup.""" 

5 

6from __future__ import annotations # not required for Python >= 3.10 

7 

8import collections.abc 

9import importlib.metadata 

10import importlib.resources 

11import json 

12import pathlib 

13import re 

14import shutil 

15import typing 

16 

17import lxml.html 

18import packaging.version 

19import requests 

20 

21from sphinx.util import logging 

22 

# Module-wide logger, obtained through the Sphinx logging wrapper imported above.
logger = logging.getLogger(__name__)


# Shape of one catalog entry: section name -> {key -> value}.
PackageDictionaryType = dict[str, dict[str, str]]
"""Type for the internal values of :py:class:`Catalog`"""


# Resource handle for the ``catalog.json`` file living next to this module.
BUILTIN_CATALOG = importlib.resources.files(__package__).joinpath("catalog.json")
"""Base name for the catalog file distributed with this package."""


# The PEP-440 pattern from ``packaging``, anchored and tolerant to surrounding
# whitespace.  ``re.VERBOSE`` is needed because the upstream pattern is written
# in verbose form.
PEP440_RE = re.compile(
    "".join((r"^\s*", packaging.version.VERSION_PATTERN, r"\s*$")),
    flags=re.IGNORECASE | re.VERBOSE,
)
"""Regular expression for matching PEP-440 version numbers."""

39 

40 

41def _ensure_webdir(addr: str) -> str: 

42 """Ensures the web-address ends in a /, and contains ``objects.inv``""" 

43 if addr.endswith(".html"): 

44 addr = addr[: addr.rfind("/")] 

45 if not addr.endswith("/"): 

46 addr += "/" 

47 

48 # objects = addr + "/" + "objects.inv" 

49 # if requests.head(objects).ok: 

50 # logger.error("Cannot find {objects}...") 

51 # return None 

52 

53 return addr 

54 

55 

def _reorder_versions(vdict: dict[str, str]) -> dict[str, str]:
    """Returns a copy of ``vdict`` with its keys sorted for presentation.

    The resulting order is:

    1. The aliases ``latest``, ``main``, ``master`` and ``stable`` (in this
       order, when present)
    2. Keys matching :py:data:`PEP440_RE`, from the highest to the lowest
       version
    3. All remaining keys, in their original relative order


    Arguments:

        vdict: Dictionary mapping version names to URLs


    Returns:

        A new dictionary with the same items, re-ordered as described above.
    """
    aliases = ("latest", "main", "master", "stable")

    # dictionaries preserve insertion order: filling ``ordered`` in the right
    # sequence is all that is required
    ordered: dict[str, str] = {
        alias: vdict[alias] for alias in aliases if alias in vdict
    }

    # maps parsed version objects back to the label used in ``vdict``
    parsed = {}
    for label in vdict:
        if label not in aliases and PEP440_RE.match(label):
            parsed[packaging.version.Version(label)] = label

    for release in sorted(parsed, reverse=True):
        label = parsed[release]
        ordered[label] = vdict[label]

    # whatever was not placed yet goes last
    for label, url in vdict.items():
        if label not in ordered:
            ordered[label] = url

    return ordered

80 

81 

def docurls_from_environment(package: str) -> dict[str, str]:
    """Checks installed package metadata for documentation URLs.

    The ``Project-URL`` metadata entries of the installed package are scanned
    for one labelled ``Documentation`` (or ``documentation``).  The first such
    entry whose directory serves an ``objects.inv`` file is returned.

    Arguments:

        package: Name of the package you want to check


    Returns:

        A dictionary with a single entry, mapping the installed version of the
        package (or ``latest``, if the metadata carries no version) to the
        documentation URL.  The dictionary is empty if the package is not
        installed, declares no documentation URL, or no declared URL could be
        verified.
    """
    try:
        md = importlib.metadata.metadata(package)
    except importlib.metadata.PackageNotFoundError:
        return {}

    # get_all() returns None when the field is absent
    for entry in md.get_all("Project-URL") or ():
        if not entry.startswith(("documentation, ", "Documentation, ")):
            continue

        addr = _ensure_webdir(entry.split(",", 1)[1].strip())

        try:
            # ``addr`` already ends in "/": do not insert a second slash
            found = requests.head(addr + "objects.inv").ok
        except requests.exceptions.RequestException:
            # best-effort probe, like the other lookups of this module
            found = False

        if not found:
            continue

        # depending on the Python version, a missing key either raises
        # KeyError or evaluates to None - handle both
        try:
            version = md["version"]
        except KeyError:
            version = None

        return {version or "latest": addr}

    return {}

115 

116 

def docurls_from_rtd(package: str) -> dict[str, str]:
    """Checks readthedocs.org for documentation pointers for the package.

    The project's "versions" page is downloaded and every link carrying the
    ``module-item-title`` class with an absolute (``http...``) target is
    collected.

    Arguments:

        package: Name of the package to check on rtd.org - this must be the
            name it is known by at rtd.org and not necessarily the package
            name.  Some packages do have different names on rtd.org.


    Returns:

        A dictionary mapping the text of each version link to its
        (directory-normalised) URL.  If the page cannot be retrieved, returns
        an empty dictionary.
    """
    try:
        url = f"https://readthedocs.org/projects/{package}/versions/"
        logger.debug(f"Reaching for `{url}'...")
        response = requests.get(url)
        if not response.ok:
            return {}

        page = lxml.html.fromstring(response.text)
        found: dict[str, str] = {}
        for anchor in page.xpath("//a[contains(@class, 'module-item-title')]"):
            href = anchor.attrib["href"]
            if href.startswith("http"):
                found[anchor.text] = _ensure_webdir(href)
        return found

    except requests.exceptions.RequestException:
        # network trouble is treated the same as "nothing found"
        return {}

149 

150 

def _get_json(url: str) -> dict | None:
    """Downloads ``url`` and decodes the response body as JSON.

    Arguments:

        url: The address to retrieve


    Returns:

        The decoded payload, or ``None`` if the response status is not OK or
        a :py:class:`requests.exceptions.RequestException` was raised.
    """
    logger.debug(f"Reaching for `{url}'...")
    try:
        response = requests.get(url)
        return response.json() if response.ok else None
    except requests.exceptions.RequestException:
        return None

162 

163 

def _pypi_docurl(data: dict) -> tuple[str, str] | None:
    """Extracts a verified ``(version, url)`` pair from a PyPI JSON payload.

    Returns ``None`` if the payload declares no documentation URL, or if no
    ``objects.inv`` file could be reached at that address.
    """
    info = data.get("info") or {}

    # ``project_urls`` may be null when a release declares no URLs at all
    urls = info.get("project_urls") or {}
    addr = urls.get("Documentation") or urls.get("documentation")
    if not addr:
        return None

    addr = _ensure_webdir(addr)
    try:
        # ``addr`` already ends in "/": do not insert a second slash
        reachable = requests.head(addr + "objects.inv").ok
    except requests.exceptions.RequestException:
        # best-effort probe, like the other lookups of this module
        return None

    return (info["version"], addr) if reachable else None


def docurls_from_pypi(package: str, max_entries: int) -> dict[str, str]:
    """Checks PyPI for documentation pointers for a given package.

    This procedure first looks up the main repo JSON entry, and then figures
    out all available versions of the package.  In a second step, and depending
    on the value of ``max_entries``, this function will retrieve the latest
    ``max_entries`` available on that particular package.


    Arguments:

        package: Name of the PyPI package you want to check

        max_entries: The maximum number of entries to lookup in PyPI.  A value
            of zero will download only the main package information and will
            hit PyPI only once.  A value bigger than zero will download at most
            the information from the last ``max_entries`` releases.  Finally, a
            negative value will imply the download of all available releases.


    Returns:

        A dictionary, that maps the version of the documentation found on PyPI
        to the URL.  Only URLs at which an ``objects.inv`` file could be
        reached are included.
    """
    versions: dict[str, str] = {}
    data = _get_json(f"https://pypi.org/pypi/{package}/json")
    if data is None:
        return versions

    hit = _pypi_docurl(data)
    if hit is not None:
        versions[hit[0]] = hit[1]

    # download further versions, if requested by user; only release names
    # that are valid PEP-440 versions can be ranked
    version_map = {
        packaging.version.Version(k): k
        for k in (data.get("releases") or {})
        if PEP440_RE.match(k)
    }
    versions_to_probe = sorted(version_map, reverse=True)

    if max_entries >= 0:
        versions_to_probe = versions_to_probe[:max_entries]

    for k in versions_to_probe:
        data = _get_json(f"https://pypi.org/pypi/{package}/{version_map[k]}/json")
        if data is None:
            continue

        hit = _pypi_docurl(data)
        if hit is not None:
            versions[hit[0]] = hit[1]

    return versions

225 

226 

class Catalog(collections.abc.MutableMapping):
    """A type that can lookup and store information about Sphinx documents.

    The object is organised as a dictionary (mutable mapping type) with extra
    methods to handle information update from various sources.  Information is
    organised as dictionary mapping Python package names to another dictionary
    containing the following entries:

    * ``versions``: A dictionary mapping version numbers to URLs.  The keys
      have free form, albeit are mostly PEP440 version numbers.  Keywords such
      as ``stable``, ``latest``, ``master``, or ``main`` are typically found as
      well.
    * ``sources``: A dictionary mapping information sources for this particular
      entry.  Keys are one of ``pypi``, ``readthedocs`` or ``environment``.
      Values correspond to specific names used for the lookup of the
      information on those sources.


    Attributes:

        _data: Internal dictionary containing the mapping between package names
            the user can refer to, versions and eventual sources of such
            information.
    """

    _data: dict[str, PackageDictionaryType]

    def __init__(self) -> None:
        self.reset()

    def load(self, path: pathlib.Path) -> None:
        """Loads and replaces contents with those from the file."""
        with path.open("rt", encoding="utf-8") as f:
            logger.debug(f"Loading package catalog from {str(path)}...")
            self._data = json.load(f)
            logger.debug(f"Loaded {len(self)} entries from {str(path)}")

    def loads(self, contents: str) -> None:
        """Loads and replaces contents with those from the string."""
        self._data = json.loads(contents)
        logger.debug(f"Loaded {len(self)} entries from string")

    def dump(self, path: pathlib.Path) -> None:
        """Saves the catalog contents, as JSON, to the file.

        If ``path`` already exists, it is first copied to a backup file named
        after it, with a ``~`` appended.
        """
        if path.exists():
            backup = path.with_suffix(path.suffix + "~")
            logger.debug(f"Backing up: {str(path)} -> {str(backup)}...")
            shutil.copy(path, backup)  # backup

        with path.open("wt", encoding="utf-8") as f:
            logger.debug(
                f"Saving package catalog with {len(self)} entries at {str(path)}..."
            )
            json.dump(self._data, f, indent=2)
            f.write("\n")  # avoids pre-commit/self-update conflicting changes

    def dumps(self) -> str:
        """Returns the catalog contents as a JSON-formatted string."""
        return json.dumps(self._data, indent=2)

    def reset(self) -> None:
        """Fully resets the internal catalog."""
        self._data = {}

    # mutable mapping operations, so this looks like a dictionary
    def __getitem__(self, key: str) -> PackageDictionaryType:
        return self._data[key]

    def __setitem__(self, key: str, value: PackageDictionaryType) -> None:
        self._data[key] = value

    def __delitem__(self, key: str) -> None:
        del self._data[key]

    def __len__(self) -> int:
        return len(self._data)

    def __iter__(self) -> typing.Iterator[str]:
        return iter(self._data)

    def __repr__(self) -> str:
        return repr(self._data)

    def _ensure_defaults(self, pkg: str) -> None:
        """Ensures a standardised setup for a package entry."""
        self.setdefault(pkg, {"versions": {}, "sources": {}})
        self[pkg].setdefault("versions", {})
        self[pkg].setdefault("sources", {})

    def _merge_versions(
        self, pkg: str, source: str, name: str, versions: dict[str, str]
    ) -> bool:
        """Merges ``versions`` into the entry of ``pkg``, if there are any.

        When ``versions`` is not empty, the versions of ``pkg`` are updated
        and re-ordered, and ``name`` is recorded as the name used on
        ``source``.  Returns ``True`` if ``versions`` was not empty.
        """
        if versions:
            self[pkg]["versions"].update(versions)
            self[pkg]["versions"] = _reorder_versions(self[pkg]["versions"])
            self[pkg]["sources"][source] = name

        return len(versions) > 0

    def update_versions_from_environment(self, pkg: str, name: str | None) -> bool:
        """Replaces package documentation URLs using information from current
        Python environment.

        Arguments:

            pkg: Name of the package as one would find in pypi.org.  This
               name can be different than that of the Python package
               itself.

            name: This is the name of the package as installed on the current
               environment.  Sometimes, this name can be different than that of
               the Python package itself.  If this value is set to ``None``,
               then we just use ``pkg`` as the name to lookup.


        Returns:

            ``True``, if the update was successful (found versions), or
            ``False``, otherwise.
        """

        self._ensure_defaults(pkg)

        name = name or pkg

        logger.debug(f"{pkg}: checking current Python environment for {name}...")

        versions = docurls_from_environment(name)
        logger.debug(
            f"{pkg}: Found {len(versions)} doc URL(s) at current Python environment"
        )

        return self._merge_versions(pkg, "environment", name, versions)

    def update_versions_from_rtd(self, pkg: str, name: str | None) -> bool:
        """Replaces package documentation URLs using information from
        readthedocs.org.

        Arguments:

            pkg: Name of the Python package to update versions for.

            name: This is the name of the package on readthedocs.org.  Often,
               this name is different than that of the Python package itself.
               If this value is set to ``None``, then we just use ``pkg`` as
               the name to lookup.


        Returns:

            ``True``, if the update was successful (found versions), or
            ``False``, otherwise.
        """
        self._ensure_defaults(pkg)

        name = name or pkg

        logger.debug(f"{pkg}: checking readthedocs.org for {name}...")

        versions = docurls_from_rtd(name)
        logger.debug(f"{pkg}: Found {len(versions)} doc URL(s) at readthedocs.org")

        return self._merge_versions(pkg, "readthedocs", name, versions)

    def update_versions_from_pypi(
        self, pkg: str, name: str | None, max_entries: int
    ) -> bool:
        """Replaces package documentation URLs using information from pypi.org.

        Arguments:

            pkg: Name of the package as one would find in pypi.org.  This
               name can be different than that of the Python package
               itself.

            name: This is the name of the package on pypi.org.  Sometimes, this
               name can be different than that of the Python package itself.
               If this value is set to ``None``, then we just use ``pkg`` as
               the name to lookup.

            max_entries: The maximum number of entries to lookup in PyPI.  A
               value of zero will download only the main package information
               and will hit PyPI only once.  A value bigger than zero will
               download at most the information from the last ``max_entries``
               releases.  Finally, a negative value will imply the download of
               all available releases.


        Returns:

            ``True``, if the update was successful (found versions), or
            ``False``, otherwise.
        """

        self._ensure_defaults(pkg)

        name = name or pkg

        logger.debug(f"{pkg}: checking pypi.org for {name}...")

        versions = docurls_from_pypi(name, max_entries)
        logger.debug(f"{pkg}: Found {len(versions)} doc URL(s) at pypi.org")

        return self._merge_versions(pkg, "pypi", name, versions)

    def update_versions(
        self,
        pkgs: typing.Iterable[str],
        order: typing.Iterable[str] = ("environment", "readthedocs", "pypi"),
        names: dict[str, dict[str, str | None]] | None = None,
        pypi_max_entries: int = 0,
        keep_going: bool = False,
    ) -> None:
        """Updates versions for a list of packages in this catalog.

        This method will add a list of packages defined by ``pkgs`` (list of
        names) into its own catalog.  The order of look-ups by default is set
        by the ``order``, and it is the following:

        1. Current Python environment (``environment``)
        2. readthedocs.org (``readthedocs``)
        3. PyPI (``pypi``)


        Arguments:

            pkgs: List of packages that will have their versions updated

            order: A list, containing the order in which lookup will happen.
               There are only 3 possible keys that can be used here:
               ``environment``, which stands for finding package metadata from
               the currently installed Python environment, ``readthedocs``,
               which will trigger readthedocs.org lookups, and ``pypi``, which
               will trigger pypi.org lookups from uploaded packages.

            names: A dictionary, that eventually maps source names (as in
              ``order``) to another dictionary that maps package names to
              their supposed names on readthedocs.org, pypi.org or the current
              environment.  If keys for various packages are not available, then
              their package names are used.  If the keys exist, but are set
              to ``None``, then lookup for that particular source is skipped.
              If not provided, package names are used everywhere.

            pypi_max_entries: The maximum number of entries to lookup in PyPI.
               A value of zero will download only the main package information
               and will hit PyPI only once.  A value bigger than zero will
               download at most the information from the last ``max_entries``
               releases.  Finally, a negative value will imply the download of
               all available releases.

            keep_going: By default, the method stops adding a package when a
               hit is found (in either of these sources of information).  If
               the flag ``keep_going`` is set to ``True`` (defaults to
               ``False``), then it merges information from all sources.  Note
               that some of this information may be repetitive.


        Raises:

            RuntimeError: If an entry of ``order`` is not one of the supported
                sources.
        """

        if names is None:
            names = {}

        # ``order`` is traversed once per package: materialise it so one-shot
        # iterators are also supported
        order = tuple(order)

        lookups: dict[str, typing.Callable[[str, str], bool]] = {
            "environment": self.update_versions_from_environment,
            "readthedocs": self.update_versions_from_rtd,
            "pypi": lambda pkg, name: self.update_versions_from_pypi(
                pkg, name, pypi_max_entries
            ),
        }

        for pkg in pkgs:
            for action in order:
                if action not in lookups:
                    raise RuntimeError(f"Unrecognized source: {action}")

                name = names.get(action, {}).get(pkg, pkg)
                if name is None:
                    # explicitly disabled for this package/source pair
                    continue

                if lookups[action](pkg, name) and not keep_going:
                    break

    def self_update(self) -> None:
        """Runs a self-update procedure, by re-looking up known sources."""
        sources = ("environment", "readthedocs", "pypi")

        # organises the names as expected by update_versions(): sources that
        # are not recorded for a package map to ``None``, and are skipped
        names: dict[str, dict[str, str | None]] = {src: {} for src in sources}
        for pkg, info in self.items():
            known = info.get("sources", {})
            for src in sources:
                names[src][pkg] = known.get(src)

        # iterate over a snapshot of the keys, as entries are modified
        self.update_versions(pkgs=list(self), names=names)

523 

524 

def _string2version(v: str) -> packaging.version.Version | None:
    """Parses a catalog version label into a version object.

    Every ``.x`` occurrence is removed from the label before parsing, so that
    wildcard-style labels are read as their shortened counterparts:

    * ``1.2.3`` is parsed as is
    * ``1.2.x`` is parsed like ``1.2``
    * ``1.x.x`` is parsed like ``1``
    * anything that is not a valid version after that: discarded

    Arguments:

        v: a string containing the version number to be parsed, like the ones
           in the catalog


    Returns:

        Either ``None``, or the version object with the parsed version.
    """
    candidate = v.replace(".x", "")
    try:
        parsed = packaging.version.Version(candidate)
    except packaging.version.InvalidVersion:
        return None
    return parsed

550 

551 

def _prepare_versions(versions: dict[str, str]) -> dict[str, str]:
    """Builds the version table used for structured lookups.

    The returned dictionary:

    1. Always carries a ``latest`` and a ``stable`` entry (unless the input
       is empty, in which case it is returned untouched)
    2. Lists every key that could be parsed as a version number, from the
       highest to the lowest, together with a normalised spelling of it (e.g.
       ``v2.2.0`` is also made available as ``2.2.0``, and ``1.x`` as ``1``)


    Arguments:

        versions: A dictionary that maps release version (and aliases such as
            ``stable`` or ``latest``) to URLs that contain Sphinx-generated
            documentation.


    Returns:

        A dictionary with keys that correspond to parsed versions and aliases.
    """
    if not versions:
        return versions

    # parsed version (or None, if unparseable) -> label used in ``versions``
    label_of = {_string2version(label): label for label in versions}
    ranked = sorted(parsed for parsed in label_of if parsed is not None)

    prepared: dict[str, str] = {}

    if not ranked:
        # there is either nothing, or just aliases such as stable/latest: pick
        # the first non-empty candidate for each of the two mandatory entries
        fallbacks = (
            ("latest", ("latest", "stable", "master", "main")),
            ("stable", ("stable", "latest", "master", "main")),
        )
        for alias, candidates in fallbacks:
            prepared[alias] = next(
                (versions[name] for name in candidates if versions.get(name)),
                "",
            )
        return prepared

    # there is at least 1 (valid) version number
    newest = ranked[-1]
    prepared["latest"] = versions.get("latest", versions[label_of[newest]])

    finals = [
        release
        for release in ranked
        if not release.is_prerelease and not release.is_devrelease
    ]
    steady = finals[-1] if finals else newest
    prepared["stable"] = versions.get("stable", versions[label_of[steady]])

    # fill-in the remainder of the versions, highest first
    for release in ranked[::-1]:
        label = label_of[release]
        url = versions[label]
        prepared[label] = url
        if ".x" in label:
            # copy to a shortened version number as well
            prepared[label.replace(".x", "")] = url
        elif release.public != label:
            # copy a standardised version number as well
            prepared[release.public] = url

    return prepared

624 

625 

class LookupCatalog:
    """A catalog that guarantees standardised version lookups.

    Arguments:

        catalog: The catalog to use as base for the lookup.
    """

    def __init__(self, catalog: Catalog):
        self._catalog = catalog
        self.reset()

    def reset(self) -> None:
        """Internally creates all possible aliases for package names and
        versions.

        This method will expand the catalog package names and version
        numbers so that the user can refer to these using environment,
        readthedocs.org or pypi.org names for packages, and PEP-440
        compatible strings for version names during the lookup.

        The catalog associated to this lookup is not modified in this
        process.  All augmentations are built-into the object instance.
        """
        # catalog package name -> {version name -> URL}
        self._version_map: dict[str, dict[str, str]] = {}
        # any known name of a package -> catalog package name
        self._package_map: dict[str, str] = {}
        for pkg in self._catalog.keys():
            self._version_map[pkg] = _prepare_versions(self._catalog[pkg]["versions"])

            # translations from Python, rtd.org or pypi.org names
            self._package_map[pkg] = pkg
            self._package_map.update(
                {v: pkg for v in self._catalog[pkg]["sources"].values()}
            )

    def get(self, pkg: str, version: str | None, default: typing.Any = None):
        """Accesses one single ``pkg/version`` documentation URL.

        Arguments:

            pkg: The package name, as available on the catalog or through one
                of its environment, readthedocs.org or pypi.org names.

            version: The version of the package to search for.  This must be
                either an identifier from readthedocs.org or pypi.org, or a valid
                PEP-440 version number as a string.

            default: The default value to return in case we do not find a
                match.


        Returns:

            If a match is found, returns the URL for the documentation.
            Otherwise, returns the ``default`` value.
        """
        if pkg not in self._package_map:
            return default

        # ``pkg`` may be an alias: versions are indexed by the catalog name
        canonical = self._package_map[pkg]
        return self._version_map.get(canonical, {}).get(version, default)