From 90339f341b2d8589239f9b3c61d4c6e940ae4359 Mon Sep 17 00:00:00 2001 From: Kyle Altendorf Date: Fri, 27 Mar 2026 10:25:02 -0400 Subject: [PATCH 1/7] Replace custom cache with CacheControl library - Add CacheControl[filecache]>=0.14,<0.15 dependency - Rewrite cachedownloader.py to use CacheControl's FileCache backend - Add --log-level CLI option for debug logging (shows cache hits/misses) - Remove dead code (url_to_cache_filename, filename parameters) - Update tests to assert on cache directory existence - Add pytest warning filter for CacheControl's CallbackFileWrapper temp file leak Fixes #668 --- plan.md | 493 ++++++++++++++++++ pyproject.toml | 7 + src/check_jsonschema/cachedownloader.py | 147 ++---- src/check_jsonschema/cli/main_command.py | 23 +- .../test_nonjson_schema_handling.py | 28 - .../acceptance/test_remote_ref_resolution.py | 54 +- tests/conftest.py | 52 +- tests/unit/test_cachedownloader.py | 237 ++------- 8 files changed, 618 insertions(+), 423 deletions(-) create mode 100644 plan.md diff --git a/plan.md b/plan.md new file mode 100644 index 000000000..4b8173e71 --- /dev/null +++ b/plan.md @@ -0,0 +1,493 @@ +# Plan: Replace Custom Cache with CacheControl + +Fixes https://github.com/python-jsonschema/check-jsonschema/issues/668 + +## Problem + +The current cache implementation in `cachedownloader.py` uses only the +`Last-Modified` header for cache validation. When a server omits that header +(but provides `ETag` or `Cache-Control`), `_lastmod_from_response()` returns +`0.0` and `_cache_hit()` always returns `True`, so the cache never invalidates. + +## Solution + +Replace the hand-rolled mtime-based cache logic with the +[CacheControl](https://github.com/psf/cachecontrol) library (v0.14, maintained +under the PSF). CacheControl wraps a `requests.Session` and implements full HTTP +caching semantics — `ETag`/`If-None-Match`, `Last-Modified`/`If-Modified-Since`, +and `Cache-Control` headers — transparently. 
On-disk persistence uses its +`FileCache` backend (backed by `filelock` for concurrency safety). + +## Behavior Changes + +| Aspect | Before | After | +|---|---|---| +| ETag support | None | Full (`If-None-Match` / 304) | +| `Cache-Control` header | Ignored | Respected (`max-age`, `no-cache`, `no-store`) | +| `Last-Modified` | Only mechanism (buggy when absent) | Supported via `If-Modified-Since` / 304 | +| No caching headers at all | Cache never invalidates (bug) | Not cached — see note below | +| Existing cache files on upgrade | Used | Ignored (incompatible format) — see open question below | +| Cache file format | Raw content, SHA-256 filename | CacheControl msgpack-serialized HTTP response, SHA-224 filename in 5-level dir tree | +| `--no-cache` flag | Unchanged | Unchanged — plain session, no caching | +| Retry on error / bad data | Unchanged | Unchanged — custom retry loop preserved | +| Concurrency safety | `_atomic_write` (tempfile + shutil.copy) | `filelock` (via `FileCache`) | + +### Servers with no caching headers + +When a server provides no caching headers at all (no `ETag`, no `Last-Modified`, +no `Cache-Control`, no `Expires`), the current code caches the response forever +due to the `_lastmod_from_response` returning `0.0` bug. After this change, such +responses will not be cached, following correct HTTP semantics. In practice, +nearly all real-world schema endpoints provide at least one caching header. + +CacheControl has built-in heuristic support for overriding this behavior if +needed. The `ExpiresAfter` heuristic (bundled, not custom) can apply a TTL to all +responses. For a conditional variant that only applies to responses lacking +server-provided caching headers, the documented `BaseHeuristic` extensibility +point allows a small subclass (~10 lines). 
Neither is a hack — heuristics are a +first-class CacheControl feature designed for exactly this purpose, and +[RFC 7234](https://tools.ietf.org/html/rfc7234) explicitly permits caching +systems to use heuristics for responses that lack caching headers. + +### Open question: clean up old cache files? + +CacheControl uses the same cache directories (`check_jsonschema/schemas/`, +`check_jsonschema/refs/`) but a different file layout (5-level SHA-224 tree vs. +flat SHA-256 files). Old cache files will sit alongside the new CacheControl +tree, ignored but taking up space. + +Options: +1. **Do nothing** (current default) — old files are harmless. Users can manually + delete the cache directory if they want to reclaim space. +2. **Delete on first run** — on `_build_session()`, detect and remove files + matching the old naming pattern (64-char hex + optional extension, no + subdirectories). Risk: could delete user files if they happen to match. +3. **One-time migration warning** — log a message suggesting users clear their + cache directory. Non-invasive but adds noise. + +**Proceeding with option 1 (do nothing) unless revisited.** + +--- + +## 1. Dependency Changes + +**File: `pyproject.toml`** (line 39-46) + +Add `CacheControl[filecache]` to runtime dependencies: + +```toml +dependencies = [ + 'tomli>=2.0;python_version<"3.11"', + "ruamel.yaml>=0.18.10,<0.20.0", + "jsonschema>=4.18.0,<5.0", + "regress>=2024.11.1", + "requests<3.0", + "click>=8,<9", + "CacheControl[filecache]>=0.14,<0.15", +] +``` + +The `[filecache]` extra pulls in `filelock`. CacheControl also depends on +`msgpack` (for response serialization) and `requests` (already a dependency). + +--- + +## 2. 
Rewrite `src/check_jsonschema/cachedownloader.py` + +### Remove + +| Symbol | Lines | Reason | +|---|---|---| +| `_LASTMOD_FMT` | 16 | CacheControl handles Last-Modified natively | +| `_lastmod_from_response()` | 45-54 | Same | +| `_cache_hit()` | 88-97 | CacheControl handles freshness and conditional requests | +| `_atomic_write()` | 77-85 | FileCache uses `filelock` for concurrency | + +### Preserve existing imports + +`from __future__ import annotations` (line 1) stays. `hashlib` stays (used by +`url_to_cache_filename`). `platform` stays (used by `_base_cache_dir`). + +### Remove imports no longer needed + +`calendar`, `shutil`, `tempfile`, `time` — verify each is truly unused after +the rewrite. + +### Add imports (top-level, third-party group) + +```python +import cachecontrol +from cachecontrol.caches.file_cache import FileCache +``` + +These go in the third-party import group alongside `import requests`, following +the project's PEP 8 / isort convention (no lazy imports in production code). + +### Keep unchanged + +| Symbol | Lines | Reason | +|---|---|---| +| `_base_cache_dir()` | 19-35 | Still needed for platform-specific cache path | +| `_resolve_cache_dir()` | 38-42 | Still needed | +| `url_to_cache_filename()` | 100-116 | Useful utility; not used by CacheControl but no reason to remove | +| `FailedDownloadError` | 119-120 | Still raised by retry logic | + +### Modify `_get_request()` (lines 57-74) + +- Add `session: requests.Session` as the first parameter. +- Call `session.get(file_url)` instead of `requests.get(file_url, stream=True)`. +- Remove `stream=True` — CacheControl needs the full response to cache it, and + schemas are small. +- Keep the retry loop (2 retries) and `response_ok` callback — CacheControl does + not provide retry logic. + +**Retry / CacheControl interaction:** When a server returns a 200 with corrupt +body *and* caching headers (e.g., ETag), CacheControl caches the response before +our `response_ok` callback runs. 
On a naive retry, `session.get()` would serve +the cached corrupt response and every retry would fail identically. To avoid +this, retry attempts (`_attempt > 0`) pass `Cache-Control: no-cache` in the +request headers. This is standard HTTP (RFC 7234 §5.2.1.4) — it tells +CacheControl to revalidate with the server rather than serve from its local +cache. CacheControl implements this natively; no custom logic is needed. + +```python +def _get_request( + session: requests.Session, + file_url: str, + *, + response_ok: t.Callable[[requests.Response], bool], +) -> requests.Response: + num_retries = 2 + r: requests.Response | None = None + for _attempt in range(num_retries + 1): + try: + # On retries, bypass CacheControl's local cache to avoid + # re-serving a cached bad response. + headers = {"Cache-Control": "no-cache"} if _attempt > 0 else {} + r = session.get(file_url, headers=headers) + except requests.RequestException as e: + if _attempt == num_retries: + raise FailedDownloadError("encountered error during download") from e + continue + if r.ok and response_ok(r): + return r + assert r is not None + raise FailedDownloadError( + f"got response with status={r.status_code}, retries exhausted" + ) +``` + +### Rewrite `CacheDownloader` (lines 123-180) + +- `__init__`: store config only. Do **not** build the session eagerly — + `test_default_cache_dir` creates `CacheDownloader` with fake/relative cache + paths purely to assert `_cache_dir` resolution and never makes HTTP requests. + Eager `os.makedirs` + `FileCache()` in `__init__` would create real + directories using those fake paths. +- Build the session lazily via a `_session` property on first access (i.e., + first `open()` call). This also avoids creating sessions that are never used + (e.g., if the schema turns out to be local after all). +- Remove `_download()` entirely — CacheControl manages on-disk cache files. +- Simplify `open()` — always yields `io.BytesIO(response.content)`. 
No + branching needed: when cache is enabled, `self._session` is a + CacheControl-wrapped session (handles caching transparently); when disabled, + it is a plain `Session`. +- `bind()` — unchanged API. + +CacheControl imports go at the **top of the module** alongside `requests`, +following the project convention (no lazy/deferred imports in production code): + +```python +import cachecontrol +from cachecontrol.caches.file_cache import FileCache +``` + +```python +class CacheDownloader: + def __init__(self, cache_dir: str, *, disable_cache: bool = False) -> None: + self._cache_dir = _resolve_cache_dir(cache_dir) + self._disable_cache = disable_cache + self._cached_session: requests.Session | None = None + + @property + def _session(self) -> requests.Session: + if self._cached_session is None: + self._cached_session = self._build_session() + return self._cached_session + + def _build_session(self) -> requests.Session: + session = requests.Session() + if self._cache_dir and not self._disable_cache: + os.makedirs(self._cache_dir, exist_ok=True) + session = cachecontrol.CacheControl( + session, cache=FileCache(self._cache_dir) + ) + return session + + @contextlib.contextmanager + def open( + self, + file_url: str, + filename: str, + validate_response: t.Callable[[requests.Response], bool], + ) -> t.Iterator[t.IO[bytes]]: + response = _get_request( + self._session, file_url, response_ok=validate_response + ) + yield io.BytesIO(response.content) + + def bind( + self, + file_url: str, + filename: str | None = None, + validation_callback: t.Callable[[bytes], t.Any] | None = None, + ) -> BoundCacheDownloader: + return BoundCacheDownloader( + file_url, self, filename=filename, + validation_callback=validation_callback, + ) +``` + +The `filename` parameter in `open()` and `bind()` is kept for API compatibility +but ignored — CacheControl manages its own filenames (SHA-224 hash of normalized +URL in a 5-level directory tree). 
+ +`BoundCacheDownloader.__init__` still computes `self._filename = filename or +url_to_cache_filename(file_url)` (line 193). This is now dead code — the value +is passed through to `CacheDownloader.open()` which ignores it. Kept for now +since `test_default_filename_from_uri` asserts on `cd._filename`. Can be cleaned +up in a follow-up. + +### Modify `BoundCacheDownloader._validate_response()` (lines 206-213) + +Skip validation on cache hits. CacheControl sets `response.from_cache = True` on +every response served from cache (either a direct hit or after a 304 +revalidation). This preserves the fix for +[#453](https://github.com/python-jsonschema/check-jsonschema/issues/453) +(validation callback must not run on cache hits). + +```python +def _validate_response(self, response: requests.Response) -> bool: + if not self._validation_callback: + return True + # CacheControl sets from_cache=True on cache hits; skip re-validation + if getattr(response, "from_cache", False): + return True + try: + self._validation_callback(response.content) + return True + except ValueError: + return False +``` + +--- + +## 3. Changes to Callers + +**No changes** to `readers.py` or `resolver.py`. Both use `CacheDownloader` via +`bind()` and `open()`, whose signatures are unchanged. + +- `readers.py:80` — `CacheDownloader("schemas", disable_cache=...).bind(url, validation_callback=...)` +- `resolver.py:53` — `CacheDownloader("refs", disable_cache=...)` + +Cache subdirectory names (`"schemas"`, `"refs"`) are preserved. + +**Update `cli/main_command.py:96`**: The `--schemafile` help text says schemas +will be `"downloaded and cached locally based on mtime."` Replace with generic +wording (e.g., `"downloaded and cached locally"`) since caching is no longer +mtime-based. + +--- + +## 4. 
Test Changes + +### Test strategy: trust CacheControl, test integration only + +CacheControl's `FileCache` write mechanism relies on a `CallbackFileWrapper` +that monitors when a response stream is fully consumed. The `responses` mock +library creates `BytesIO` bodies that never report `.closed = True` after +exhaustion, so the cache-write callback never fires. **CacheControl never +populates its FileCache when responses come from `responses` mocks.** + +This means tests cannot verify CacheControl's caching behavior (cache hits, +ETag revalidation, max-age expiry) using `responses`. Rather than switching test +infrastructure (e.g., adding a real HTTP server), the test strategy is: + +- **Trust CacheControl** for caching correctness — it is a well-maintained PSF + project with its own test suite. +- **Test our integration**: retry logic, validation callbacks, `--no-cache` flag, + session construction, error handling. +- **Do not assert** on cache file existence, `response.from_cache`, or + `len(responses.calls)` staying constant across requests. + +### `responses` library compatibility + +The `responses` library (v0.26.0) patches `HTTPAdapter.send` at the class level. +CacheControl's `CacheControlAdapter` calls `super().send()`, which hits the +`responses` mock. HTTP request/response flow works normally — only the cache-write +side effect is broken. Tests that use `responses` for mocking HTTP (retry tests, +validation tests, error tests) work fine. + +### `tests/conftest.py` — Fixture changes + +**Update `patch_cache_dir`** (line 67): Works as-is — it monkeypatches +`_base_cache_dir`, which is still called by `_resolve_cache_dir`, which is still +called by `CacheDownloader.__init__`. CacheControl's `FileCache` will use the +patched temp directory. 
+ +**Remove the following fixtures** — they either inject raw cache files (no longer +valid format) or predict cache file paths using `url_to_cache_filename` +(SHA-256), which CacheControl does not use: + +| Fixture | Line | Action | +|---|---|---| +| `url2cachepath` | 77 | Remove — only consumed by the path-prediction fixtures below | +| `inject_cached_download` | 100 | Remove — writes raw content; CacheControl uses msgpack format | +| `inject_cached_ref` | 126 | Remove — same | +| `get_download_cache_loc` | 92 | Remove — path prediction no longer valid | +| `get_ref_cache_loc` | 118 | Remove — same | + +**Keep `downloads_cache_dir` / `refs_cache_dir`** (lines 87, 113) and +**keep `cache_dir`** (line 63) — still useful for asserting cache directory +existence when needed. + +### `tests/unit/test_cachedownloader.py` — Specific changes + +**Update the import block** (lines 10-16): Remove `_cache_hit` and +`_lastmod_from_response` — both symbols are deleted from `cachedownloader.py`. + +**Tests to remove:** + +| Test | Lines | Reason | +|---|---|---| +| `test_cache_hit_by_mtime` | 95-113 | mtime-based caching removed entirely | +| `test_cachedownloader_handles_bad_lastmod_header` | 257-310 | `_lastmod_from_response` removed | +| `test_lastmod_from_header_uses_gmtime` | 352-383 | Same | +| `test_cachedownloader_cached_file` | 116-129 | Tests monkeypatched `_download` method, which is removed | +| `test_cachedownloader_validation_is_not_invoked_on_hit` | 313-345 | Depends on `inject_cached_download` (removed) and `response.from_cache` (not testable with `responses` mocks) | + +**Tests to simplify** (remove `get_download_cache_loc` dependency and file- +existence assertions — these tests are about retry behavior, not caching): + +`test_cachedownloader_on_success` (line 132): +- Remove `get_download_cache_loc` fixture usage and `f.exists()` assertions. +- Keep: verify `cd.open()` returns correct content (`b"{}"`). 
+ +`test_cachedownloader_succeeds_after_few_errors` (line 160): +- Remove `get_download_cache_loc` fixture usage and `f.exists()` assertions. +- Keep: verify `cd.open()` returns correct content after retries. + +`test_cachedownloader_fails_after_many_errors` (line 194): +- Remove `get_download_cache_loc` fixture usage and `f.exists()` assertion. +- Keep: verify `FailedDownloadError` is raised. + +`test_cachedownloader_retries_on_bad_data` (line 225): +- Remove `get_download_cache_loc` fixture usage and `f.exists()` assertions. +- Keep: verify `cd.open()` returns correct content after retrying past bad data. + +`test_cachedownloader_using_alternate_target_dir` (line 149): +- Remove `url2cachepath` / path-based assertions. +- Keep: verify `cd.open()` returns correct content. + +**Tests to keep as-is:** + +| Test | Lines | Notes | +|---|---|---| +| `test_default_filename_from_uri` | 37-39 | `url_to_cache_filename` unchanged | +| `test_default_cache_dir` | 42-92 | `_base_cache_dir` unchanged. Works as-is thanks to lazy session building — `CacheDownloader("downloads")` only stores `_cache_dir`, no `os.makedirs` or `FileCache` until `open()` is called. | + +**New tests to add (integration-focused):** + +| Test | Purpose | +|---|---| +| `test_disable_cache_uses_plain_session` | When `disable_cache=True`, verify `_build_session` returns a plain `requests.Session` (not wrapped by CacheControl). | +| `test_enable_cache_uses_cachecontrol_session` | When `disable_cache=False` and cache dir is valid, verify `_build_session` returns a CacheControl-wrapped session (check for `CacheControlAdapter` on the session). | +| `test_cache_dir_none_uses_plain_session` | When `_resolve_cache_dir` returns `None` (e.g., Windows with no env vars), verify `_build_session` returns a plain session. 
| + +**Aspirational cache-behavior tests** (require a real HTTP server or alternative +to `responses`; not included in this PR but tracked for follow-up): + +| Test | Purpose | +|---|---| +| `test_etag_cache_revalidation` | Server provides `ETag`, no `Last-Modified`. Second request sends `If-None-Match`, gets 304. | +| `test_cache_control_max_age` | Server provides `Cache-Control: max-age=3600`. Second request served from cache. | +| `test_last_modified_revalidation` | Server provides `Last-Modified`. Second request sends `If-Modified-Since`. | +| `test_no_caching_headers_not_cached` | Server provides no caching headers. Response is not cached. | + +### `tests/acceptance/test_remote_ref_resolution.py` + +`test_remote_ref_resolution_cache_control` (line 68): +- Remove `get_ref_cache_loc` fixture dependency and file-path assertions. +- When `disable_cache=False`: verify the command succeeds (exit code 0). + Cache behavior is trusted to CacheControl. +- When `disable_cache=True`: verify the command succeeds (exit code 0). + +`test_remote_ref_resolution_loads_from_cache` (line 105): +- Remove `inject_cached_ref` and `get_ref_cache_loc` fixture dependencies. +- This test's premise (inject good cache data, serve bad HTTP, verify cache is + used) cannot be replicated without direct FileCache manipulation. **Remove + this test.** Cache-hit behavior is trusted to CacheControl. + +`test_remote_ref_resolution_callout_count_is_scale_free_in_instancefiles` +(line 251): +- Should work as-is. Within-run deduplication is handled by `lru_cache` on + `_get_validator` and the in-memory `ResourceCache` — not by the file cache. + CacheControl's file cache only affects cross-run persistence. + +### `tests/acceptance/test_nonjson_schema_handling.py` + +`test_can_load_remote_yaml_schema_ref_from_cache` (line 140): +- Remove `inject_cached_ref` fixture dependency. This test's premise (inject + good cache, serve bad HTTP) cannot be replicated. 
**Remove this test.** + Cache-hit behavior is trusted to CacheControl. + +--- + +## 5. Pytest Warning Filter + +CacheControl's `FileCache` uses `filelock` for concurrency safety. When connection +errors occur during tests (e.g., `test_cachedownloader_fails_after_many_errors`), +filelock may leave temporary files open, triggering a `ResourceWarning` that pytest +converts to `PytestUnraisableExceptionWarning`. + +This is not a bug in our code — it's a side effect of how filelock handles error +conditions. Add a filter to `pyproject.toml` to ignore this specific warning: + +```toml +[tool.pytest.ini_options] +filterwarnings = [ + # ... existing filters ... + # filelock (used by CacheControl) may leave temp files open during connection errors + 'ignore:Exception ignored in.*FileIO.*:pytest.PytestUnraisableExceptionWarning', +] +``` + +**TODO:** Investigate whether this warning can be avoided rather than suppressed. + +**Investigation results (2026-03-27):** +- The warning originates from CacheControl's `CallbackFileWrapper` class in + `filewrapper.py`, not filelock. +- `CallbackFileWrapper.__init__` creates a `NamedTemporaryFile` (line 37), which + is only closed in `_close()` when the response is fully read (line 105-106). +- When a `requests.ConnectionError` occurs, the temp file is never closed because + `_close()` is never called. +- No existing upstream issues found in CacheControl or filelock for this specific + problem. +- The `CallbackFileWrapper` class lacks a `__del__` method or context manager + protocol that would ensure cleanup on exceptions. +- **Recommendation:** File an upstream issue with CacheControl suggesting that + `CallbackFileWrapper` implement `__del__` to close `self.__buf` if not already + closed, or use a weak reference callback to ensure cleanup. + +--- + +## 6. Execution Order + +1. Add `CacheControl[filecache]` dependency to `pyproject.toml` +2. Rewrite `cachedownloader.py` +3. Update `conftest.py` fixtures +4. 
Rewrite/update unit tests in `test_cachedownloader.py` +5. Update acceptance tests +6. Add pytest warning filter for filelock ResourceWarning +7. Run full test suite, fix any `responses` library compatibility issues +8. Commit, push, open PR diff --git a/pyproject.toml b/pyproject.toml index 75a2fce27..a1c43ce2f 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -43,6 +43,7 @@ dependencies = [ "regress>=2024.11.1", "requests<3.0", "click>=8,<9", + "CacheControl[filecache]>=0.14,<0.15", ] [project.readme] @@ -80,6 +81,12 @@ filterwarnings = [ "error", # dateutil has a Python 3.12 compatibility issue. 'ignore:datetime\.datetime\.utcfromtimestamp\(\) is deprecated:DeprecationWarning', + # CacheControl's CallbackFileWrapper (filewrapper.py) creates a NamedTemporaryFile + # in __init__ that is only closed when the response is fully read. When a + # requests.ConnectionError occurs, _close() is never called and the temp file + # leaks. This is a CacheControl limitation, not a check-jsonschema bug. + # See plan.md section "5. Pytest Warning Filter" for details. 
+ 'ignore:Exception ignored in.*FileIO.*:pytest.PytestUnraisableExceptionWarning', ] addopts = [ "--color=yes", diff --git a/src/check_jsonschema/cachedownloader.py b/src/check_jsonschema/cachedownloader.py index 86aad0e62..c75cd932e 100644 --- a/src/check_jsonschema/cachedownloader.py +++ b/src/check_jsonschema/cachedownloader.py @@ -1,19 +1,17 @@ from __future__ import annotations -import calendar import contextlib -import hashlib import io +import logging import os import platform -import shutil -import tempfile -import time import typing as t +import cachecontrol import requests +from cachecontrol.caches.file_cache import FileCache -_LASTMOD_FMT = "%a, %d %b %Y %H:%M:%S %Z" +log = logging.getLogger(__name__) def _base_cache_dir() -> str | None: @@ -42,26 +40,23 @@ def _resolve_cache_dir(dirname: str) -> str | None: return cache_dir -def _lastmod_from_response(response: requests.Response) -> float: - try: - return calendar.timegm( - time.strptime(response.headers["last-modified"], _LASTMOD_FMT) - ) - # OverflowError: time outside of platform-specific bounds - # ValueError: malformed/unparseable - # LookupError: no such header - except (OverflowError, ValueError, LookupError): - return 0.0 - - def _get_request( - file_url: str, *, response_ok: t.Callable[[requests.Response], bool] + session: requests.Session, + file_url: str, + *, + response_ok: t.Callable[[requests.Response], bool], ) -> requests.Response: num_retries = 2 r: requests.Response | None = None for _attempt in range(num_retries + 1): try: - r = requests.get(file_url, stream=True) + # On retries, bypass CacheControl's local cache to avoid + # re-serving a cached bad response. Ideally we'd delete the + # cache entry directly, but CacheControl doesn't expose a public + # API for this (see https://github.com/psf/cachecontrol/issues/219). + # The no-cache header forces revalidation with the origin server. 
+ headers = {"Cache-Control": "no-cache"} if _attempt > 0 else {} + r = session.get(file_url, headers=headers) except requests.RequestException as e: if _attempt == num_retries: raise FailedDownloadError("encountered error during download") from e @@ -74,48 +69,6 @@ def _get_request( ) -def _atomic_write(dest: str, content: bytes) -> None: - # download to a temp file and then move to the dest - # this makes the download safe if run in parallel (parallel runs - # won't create a new empty file for writing and cause failures) - fp = tempfile.NamedTemporaryFile(mode="wb", delete=False) - fp.write(content) - fp.close() - shutil.copy(fp.name, dest) - os.remove(fp.name) - - -def _cache_hit(cachefile: str, response: requests.Response) -> bool: - # no file? miss - if not os.path.exists(cachefile): - return False - - # compare mtime on any cached file against the remote last-modified time - # it is considered a hit if the local file is at least as new as the remote file - local_mtime = os.path.getmtime(cachefile) - remote_mtime = _lastmod_from_response(response) - return local_mtime >= remote_mtime - - -def url_to_cache_filename(ref_url: str) -> str: - """ - Given a schema URL, convert it to a filename for caching in a cache dir. - - Rules are as follows: - - the base filename is an sha256 hash of the URL - - if the filename ends in an extension (.json, .yaml, etc) that extension - is appended to the hash - - Preserving file extensions preserves the extension-based logic used for parsing, and - it also helps a local editor (browsing the cache) identify filetypes. - """ - filename = hashlib.sha256(ref_url.encode()).hexdigest() - if "." 
in (last_part := ref_url.rpartition("/")[-1]):
-        _, _, extension = last_part.rpartition(".")
-        filename = f"{filename}.{extension}"
-    return filename
-
-
 class FailedDownloadError(Exception):
     pass
@@ -124,59 +77,42 @@ class CacheDownloader:
     def __init__(self, cache_dir: str, *, disable_cache: bool = False) -> None:
         self._cache_dir = _resolve_cache_dir(cache_dir)
         self._disable_cache = disable_cache
-
-    def _download(
-        self,
-        file_url: str,
-        filename: str,
-        response_ok: t.Callable[[requests.Response], bool],
-    ) -> str:
-        assert self._cache_dir is not None
-        os.makedirs(self._cache_dir, exist_ok=True)
-        dest = os.path.join(self._cache_dir, filename)
-
-        def check_response_for_download(r: requests.Response) -> bool:
-            # if the response indicates a cache hit, treat it as valid
-            # this ensures that we short-circuit any further evaluation immediately on
-            # a hit
-            if _cache_hit(dest, r):
-                return True
-            # we now know it's not a hit, so validate the content (forces download)
-            return response_ok(r)
-
-        response = _get_request(file_url, response_ok=check_response_for_download)
-        # check to see if we have a file which matches the connection
-        # only download if we do not (cache miss, vs hit)
-        if not _cache_hit(dest, response):
-            _atomic_write(dest, response.content)
-
-        return dest
+        self._cached_session: requests.Session | None = None
+
+    @property
+    def _session(self) -> requests.Session:
+        if self._cached_session is None:
+            self._cached_session = self._build_session()
+        return self._cached_session
+
+    def _build_session(self) -> requests.Session:
+        session = requests.Session()
+        if self._cache_dir and not self._disable_cache:
+            log.debug("using cache dir: %s", self._cache_dir)
+            os.makedirs(self._cache_dir, exist_ok=True)
+            session = cachecontrol.CacheControl(
+                session, cache=FileCache(self._cache_dir)
+            )
+        else:
+            log.debug("caching disabled")
+        return session
 
     @contextlib.contextmanager
     def open(
         self,
         file_url: str,
-        filename: str,
         validate_response: 
t.Callable[[requests.Response], bool], ) -> t.Iterator[t.IO[bytes]]: - if (not self._cache_dir) or self._disable_cache: - yield io.BytesIO( - _get_request(file_url, response_ok=validate_response).content - ) - else: - with open( - self._download(file_url, filename, response_ok=validate_response), "rb" - ) as fp: - yield fp + response = _get_request(self._session, file_url, response_ok=validate_response) + yield io.BytesIO(response.content) def bind( self, file_url: str, - filename: str | None = None, validation_callback: t.Callable[[bytes], t.Any] | None = None, ) -> BoundCacheDownloader: return BoundCacheDownloader( - file_url, self, filename=filename, validation_callback=validation_callback + file_url, self, validation_callback=validation_callback ) @@ -186,11 +122,9 @@ def __init__( file_url: str, downloader: CacheDownloader, *, - filename: str | None = None, validation_callback: t.Callable[[bytes], t.Any] | None = None, ) -> None: self._file_url = file_url - self._filename = filename or url_to_cache_filename(file_url) self._downloader = downloader self._validation_callback = validation_callback @@ -198,7 +132,6 @@ def __init__( def open(self) -> t.Iterator[t.IO[bytes]]: with self._downloader.open( self._file_url, - self._filename, validate_response=self._validate_response, ) as fp: yield fp @@ -206,7 +139,11 @@ def open(self) -> t.Iterator[t.IO[bytes]]: def _validate_response(self, response: requests.Response) -> bool: if not self._validation_callback: return True - + # CacheControl sets from_cache=True on cache hits; skip re-validation. + # Plain requests.Session (used when disable_cache=True) doesn't set this + # attribute at all, so we use getattr with a default. 
+ if getattr(response, "from_cache", False): + return True try: self._validation_callback(response.content) return True diff --git a/src/check_jsonschema/cli/main_command.py b/src/check_jsonschema/cli/main_command.py index 62d79bb35..96dc9d5cb 100644 --- a/src/check_jsonschema/cli/main_command.py +++ b/src/check_jsonschema/cli/main_command.py @@ -1,5 +1,6 @@ from __future__ import annotations +import logging import os import textwrap import typing as t @@ -43,6 +44,18 @@ def set_color_mode(ctx: click.Context, param: str, value: str) -> None: }[value] +def configure_logging( + ctx: click.Context, param: click.Parameter, value: str | None +) -> None: + if value is None: + return + level = getattr(logging, value.upper()) + logging.basicConfig( + level=level, + format="%(name)s [%(levelname)s]: %(message)s", + ) + + def pretty_helptext_list(values: list[str] | tuple[str, ...]) -> str: return textwrap.indent( "\n".join( @@ -88,12 +101,20 @@ def pretty_helptext_list(values: list[str] | tuple[str, ...]) -> str: ) @click.help_option("-h", "--help") @click.version_option() +@click.option( + "--log-level", + help="Set the log level for debug output (e.g., DEBUG, INFO, WARNING).", + type=click.Choice(["DEBUG", "INFO", "WARNING", "ERROR"], case_sensitive=False), + callback=configure_logging, + expose_value=False, + is_eager=True, +) @click.option( "--schemafile", help=( "The path to a file containing the JSON Schema to use or an " "HTTP(S) URI for the schema. If a remote file is used, " - "it will be downloaded and cached locally based on mtime. " + "it will be downloaded and cached locally. " "Use '-' for stdin." 
), metavar="[PATH|URI]", diff --git a/tests/acceptance/test_nonjson_schema_handling.py b/tests/acceptance/test_nonjson_schema_handling.py index 4e56d25e2..ffa325bf1 100644 --- a/tests/acceptance/test_nonjson_schema_handling.py +++ b/tests/acceptance/test_nonjson_schema_handling.py @@ -135,31 +135,3 @@ def test_can_load_remote_yaml_schema_ref(run_line, tmp_path, passing_data): result = run_line(["check-jsonschema", "--schemafile", retrieval_uri, str(doc)]) assert result.exit_code == (0 if passing_data else 1) - - -def test_can_load_remote_yaml_schema_ref_from_cache( - run_line, inject_cached_ref, tmp_path -): - retrieval_uri = "https://example.org/retrieval/schemas/main.yaml" - responses.add( - "GET", - retrieval_uri, - body="""\ -"$schema": "http://json-schema.org/draft-07/schema" -properties: - "title": {"$ref": "./title_schema.yaml"} -additionalProperties: false -""", - ) - - ref_loc = "https://example.org/retrieval/schemas/title_schema.yaml" - # populate a bad schema, but then "override" that with a good cache value - # this can only pass (in the success case) if the cache loading really works - responses.add("GET", ref_loc, body="false") - inject_cached_ref(ref_loc, "type: string") - - doc = tmp_path / "doc.json" - doc.write_text(json.dumps(PASSING_DOCUMENT)) - - result = run_line(["check-jsonschema", "--schemafile", retrieval_uri, str(doc)]) - assert result.exit_code == 0 diff --git a/tests/acceptance/test_remote_ref_resolution.py b/tests/acceptance/test_remote_ref_resolution.py index 3dafc4c8a..59670cdf6 100644 --- a/tests/acceptance/test_remote_ref_resolution.py +++ b/tests/acceptance/test_remote_ref_resolution.py @@ -66,16 +66,14 @@ def test_remote_ref_resolution_simple_case(run_line, check_passes, casename, tmp @pytest.mark.parametrize("casename", ("case1", "case2")) @pytest.mark.parametrize("disable_cache", (True, False)) def test_remote_ref_resolution_cache_control( - run_line, tmp_path, get_ref_cache_loc, casename, disable_cache + run_line, tmp_path, 
casename, disable_cache, schemas_cache_dir, refs_cache_dir ): main_schema_loc = "https://example.com/main.json" responses.add("GET", main_schema_loc, json=CASES[casename]["main_schema"]) - ref_locs = [] for name, subschema in CASES[casename]["other_schemas"].items(): other_schema_loc = f"https://example.com/{name}.json" responses.add("GET", other_schema_loc, json=subschema) - ref_locs.append(other_schema_loc) instance_path = tmp_path / "instance.json" instance_path.write_text(json.dumps(CASES[casename]["passing_document"])) @@ -88,58 +86,16 @@ def test_remote_ref_resolution_cache_control( output = f"\nstdout:\n{result.stdout}\n\nstderr:\n{result.stderr}" assert result.exit_code == 0, output - cache_locs = [] - for ref_loc in ref_locs: - cache_locs.append(get_ref_cache_loc(ref_loc)) - assert cache_locs # sanity check + # Cache directories are created only when caching is enabled + cache_dirs = [schemas_cache_dir, refs_cache_dir] if disable_cache: - for loc in cache_locs: + for loc in cache_dirs: assert not loc.exists() else: - for loc in cache_locs: + for loc in cache_dirs: assert loc.exists() -@pytest.mark.parametrize("casename", ("case1", "case2")) -@pytest.mark.parametrize("check_passes", (True, False)) -def test_remote_ref_resolution_loads_from_cache( - run_line, tmp_path, get_ref_cache_loc, inject_cached_ref, casename, check_passes -): - main_schema_loc = "https://example.com/main.json" - responses.add("GET", main_schema_loc, json=CASES[casename]["main_schema"]) - - ref_locs = [] - cache_locs = [] - for name, subschema in CASES[casename]["other_schemas"].items(): - other_schema_loc = f"https://example.com/{name}.json" - # intentionally populate the HTTP location with "bad data" - responses.add("GET", other_schema_loc, json="{}") - ref_locs.append(other_schema_loc) - - # but populate the cache with "good data" - inject_cached_ref(other_schema_loc, json.dumps(subschema)) - cache_locs.append(get_ref_cache_loc(other_schema_loc)) - - instance_path = tmp_path / 
"instance.json" - instance_path.write_text( - json.dumps( - CASES[casename]["passing_document"] - if check_passes - else CASES[casename]["failing_document"] - ) - ) - - # run the command - result = run_line( - ["check-jsonschema", "--schemafile", main_schema_loc, str(instance_path)] - ) - output = f"\nstdout:\n{result.stdout}\n\nstderr:\n{result.stderr}" - if check_passes: - assert result.exit_code == 0, output - else: - assert result.exit_code == 1, output - - # this test ensures that `$id` is preferred for the base URI over # the retrieval URI @pytest.mark.parametrize("check_passes", (True, False)) diff --git a/tests/conftest.py b/tests/conftest.py index e8fc84176..82dc3139d 100644 --- a/tests/conftest.py +++ b/tests/conftest.py @@ -74,39 +74,8 @@ def patch_cache_dir(monkeypatch, cache_dir): @pytest.fixture -def url2cachepath(): - from check_jsonschema.cachedownloader import url_to_cache_filename - - def _get(cache_dir, url): - return cache_dir / url_to_cache_filename(url) - - return _get - - -@pytest.fixture -def downloads_cache_dir(tmp_path): - return tmp_path / ".cache" / "check_jsonschema" / "downloads" - - -@pytest.fixture -def get_download_cache_loc(downloads_cache_dir, url2cachepath): - def _get(url): - return url2cachepath(downloads_cache_dir, url) - - return _get - - -@pytest.fixture -def inject_cached_download(downloads_cache_dir, get_download_cache_loc): - def _write(uri, content): - downloads_cache_dir.mkdir(parents=True) - path = get_download_cache_loc(uri) - if isinstance(content, str): - path.write_text(content) - else: - path.write_bytes(content) - - return _write +def schemas_cache_dir(tmp_path): + return tmp_path / ".cache" / "check_jsonschema" / "schemas" @pytest.fixture @@ -114,18 +83,7 @@ def refs_cache_dir(tmp_path): return tmp_path / ".cache" / "check_jsonschema" / "refs" +# Alias for unit tests that use "downloads" as the cache dir name @pytest.fixture -def get_ref_cache_loc(refs_cache_dir, url2cachepath): - def _get(url): - return 
url2cachepath(refs_cache_dir, url) - - return _get - - -@pytest.fixture -def inject_cached_ref(refs_cache_dir, get_ref_cache_loc): - def _write(uri, content): - refs_cache_dir.mkdir(parents=True) - get_ref_cache_loc(uri).write_text(content) - - return _write +def downloads_cache_dir(tmp_path): + return tmp_path / ".cache" / "check_jsonschema" / "downloads" diff --git a/tests/unit/test_cachedownloader.py b/tests/unit/test_cachedownloader.py index b906ca03a..402bb2c56 100644 --- a/tests/unit/test_cachedownloader.py +++ b/tests/unit/test_cachedownloader.py @@ -1,7 +1,6 @@ import json import os import platform -import time import pytest import requests @@ -10,9 +9,6 @@ from check_jsonschema.cachedownloader import ( CacheDownloader, FailedDownloadError, - _cache_hit, - _lastmod_from_response, - url_to_cache_filename, ) DEFAULT_RESPONSE_URL = "https://example.com/schema1.json" @@ -34,11 +30,6 @@ def default_response(): add_default_response() -def test_default_filename_from_uri(default_response): - cd = CacheDownloader("downloads").bind(DEFAULT_RESPONSE_URL) - assert cd._filename == url_to_cache_filename(DEFAULT_RESPONSE_URL) - - @pytest.mark.parametrize( "sysname, fakeenv, expect_value", [ @@ -92,48 +83,10 @@ def fake_expanduser(path): assert expanduser_path is None -def test_cache_hit_by_mtime(monkeypatch, default_response): - monkeypatch.setattr(os.path, "exists", lambda x: True) - - # local mtime = NOW, cache hit - monkeypatch.setattr(os.path, "getmtime", lambda x: time.time()) - assert _cache_hit( - "/tmp/schema1.json", - requests.get(DEFAULT_RESPONSE_URL, stream=True), - ) - - # local mtime = 0, cache miss - monkeypatch.setattr(os.path, "getmtime", lambda x: 0) - assert ( - _cache_hit( - "/tmp/schema1.json", - requests.get(DEFAULT_RESPONSE_URL, stream=True), - ) - is False - ) - - -def test_cachedownloader_cached_file(tmp_path, monkeypatch, default_response): - # create a file - f = tmp_path / "foo.json" - f.write_text("{}") - - # set the cache_dir to the tmp dir 
(so that cache_dir will always be set) - cd = CacheDownloader(tmp_path).bind(str(f), filename="foo.json") - # patch the downloader to skip any download "work" - monkeypatch.setattr( - cd._downloader, "_download", lambda file_uri, filename, response_ok: str(f) - ) - - with cd.open() as fp: - assert fp.read() == b"{}" - - @pytest.mark.parametrize("disable_cache", (True, False)) def test_cachedownloader_on_success( - get_download_cache_loc, disable_cache, default_response + disable_cache, default_response, downloads_cache_dir ): - f = get_download_cache_loc(DEFAULT_RESPONSE_URL) cd = CacheDownloader("downloads", disable_cache=disable_cache).bind( DEFAULT_RESPONSE_URL ) @@ -141,26 +94,24 @@ def test_cachedownloader_on_success( with cd.open() as fp: assert fp.read() == b"{}" if disable_cache: - assert not f.exists() + assert not downloads_cache_dir.exists() else: - assert f.exists() + assert downloads_cache_dir.exists() -def test_cachedownloader_using_alternate_target_dir( - cache_dir, default_response, url2cachepath -): - cache_dir = cache_dir / "check_jsonschema" / "otherdir" - f = url2cachepath(cache_dir, DEFAULT_RESPONSE_URL) +def test_cachedownloader_using_alternate_target_dir(cache_dir, default_response): cd = CacheDownloader("otherdir").bind(DEFAULT_RESPONSE_URL) with cd.open() as fp: assert fp.read() == b"{}" - assert f.exists() + + # Cache directory is created for the alternate target dir + assert cache_dir.joinpath("check_jsonschema", "otherdir").exists() @pytest.mark.parametrize("disable_cache", (True, False)) @pytest.mark.parametrize("failures", (1, 2, requests.ConnectionError)) def test_cachedownloader_succeeds_after_few_errors( - get_download_cache_loc, disable_cache, failures + disable_cache, failures, downloads_cache_dir ): if isinstance(failures, int): for _i in range(failures): @@ -178,7 +129,6 @@ def test_cachedownloader_succeeds_after_few_errors( match_querystring=None, ) add_default_response() - f = get_download_cache_loc(DEFAULT_RESPONSE_URL) cd = 
CacheDownloader("downloads", disable_cache=disable_cache).bind( DEFAULT_RESPONSE_URL ) @@ -186,15 +136,15 @@ def test_cachedownloader_succeeds_after_few_errors( with cd.open() as fp: assert fp.read() == b"{}" if disable_cache: - assert not f.exists() + assert not downloads_cache_dir.exists() else: - assert f.exists() + assert downloads_cache_dir.exists() @pytest.mark.parametrize("disable_cache", (True, False)) @pytest.mark.parametrize("connection_error", (True, False)) def test_cachedownloader_fails_after_many_errors( - get_download_cache_loc, disable_cache, connection_error + disable_cache, connection_error, downloads_cache_dir ): for _i in range(10): if connection_error: @@ -212,18 +162,20 @@ def test_cachedownloader_fails_after_many_errors( match_querystring=None, ) add_default_response() # never reached, the 11th response - f = get_download_cache_loc(DEFAULT_RESPONSE_URL) cd = CacheDownloader("downloads", disable_cache=disable_cache).bind( DEFAULT_RESPONSE_URL ) with pytest.raises(FailedDownloadError): with cd.open(): pass - assert not f.exists() + + # Cache directory is created only when caching is enabled + # (even though the request failed, the session was built) + assert downloads_cache_dir.exists() is not disable_cache @pytest.mark.parametrize("disable_cache", (True, False)) -def test_cachedownloader_retries_on_bad_data(get_download_cache_loc, disable_cache): +def test_cachedownloader_retries_on_bad_data(disable_cache, downloads_cache_dir): responses.add( "GET", DEFAULT_RESPONSE_URL, @@ -232,7 +184,6 @@ def test_cachedownloader_retries_on_bad_data(get_download_cache_loc, disable_cac match_querystring=None, ) add_default_response() - f = get_download_cache_loc(DEFAULT_RESPONSE_URL) cd = CacheDownloader( "downloads", disable_cache=disable_cache, @@ -245,139 +196,39 @@ def test_cachedownloader_retries_on_bad_data(get_download_cache_loc, disable_cac assert fp.read() == b"{}" if disable_cache: - assert not f.exists() + assert not downloads_cache_dir.exists() 
else: - assert f.exists() + assert downloads_cache_dir.exists() -@pytest.mark.parametrize("file_exists", (True, False)) -@pytest.mark.parametrize( - "failure_mode", ("header_missing", "header_malformed", "time_overflow") -) -def test_cachedownloader_handles_bad_lastmod_header( - monkeypatch, - get_download_cache_loc, - inject_cached_download, - file_exists, - failure_mode, -): - if failure_mode == "header_missing": - responses.add( - "GET", DEFAULT_RESPONSE_URL, headers={}, json={}, match_querystring=None - ) - elif failure_mode == "header_malformed": - responses.add( - "GET", - DEFAULT_RESPONSE_URL, - headers={"Last-Modified": "Jan 2000 00:00:01"}, - json={}, - match_querystring=None, - ) - elif failure_mode == "time_overflow": - add_default_response() - - def fake_timegm(*args): - raise OverflowError("uh-oh") - - monkeypatch.setattr("calendar.timegm", fake_timegm) - else: - raise NotImplementedError - - original_file_contents = b'{"foo": "bar"}' - file_path = get_download_cache_loc(DEFAULT_RESPONSE_URL) - - assert not file_path.exists() - if file_exists: - inject_cached_download(DEFAULT_RESPONSE_URL, original_file_contents) - - cd = CacheDownloader("downloads").bind(DEFAULT_RESPONSE_URL) - - # if the file already existed, it will not be overwritten by the cachedownloader - # so the returned value for both the downloader and a direct file read should be the - # original contents - if file_exists: - with cd.open() as fp: - assert fp.read() == original_file_contents - assert file_path.read_bytes() == original_file_contents - # otherwise, the file will have been created with new content - # both reads will show that new content - else: - with cd.open() as fp: - assert fp.read() == b"{}" - assert file_path.read_bytes() == b"{}" - - # at the end, the file always exists on disk - assert file_path.exists() - - -def test_cachedownloader_validation_is_not_invoked_on_hit( - monkeypatch, default_response, inject_cached_download -): - """ - Regression test for 
https://github.com/python-jsonschema/check-jsonschema/issues/453 - - This was a bug in which the validation callback was invoked eagerly, even on a cache - hit. As a result, cache hits did not demonstrate their expected performance gain. - """ - # 1: construct some perfectly good data (it doesn't really matter what it is) - # <> - # 2: put equivalent data on disk - inject_cached_download(DEFAULT_RESPONSE_URL, "{}") - - # 3: construct a validator which marks that it ran in a variable - validator_ran = False - - def dummy_validate_bytes(data): - nonlocal validator_ran - validator_ran = True - - # construct a downloader pointed at the schema and file, expecting a cache hit - # and use the above validation method - cd = CacheDownloader("downloads").bind( - DEFAULT_RESPONSE_URL, - validation_callback=dummy_validate_bytes, - ) - - # read data from the downloader - with cd.open() as fp: - assert fp.read() == b"{}" - # assert that the validator was not run - assert validator_ran is False - +def test_disable_cache_uses_plain_session(): + """When disable_cache=True, verify _build_session returns a plain Session.""" + cd = CacheDownloader("downloads", disable_cache=True) + session = cd._build_session() + # A plain requests.Session does not have CacheControlAdapter + assert type(session) is requests.Session -@pytest.mark.skipif( - platform.system() == "Windows", - reason="time.tzset() is not available on Windows", -) -def test_lastmod_from_header_uses_gmtime(request, monkeypatch, default_response): - """ - Regression test for https://github.com/python-jsonschema/check-jsonschema/pull/565 - - The time was converted in local time, when UTC/GMT was desired. 
- """ - - def final_tzset(): - time.tzset() - request.addfinalizer(final_tzset) +def test_enable_cache_uses_cachecontrol_session(tmp_path, patch_cache_dir): + """When disable_cache=False, verify _build_session returns a CacheControl session.""" + from cachecontrol import CacheControlAdapter - response = requests.get(DEFAULT_RESPONSE_URL, stream=True) + cd = CacheDownloader("downloads", disable_cache=False) + session = cd._build_session() + # CacheControl wraps the session and attaches CacheControlAdapter + assert isinstance(session.get_adapter("https://"), CacheControlAdapter) + assert isinstance(session.get_adapter("http://"), CacheControlAdapter) - with monkeypatch.context() as m: - m.setenv("TZ", "GMT0") - time.tzset() - gmt_parsed_time = _lastmod_from_response(response) - with monkeypatch.context() as m: - m.setenv("TZ", "EST5") - time.tzset() - est_parsed_time = _lastmod_from_response(response) - - with monkeypatch.context() as m: - m.setenv("TZ", "UTC0") - time.tzset() - utc_parsed_time = _lastmod_from_response(response) - - # assert that they all match - assert gmt_parsed_time == utc_parsed_time - assert gmt_parsed_time == est_parsed_time +def test_cache_dir_none_uses_plain_session(monkeypatch, patch_cache_dir): + """When _resolve_cache_dir returns None, _build_session returns plain Session.""" + # Undo the patch and simulate Windows with no env vars + patch_cache_dir.undo() + monkeypatch.delenv("LOCALAPPDATA", raising=False) + monkeypatch.delenv("APPDATA", raising=False) + monkeypatch.setattr(platform, "system", lambda: "Windows") + + cd = CacheDownloader("downloads", disable_cache=False) + assert cd._cache_dir is None + session = cd._build_session() + assert type(session) is requests.Session From 4e3a59eba0bc6201f8f012fb068678e9184ce5a3 Mon Sep 17 00:00:00 2001 From: Kyle Altendorf Date: Tue, 14 Apr 2026 08:07:39 -0400 Subject: [PATCH 2/7] review comment: hide --log-level CLI option Add hidden=True to reduce help output clutter. 
The option remains functional but won't appear in --help. https://github.com/python-jsonschema/check-jsonschema/pull/671#discussion_r3067143180 --- src/check_jsonschema/cli/main_command.py | 1 + 1 file changed, 1 insertion(+) diff --git a/src/check_jsonschema/cli/main_command.py b/src/check_jsonschema/cli/main_command.py index 96dc9d5cb..766f2db11 100644 --- a/src/check_jsonschema/cli/main_command.py +++ b/src/check_jsonschema/cli/main_command.py @@ -103,6 +103,7 @@ def pretty_helptext_list(values: list[str] | tuple[str, ...]) -> str: @click.version_option() @click.option( "--log-level", + hidden=True, help="Set the log level for debug output (e.g., DEBUG, INFO, WARNING).", type=click.Choice(["DEBUG", "INFO", "WARNING", "ERROR"], case_sensitive=False), callback=configure_logging, From 7477e23ae89cec06d339575f946f29bf51bdc7f2 Mon Sep 17 00:00:00 2001 From: Kyle Altendorf Date: Tue, 14 Apr 2026 08:10:18 -0400 Subject: [PATCH 3/7] review comment: use functools.cached_property for _session Simplify the lazy session initialization pattern by replacing the manual property+builder with cached_property. 
https://github.com/python-jsonschema/check-jsonschema/pull/671#discussion_r3067251298 --- src/check_jsonschema/cachedownloader.py | 9 ++------- 1 file changed, 2 insertions(+), 7 deletions(-) diff --git a/src/check_jsonschema/cachedownloader.py b/src/check_jsonschema/cachedownloader.py index c75cd932e..f0f03b169 100644 --- a/src/check_jsonschema/cachedownloader.py +++ b/src/check_jsonschema/cachedownloader.py @@ -1,6 +1,7 @@ from __future__ import annotations import contextlib +import functools import io import logging import os @@ -77,15 +78,9 @@ class CacheDownloader: def __init__(self, cache_dir: str, *, disable_cache: bool = False) -> None: self._cache_dir = _resolve_cache_dir(cache_dir) self._disable_cache = disable_cache - self._cached_session: requests.Session | None = None - @property + @functools.cached_property def _session(self) -> requests.Session: - if self._cached_session is None: - self._cached_session = self._build_session() - return self._cached_session - - def _build_session(self) -> requests.Session: session = requests.Session() if self._cache_dir and not self._disable_cache: log.debug("using cache dir: %s", self._cache_dir) From f9e40b9dc6adb302e87bf3137a99f7a32f77b853 Mon Sep 17 00:00:00 2001 From: Kyle Altendorf Date: Tue, 14 Apr 2026 08:18:49 -0400 Subject: [PATCH 4/7] review comment: remove LLM planning artifact The plan.md file was used during development but shouldn't be committed to the repository. 
https://github.com/python-jsonschema/check-jsonschema/pull/671#discussion_r3067288913 --- plan.md | 493 -------------------------------------------------------- 1 file changed, 493 deletions(-) delete mode 100644 plan.md diff --git a/plan.md b/plan.md deleted file mode 100644 index 4b8173e71..000000000 --- a/plan.md +++ /dev/null @@ -1,493 +0,0 @@ -# Plan: Replace Custom Cache with CacheControl - -Fixes https://github.com/python-jsonschema/check-jsonschema/issues/668 - -## Problem - -The current cache implementation in `cachedownloader.py` uses only the -`Last-Modified` header for cache validation. When a server omits that header -(but provides `ETag` or `Cache-Control`), `_lastmod_from_response()` returns -`0.0` and `_cache_hit()` always returns `True`, so the cache never invalidates. - -## Solution - -Replace the hand-rolled mtime-based cache logic with the -[CacheControl](https://github.com/psf/cachecontrol) library (v0.14, maintained -under the PSF). CacheControl wraps a `requests.Session` and implements full HTTP -caching semantics — `ETag`/`If-None-Match`, `Last-Modified`/`If-Modified-Since`, -and `Cache-Control` headers — transparently. On-disk persistence uses its -`FileCache` backend (backed by `filelock` for concurrency safety). 
- -## Behavior Changes - -| Aspect | Before | After | -|---|---|---| -| ETag support | None | Full (`If-None-Match` / 304) | -| `Cache-Control` header | Ignored | Respected (`max-age`, `no-cache`, `no-store`) | -| `Last-Modified` | Only mechanism (buggy when absent) | Supported via `If-Modified-Since` / 304 | -| No caching headers at all | Cache never invalidates (bug) | Not cached — see note below | -| Existing cache files on upgrade | Used | Ignored (incompatible format) — see open question below | -| Cache file format | Raw content, SHA-256 filename | CacheControl msgpack-serialized HTTP response, SHA-224 filename in 5-level dir tree | -| `--no-cache` flag | Unchanged | Unchanged — plain session, no caching | -| Retry on error / bad data | Unchanged | Unchanged — custom retry loop preserved | -| Concurrency safety | `_atomic_write` (tempfile + shutil.copy) | `filelock` (via `FileCache`) | - -### Servers with no caching headers - -When a server provides no caching headers at all (no `ETag`, no `Last-Modified`, -no `Cache-Control`, no `Expires`), the current code caches the response forever -due to the `_lastmod_from_response` returning `0.0` bug. After this change, such -responses will not be cached, following correct HTTP semantics. In practice, -nearly all real-world schema endpoints provide at least one caching header. - -CacheControl has built-in heuristic support for overriding this behavior if -needed. The `ExpiresAfter` heuristic (bundled, not custom) can apply a TTL to all -responses. For a conditional variant that only applies to responses lacking -server-provided caching headers, the documented `BaseHeuristic` extensibility -point allows a small subclass (~10 lines). Neither is a hack — heuristics are a -first-class CacheControl feature designed for exactly this purpose, and -[RFC 7234](https://tools.ietf.org/html/rfc7234) explicitly permits caching -systems to use heuristics for responses that lack caching headers. 
- -### Open question: clean up old cache files? - -CacheControl uses the same cache directories (`check_jsonschema/schemas/`, -`check_jsonschema/refs/`) but a different file layout (5-level SHA-224 tree vs. -flat SHA-256 files). Old cache files will sit alongside the new CacheControl -tree, ignored but taking up space. - -Options: -1. **Do nothing** (current default) — old files are harmless. Users can manually - delete the cache directory if they want to reclaim space. -2. **Delete on first run** — on `_build_session()`, detect and remove files - matching the old naming pattern (64-char hex + optional extension, no - subdirectories). Risk: could delete user files if they happen to match. -3. **One-time migration warning** — log a message suggesting users clear their - cache directory. Non-invasive but adds noise. - -**Proceeding with option 1 (do nothing) unless revisited.** - ---- - -## 1. Dependency Changes - -**File: `pyproject.toml`** (line 39-46) - -Add `CacheControl[filecache]` to runtime dependencies: - -```toml -dependencies = [ - 'tomli>=2.0;python_version<"3.11"', - "ruamel.yaml>=0.18.10,<0.20.0", - "jsonschema>=4.18.0,<5.0", - "regress>=2024.11.1", - "requests<3.0", - "click>=8,<9", - "CacheControl[filecache]>=0.14,<0.15", -] -``` - -The `[filecache]` extra pulls in `filelock`. CacheControl also depends on -`msgpack` (for response serialization) and `requests` (already a dependency). - ---- - -## 2. Rewrite `src/check_jsonschema/cachedownloader.py` - -### Remove - -| Symbol | Lines | Reason | -|---|---|---| -| `_LASTMOD_FMT` | 16 | CacheControl handles Last-Modified natively | -| `_lastmod_from_response()` | 45-54 | Same | -| `_cache_hit()` | 88-97 | CacheControl handles freshness and conditional requests | -| `_atomic_write()` | 77-85 | FileCache uses `filelock` for concurrency | - -### Preserve existing imports - -`from __future__ import annotations` (line 1) stays. `hashlib` stays (used by -`url_to_cache_filename`). 
`platform` stays (used by `_base_cache_dir`). - -### Remove imports no longer needed - -`calendar`, `shutil`, `tempfile`, `time` — verify each is truly unused after -the rewrite. - -### Add imports (top-level, third-party group) - -```python -import cachecontrol -from cachecontrol.caches.file_cache import FileCache -``` - -These go in the third-party import group alongside `import requests`, following -the project's PEP 8 / isort convention (no lazy imports in production code). - -### Keep unchanged - -| Symbol | Lines | Reason | -|---|---|---| -| `_base_cache_dir()` | 19-35 | Still needed for platform-specific cache path | -| `_resolve_cache_dir()` | 38-42 | Still needed | -| `url_to_cache_filename()` | 100-116 | Useful utility; not used by CacheControl but no reason to remove | -| `FailedDownloadError` | 119-120 | Still raised by retry logic | - -### Modify `_get_request()` (lines 57-74) - -- Add `session: requests.Session` as the first parameter. -- Call `session.get(file_url)` instead of `requests.get(file_url, stream=True)`. -- Remove `stream=True` — CacheControl needs the full response to cache it, and - schemas are small. -- Keep the retry loop (2 retries) and `response_ok` callback — CacheControl does - not provide retry logic. - -**Retry / CacheControl interaction:** When a server returns a 200 with corrupt -body *and* caching headers (e.g., ETag), CacheControl caches the response before -our `response_ok` callback runs. On a naive retry, `session.get()` would serve -the cached corrupt response and every retry would fail identically. To avoid -this, retry attempts (`_attempt > 0`) pass `Cache-Control: no-cache` in the -request headers. This is standard HTTP (RFC 7234 §5.2.1.4) — it tells -CacheControl to revalidate with the server rather than serve from its local -cache. CacheControl implements this natively; no custom logic is needed. 
- -```python -def _get_request( - session: requests.Session, - file_url: str, - *, - response_ok: t.Callable[[requests.Response], bool], -) -> requests.Response: - num_retries = 2 - r: requests.Response | None = None - for _attempt in range(num_retries + 1): - try: - # On retries, bypass CacheControl's local cache to avoid - # re-serving a cached bad response. - headers = {"Cache-Control": "no-cache"} if _attempt > 0 else {} - r = session.get(file_url, headers=headers) - except requests.RequestException as e: - if _attempt == num_retries: - raise FailedDownloadError("encountered error during download") from e - continue - if r.ok and response_ok(r): - return r - assert r is not None - raise FailedDownloadError( - f"got response with status={r.status_code}, retries exhausted" - ) -``` - -### Rewrite `CacheDownloader` (lines 123-180) - -- `__init__`: store config only. Do **not** build the session eagerly — - `test_default_cache_dir` creates `CacheDownloader` with fake/relative cache - paths purely to assert `_cache_dir` resolution and never makes HTTP requests. - Eager `os.makedirs` + `FileCache()` in `__init__` would create real - directories using those fake paths. -- Build the session lazily via a `_session` property on first access (i.e., - first `open()` call). This also avoids creating sessions that are never used - (e.g., if the schema turns out to be local after all). -- Remove `_download()` entirely — CacheControl manages on-disk cache files. -- Simplify `open()` — always yields `io.BytesIO(response.content)`. No - branching needed: when cache is enabled, `self._session` is a - CacheControl-wrapped session (handles caching transparently); when disabled, - it is a plain `Session`. -- `bind()` — unchanged API. 
- -CacheControl imports go at the **top of the module** alongside `requests`, -following the project convention (no lazy/deferred imports in production code): - -```python -import cachecontrol -from cachecontrol.caches.file_cache import FileCache -``` - -```python -class CacheDownloader: - def __init__(self, cache_dir: str, *, disable_cache: bool = False) -> None: - self._cache_dir = _resolve_cache_dir(cache_dir) - self._disable_cache = disable_cache - self._cached_session: requests.Session | None = None - - @property - def _session(self) -> requests.Session: - if self._cached_session is None: - self._cached_session = self._build_session() - return self._cached_session - - def _build_session(self) -> requests.Session: - session = requests.Session() - if self._cache_dir and not self._disable_cache: - os.makedirs(self._cache_dir, exist_ok=True) - session = cachecontrol.CacheControl( - session, cache=FileCache(self._cache_dir) - ) - return session - - @contextlib.contextmanager - def open( - self, - file_url: str, - filename: str, - validate_response: t.Callable[[requests.Response], bool], - ) -> t.Iterator[t.IO[bytes]]: - response = _get_request( - self._session, file_url, response_ok=validate_response - ) - yield io.BytesIO(response.content) - - def bind( - self, - file_url: str, - filename: str | None = None, - validation_callback: t.Callable[[bytes], t.Any] | None = None, - ) -> BoundCacheDownloader: - return BoundCacheDownloader( - file_url, self, filename=filename, - validation_callback=validation_callback, - ) -``` - -The `filename` parameter in `open()` and `bind()` is kept for API compatibility -but ignored — CacheControl manages its own filenames (SHA-224 hash of normalized -URL in a 5-level directory tree). - -`BoundCacheDownloader.__init__` still computes `self._filename = filename or -url_to_cache_filename(file_url)` (line 193). This is now dead code — the value -is passed through to `CacheDownloader.open()` which ignores it. 
Kept for now -since `test_default_filename_from_uri` asserts on `cd._filename`. Can be cleaned -up in a follow-up. - -### Modify `BoundCacheDownloader._validate_response()` (lines 206-213) - -Skip validation on cache hits. CacheControl sets `response.from_cache = True` on -every response served from cache (either a direct hit or after a 304 -revalidation). This preserves the fix for -[#453](https://github.com/python-jsonschema/check-jsonschema/issues/453) -(validation callback must not run on cache hits). - -```python -def _validate_response(self, response: requests.Response) -> bool: - if not self._validation_callback: - return True - # CacheControl sets from_cache=True on cache hits; skip re-validation - if getattr(response, "from_cache", False): - return True - try: - self._validation_callback(response.content) - return True - except ValueError: - return False -``` - ---- - -## 3. Changes to Callers - -**No changes** to `readers.py` or `resolver.py`. Both use `CacheDownloader` via -`bind()` and `open()`, whose signatures are unchanged. - -- `readers.py:80` — `CacheDownloader("schemas", disable_cache=...).bind(url, validation_callback=...)` -- `resolver.py:53` — `CacheDownloader("refs", disable_cache=...)` - -Cache subdirectory names (`"schemas"`, `"refs"`) are preserved. - -**Update `cli/main_command.py:96`**: The `--schemafile` help text says schemas -will be `"downloaded and cached locally based on mtime."` Replace with generic -wording (e.g., `"downloaded and cached locally"`) since caching is no longer -mtime-based. - ---- - -## 4. Test Changes - -### Test strategy: trust CacheControl, test integration only - -CacheControl's `FileCache` write mechanism relies on a `CallbackFileWrapper` -that monitors when a response stream is fully consumed. The `responses` mock -library creates `BytesIO` bodies that never report `.closed = True` after -exhaustion, so the cache-write callback never fires. 
**CacheControl never -populates its FileCache when responses come from `responses` mocks.** - -This means tests cannot verify CacheControl's caching behavior (cache hits, -ETag revalidation, max-age expiry) using `responses`. Rather than switching test -infrastructure (e.g., adding a real HTTP server), the test strategy is: - -- **Trust CacheControl** for caching correctness — it is a well-maintained PSF - project with its own test suite. -- **Test our integration**: retry logic, validation callbacks, `--no-cache` flag, - session construction, error handling. -- **Do not assert** on cache file existence, `response.from_cache`, or - `len(responses.calls)` staying constant across requests. - -### `responses` library compatibility - -The `responses` library (v0.26.0) patches `HTTPAdapter.send` at the class level. -CacheControl's `CacheControlAdapter` calls `super().send()`, which hits the -`responses` mock. HTTP request/response flow works normally — only the cache-write -side effect is broken. Tests that use `responses` for mocking HTTP (retry tests, -validation tests, error tests) work fine. - -### `tests/conftest.py` — Fixture changes - -**Update `patch_cache_dir`** (line 67): Works as-is — it monkeypatches -`_base_cache_dir`, which is still called by `_resolve_cache_dir`, which is still -called by `CacheDownloader.__init__`. CacheControl's `FileCache` will use the -patched temp directory. 
- -**Remove the following fixtures** — they either inject raw cache files (no longer -valid format) or predict cache file paths using `url_to_cache_filename` -(SHA-256), which CacheControl does not use: - -| Fixture | Line | Action | -|---|---|---| -| `url2cachepath` | 77 | Remove — only consumed by the path-prediction fixtures below | -| `inject_cached_download` | 100 | Remove — writes raw content; CacheControl uses msgpack format | -| `inject_cached_ref` | 126 | Remove — same | -| `get_download_cache_loc` | 92 | Remove — path prediction no longer valid | -| `get_ref_cache_loc` | 118 | Remove — same | - -**Keep `downloads_cache_dir` / `refs_cache_dir`** (lines 87, 113) and -**keep `cache_dir`** (line 63) — still useful for asserting cache directory -existence when needed. - -### `tests/unit/test_cachedownloader.py` — Specific changes - -**Update the import block** (lines 10-16): Remove `_cache_hit` and -`_lastmod_from_response` — both symbols are deleted from `cachedownloader.py`. - -**Tests to remove:** - -| Test | Lines | Reason | -|---|---|---| -| `test_cache_hit_by_mtime` | 95-113 | mtime-based caching removed entirely | -| `test_cachedownloader_handles_bad_lastmod_header` | 257-310 | `_lastmod_from_response` removed | -| `test_lastmod_from_header_uses_gmtime` | 352-383 | Same | -| `test_cachedownloader_cached_file` | 116-129 | Tests monkeypatched `_download` method, which is removed | -| `test_cachedownloader_validation_is_not_invoked_on_hit` | 313-345 | Depends on `inject_cached_download` (removed) and `response.from_cache` (not testable with `responses` mocks) | - -**Tests to simplify** (remove `get_download_cache_loc` dependency and file- -existence assertions — these tests are about retry behavior, not caching): - -`test_cachedownloader_on_success` (line 132): -- Remove `get_download_cache_loc` fixture usage and `f.exists()` assertions. -- Keep: verify `cd.open()` returns correct content (`b"{}"`). 
- -`test_cachedownloader_succeeds_after_few_errors` (line 160): -- Remove `get_download_cache_loc` fixture usage and `f.exists()` assertions. -- Keep: verify `cd.open()` returns correct content after retries. - -`test_cachedownloader_fails_after_many_errors` (line 194): -- Remove `get_download_cache_loc` fixture usage and `f.exists()` assertion. -- Keep: verify `FailedDownloadError` is raised. - -`test_cachedownloader_retries_on_bad_data` (line 225): -- Remove `get_download_cache_loc` fixture usage and `f.exists()` assertions. -- Keep: verify `cd.open()` returns correct content after retrying past bad data. - -`test_cachedownloader_using_alternate_target_dir` (line 149): -- Remove `url2cachepath` / path-based assertions. -- Keep: verify `cd.open()` returns correct content. - -**Tests to keep as-is:** - -| Test | Lines | Notes | -|---|---|---| -| `test_default_filename_from_uri` | 37-39 | `url_to_cache_filename` unchanged | -| `test_default_cache_dir` | 42-92 | `_base_cache_dir` unchanged. Works as-is thanks to lazy session building — `CacheDownloader("downloads")` only stores `_cache_dir`, no `os.makedirs` or `FileCache` until `open()` is called. | - -**New tests to add (integration-focused):** - -| Test | Purpose | -|---|---| -| `test_disable_cache_uses_plain_session` | When `disable_cache=True`, verify `_build_session` returns a plain `requests.Session` (not wrapped by CacheControl). | -| `test_enable_cache_uses_cachecontrol_session` | When `disable_cache=False` and cache dir is valid, verify `_build_session` returns a CacheControl-wrapped session (check for `CacheControlAdapter` on the session). | -| `test_cache_dir_none_uses_plain_session` | When `_resolve_cache_dir` returns `None` (e.g., Windows with no env vars), verify `_build_session` returns a plain session. 
| - -**Aspirational cache-behavior tests** (require a real HTTP server or alternative -to `responses`; not included in this PR but tracked for follow-up): - -| Test | Purpose | -|---|---| -| `test_etag_cache_revalidation` | Server provides `ETag`, no `Last-Modified`. Second request sends `If-None-Match`, gets 304. | -| `test_cache_control_max_age` | Server provides `Cache-Control: max-age=3600`. Second request served from cache. | -| `test_last_modified_revalidation` | Server provides `Last-Modified`. Second request sends `If-Modified-Since`. | -| `test_no_caching_headers_not_cached` | Server provides no caching headers. Response is not cached. | - -### `tests/acceptance/test_remote_ref_resolution.py` - -`test_remote_ref_resolution_cache_control` (line 68): -- Remove `get_ref_cache_loc` fixture dependency and file-path assertions. -- When `disable_cache=False`: verify the command succeeds (exit code 0). - Cache behavior is trusted to CacheControl. -- When `disable_cache=True`: verify the command succeeds (exit code 0). - -`test_remote_ref_resolution_loads_from_cache` (line 105): -- Remove `inject_cached_ref` and `get_ref_cache_loc` fixture dependencies. -- This test's premise (inject good cache data, serve bad HTTP, verify cache is - used) cannot be replicated without direct FileCache manipulation. **Remove - this test.** Cache-hit behavior is trusted to CacheControl. - -`test_remote_ref_resolution_callout_count_is_scale_free_in_instancefiles` -(line 251): -- Should work as-is. Within-run deduplication is handled by `lru_cache` on - `_get_validator` and the in-memory `ResourceCache` — not by the file cache. - CacheControl's file cache only affects cross-run persistence. - -### `tests/acceptance/test_nonjson_schema_handling.py` - -`test_can_load_remote_yaml_schema_ref_from_cache` (line 140): -- Remove `inject_cached_ref` fixture dependency. This test's premise (inject - good cache, serve bad HTTP) cannot be replicated. 
**Remove this test.**
- Cache-hit behavior is trusted to CacheControl.

---

## 5. Pytest Warning Filter

CacheControl's `FileCache` uses `filelock` for concurrency safety, but the
warning addressed here comes from CacheControl's `CallbackFileWrapper`, which
holds an open `NamedTemporaryFile` while a response streams. When connection
errors occur during tests (e.g., `test_cachedownloader_fails_after_many_errors`),
the temporary file is never closed, triggering a `ResourceWarning` that pytest
converts to `PytestUnraisableExceptionWarning`.

This is not a bug in our code — it's a side effect of how CacheControl's
`CallbackFileWrapper` handles error conditions (see investigation below; the
warning was initially misattributed to filelock). Add a filter to
`pyproject.toml` to ignore this specific warning:

```toml
[tool.pytest.ini_options]
filterwarnings = [
    # ... existing filters ...
    # CacheControl's CallbackFileWrapper may leave temp files open during connection errors
    'ignore:Exception ignored in.*FileIO.*:pytest.PytestUnraisableExceptionWarning',
]
```

**TODO:** Investigate whether this warning can be avoided rather than suppressed.

**Investigation results (2026-03-27):**
- The warning originates from CacheControl's `CallbackFileWrapper` class in
  `filewrapper.py`, not filelock.
- `CallbackFileWrapper.__init__` creates a `NamedTemporaryFile` (line 37), which
  is only closed in `_close()` when the response is fully read (line 105-106).
- When a `requests.ConnectionError` occurs, the temp file is never closed because
  `_close()` is never called.
- No existing upstream issues found in CacheControl or filelock for this specific
  problem.
- The `CallbackFileWrapper` class lacks a `__del__` method or context manager
  protocol that would ensure cleanup on exceptions.
- **Recommendation:** File an upstream issue with CacheControl suggesting that
  `CallbackFileWrapper` implement `__del__` to close `self.__buf` if not already
  closed, or use a weak reference callback to ensure cleanup.

---

## 6. Execution Order

1. Add `CacheControl[filecache]` dependency to `pyproject.toml`
2. Rewrite `cachedownloader.py`
3. Update `conftest.py` fixtures
4. 
Rewrite/update unit tests in `test_cachedownloader.py` -5. Update acceptance tests -6. Add pytest warning filter for filelock ResourceWarning -7. Run full test suite, fix any `responses` library compatibility issues -8. Commit, push, open PR From 7b9a1a6d216c0a4be9d50e5fc56f15054cce267e Mon Sep 17 00:00:00 2001 From: Kyle Altendorf Date: Tue, 14 Apr 2026 08:20:16 -0400 Subject: [PATCH 5/7] fix tests: use _session instead of removed _build_session The cached_property refactor removed _build_session, update tests to access _session directly. https://github.com/python-jsonschema/check-jsonschema/pull/671#discussion_r3067251298 --- tests/unit/test_cachedownloader.py | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) diff --git a/tests/unit/test_cachedownloader.py b/tests/unit/test_cachedownloader.py index 402bb2c56..2f2da83d7 100644 --- a/tests/unit/test_cachedownloader.py +++ b/tests/unit/test_cachedownloader.py @@ -202,26 +202,26 @@ def test_cachedownloader_retries_on_bad_data(disable_cache, downloads_cache_dir) def test_disable_cache_uses_plain_session(): - """When disable_cache=True, verify _build_session returns a plain Session.""" + """When disable_cache=True, verify _session returns a plain Session.""" cd = CacheDownloader("downloads", disable_cache=True) - session = cd._build_session() + session = cd._session # A plain requests.Session does not have CacheControlAdapter assert type(session) is requests.Session def test_enable_cache_uses_cachecontrol_session(tmp_path, patch_cache_dir): - """When disable_cache=False, verify _build_session returns a CacheControl session.""" + """When disable_cache=False, verify _session returns a CacheControl session.""" from cachecontrol import CacheControlAdapter cd = CacheDownloader("downloads", disable_cache=False) - session = cd._build_session() + session = cd._session # CacheControl wraps the session and attaches CacheControlAdapter assert isinstance(session.get_adapter("https://"), CacheControlAdapter) assert 
isinstance(session.get_adapter("http://"), CacheControlAdapter) def test_cache_dir_none_uses_plain_session(monkeypatch, patch_cache_dir): - """When _resolve_cache_dir returns None, _build_session returns plain Session.""" + """When _resolve_cache_dir returns None, _session returns plain Session.""" # Undo the patch and simulate Windows with no env vars patch_cache_dir.undo() monkeypatch.delenv("LOCALAPPDATA", raising=False) @@ -230,5 +230,5 @@ def test_cache_dir_none_uses_plain_session(monkeypatch, patch_cache_dir): cd = CacheDownloader("downloads", disable_cache=False) assert cd._cache_dir is None - session = cd._build_session() + session = cd._session assert type(session) is requests.Session From a2655ebacbfe62132342ce94840d0d01f6aee31e Mon Sep 17 00:00:00 2001 From: Kyle Altendorf Date: Tue, 14 Apr 2026 08:45:17 -0400 Subject: [PATCH 6/7] review comment: use cache.delete() for bad data invalidation Replace no-cache header workaround with explicit cache deletion when retrying after a failed response. This ensures bad data doesn't remain stuck in the cache, as confirmed by the cachecontrol maintainer that BaseCache.delete() is part of the public API. 
https://github.com/python-jsonschema/check-jsonschema/pull/671#discussion_r3067285727 --- src/check_jsonschema/cachedownloader.py | 32 +++++++++++++++---------- 1 file changed, 19 insertions(+), 13 deletions(-) diff --git a/src/check_jsonschema/cachedownloader.py b/src/check_jsonschema/cachedownloader.py index f0f03b169..188ddb7b2 100644 --- a/src/check_jsonschema/cachedownloader.py +++ b/src/check_jsonschema/cachedownloader.py @@ -11,6 +11,7 @@ import cachecontrol import requests from cachecontrol.caches.file_cache import FileCache +from cachecontrol.controller import CacheController log = logging.getLogger(__name__) @@ -46,18 +47,17 @@ def _get_request( file_url: str, *, response_ok: t.Callable[[requests.Response], bool], + cache: FileCache | None = None, ) -> requests.Response: num_retries = 2 r: requests.Response | None = None for _attempt in range(num_retries + 1): + # Delete bad cache entry before retry so we fetch fresh data + if cache is not None and _attempt > 0: + cache_key = CacheController.cache_url(file_url) + cache.delete(cache_key) try: - # On retries, bypass CacheControl's local cache to avoid - # re-serving a cached bad response. Ideally we'd delete the - # cache entry directly, but CacheControl doesn't expose a public - # API for this (see https://github.com/psf/cachecontrol/issues/219). - # The no-cache header forces revalidation with the origin server. 
- headers = {"Cache-Control": "no-cache"} if _attempt > 0 else {} - r = session.get(file_url, headers=headers) + r = session.get(file_url) except requests.RequestException as e: if _attempt == num_retries: raise FailedDownloadError("encountered error during download") from e @@ -79,15 +79,19 @@ def __init__(self, cache_dir: str, *, disable_cache: bool = False) -> None: self._cache_dir = _resolve_cache_dir(cache_dir) self._disable_cache = disable_cache + @functools.cached_property + def _cache(self) -> FileCache | None: + if self._cache_dir and not self._disable_cache: + os.makedirs(self._cache_dir, exist_ok=True) + return FileCache(self._cache_dir) + return None + @functools.cached_property def _session(self) -> requests.Session: session = requests.Session() - if self._cache_dir and not self._disable_cache: + if self._cache is not None: log.debug("using cache dir: %s", self._cache_dir) - os.makedirs(self._cache_dir, exist_ok=True) - session = cachecontrol.CacheControl( - session, cache=FileCache(self._cache_dir) - ) + session = cachecontrol.CacheControl(session, cache=self._cache) else: log.debug("caching disabled") return session @@ -98,7 +102,9 @@ def open( file_url: str, validate_response: t.Callable[[requests.Response], bool], ) -> t.Iterator[t.IO[bytes]]: - response = _get_request(self._session, file_url, response_ok=validate_response) + response = _get_request( + self._session, file_url, response_ok=validate_response, cache=self._cache + ) yield io.BytesIO(response.content) def bind( From 9e0463bce3b82e6b1a7bbcf8f079be3181d032e0 Mon Sep 17 00:00:00 2001 From: Kyle Altendorf Date: Tue, 14 Apr 2026 09:41:16 -0400 Subject: [PATCH 7/7] review comment: restore cache-loading tests with cachecontrol compatibility Enhance the mocked_responses fixture to work properly with cachecontrol's caching mechanism by patching responses._handle_body to return a BytesIO subclass that properly signals 'closed' after all data is read. 
This allows restoring the previously deleted cache-loading tests: - test_remote_ref_resolution_loads_from_cache - test_can_load_remote_yaml_schema_ref_from_cache The tests now use a two-run approach: first run populates the cache with good data (using proper HTTP cache headers), second run verifies the cached data is used even when the mocked HTTP responses return errors. https://github.com/python-jsonschema/check-jsonschema/pull/671#discussion_r3067294414 --- .../test_nonjson_schema_handling.py | 36 +++++++++++++ .../acceptance/test_remote_ref_resolution.py | 53 +++++++++++++++++++ tests/conftest.py | 50 +++++++++++++++++ 3 files changed, 139 insertions(+) diff --git a/tests/acceptance/test_nonjson_schema_handling.py b/tests/acceptance/test_nonjson_schema_handling.py index ffa325bf1..9a75111ec 100644 --- a/tests/acceptance/test_nonjson_schema_handling.py +++ b/tests/acceptance/test_nonjson_schema_handling.py @@ -135,3 +135,39 @@ def test_can_load_remote_yaml_schema_ref(run_line, tmp_path, passing_data): result = run_line(["check-jsonschema", "--schemafile", retrieval_uri, str(doc)]) assert result.exit_code == (0 if passing_data else 1) + + +def test_can_load_remote_yaml_schema_ref_from_cache( + run_line, tmp_path, cacheable_headers +): + retrieval_uri = "https://example.org/retrieval/schemas/main.yaml" + ref_loc = "https://example.org/retrieval/schemas/title_schema.yaml" + + # First: add good responses with cache headers + responses.add( + "GET", + retrieval_uri, + body="""\ +"$schema": "http://json-schema.org/draft-07/schema" +properties: + "title": {"$ref": "./title_schema.yaml"} +additionalProperties: false +""", + headers=cacheable_headers, + ) + responses.add("GET", ref_loc, body="type: string", headers=cacheable_headers) + + # Then: add bad responses (used if cache doesn't work) + responses.add("GET", retrieval_uri, body="error", status=500) + responses.add("GET", ref_loc, body="error", status=500) + + doc = tmp_path / "doc.json" + 
doc.write_text(json.dumps(PASSING_DOCUMENT)) + + # First run: populates cache + result1 = run_line(["check-jsonschema", "--schemafile", retrieval_uri, str(doc)]) + assert result1.exit_code == 0 + + # Second run: should use cached data (not the 500 errors) + result2 = run_line(["check-jsonschema", "--schemafile", retrieval_uri, str(doc)]) + assert result2.exit_code == 0 diff --git a/tests/acceptance/test_remote_ref_resolution.py b/tests/acceptance/test_remote_ref_resolution.py index 59670cdf6..38336c603 100644 --- a/tests/acceptance/test_remote_ref_resolution.py +++ b/tests/acceptance/test_remote_ref_resolution.py @@ -96,6 +96,59 @@ def test_remote_ref_resolution_cache_control( assert loc.exists() +@pytest.mark.parametrize("casename", ("case1", "case2")) +@pytest.mark.parametrize("check_passes", (True, False)) +def test_remote_ref_resolution_loads_from_cache( + run_line, tmp_path, casename, check_passes, cacheable_headers +): + main_schema_loc = "https://example.com/main.json" + + # First: add good responses with cache headers + responses.add( + "GET", + main_schema_loc, + json=CASES[casename]["main_schema"], + headers=cacheable_headers, + ) + for name, subschema in CASES[casename]["other_schemas"].items(): + responses.add( + "GET", + f"https://example.com/{name}.json", + json=subschema, + headers=cacheable_headers, + ) + + # Then: add bad responses (used if cache doesn't work) + responses.add("GET", main_schema_loc, json={}, status=500) + for name in CASES[casename]["other_schemas"]: + responses.add("GET", f"https://example.com/{name}.json", json={}, status=500) + + instance_path = tmp_path / "instance.json" + instance_path.write_text( + json.dumps( + CASES[casename]["passing_document"] + if check_passes + else CASES[casename]["failing_document"] + ) + ) + + # First run: populates cache with good data + result1 = run_line( + ["check-jsonschema", "--schemafile", main_schema_loc, str(instance_path)] + ) + assert result1.exit_code == (0 if check_passes else 1) + + # 
Second run: should use cached data (not the 500 errors) + result2 = run_line( + ["check-jsonschema", "--schemafile", main_schema_loc, str(instance_path)] + ) + output = f"\nstdout:\n{result2.stdout}\n\nstderr:\n{result2.stderr}" + if check_passes: + assert result2.exit_code == 0, output + else: + assert result2.exit_code == 1, output + + # this test ensures that `$id` is preferred for the base URI over # the retrieval URI @pytest.mark.parametrize("check_passes", (True, False)) diff --git a/tests/conftest.py b/tests/conftest.py index 82dc3139d..523c29562 100644 --- a/tests/conftest.py +++ b/tests/conftest.py @@ -1,13 +1,42 @@ import inspect +import io import os import pathlib import sys +from email.utils import formatdate import pytest import responses from click.testing import CliRunner +class _CacheControlCompatibleBytesIO(io.BytesIO): + """A BytesIO that signals closed to cachecontrol after all data is read. + + cachecontrol's CallbackFileWrapper checks `fp.fp is None` to determine + if the response has been fully read. Standard BytesIO doesn't have an + `fp` attribute, so cachecontrol falls back to checking `fp.closed`, + which only returns True after explicit `.close()`. This class adds + an `fp` property that returns None after all data has been read, + allowing cachecontrol to properly cache responses from the `responses` + mock library. 
+ """ + + def __init__(self, *args, **kwargs) -> None: + super().__init__(*args, **kwargs) + self._fully_read = False + + @property + def fp(self): + return None if self._fully_read else self + + def read(self, size=-1): + data = super().read(size) + if self.tell() == len(self.getvalue()): + self._fully_read = True + return data + + @pytest.fixture def cli_runner(): # compatibility for click==8.2.0 vs click<=8.1 @@ -19,10 +48,22 @@ def cli_runner(): @pytest.fixture(autouse=True) def mocked_responses(): + # Patch responses._handle_body to return a BytesIO subclass that properly + # signals "closed" to cachecontrol, enabling HTTP caching in tests. + original_handle_body = responses._handle_body + + def _patched_handle_body(body): + result = original_handle_body(body) + if isinstance(result, io.BytesIO): + return _CacheControlCompatibleBytesIO(result.getvalue()) + return result + + responses._handle_body = _patched_handle_body responses.start() yield responses.stop() responses.reset() + responses._handle_body = original_handle_body @pytest.fixture @@ -87,3 +128,12 @@ def refs_cache_dir(tmp_path): @pytest.fixture def downloads_cache_dir(tmp_path): return tmp_path / ".cache" / "check_jsonschema" / "downloads" + + +@pytest.fixture +def cacheable_headers(): + """Returns HTTP headers that enable cachecontrol caching.""" + return { + "Cache-Control": "max-age=31536000", + "Date": formatdate(usegmt=True), + }