python-jsonschema · altendky · Mar 27, 2026 · Apr 14, 2026 · Apr 14, 2026 · Apr 14, 2026
diff --git a/pyproject.toml b/pyproject.toml
@@ -43,6 +43,7 @@ dependencies = [
     "regress>=2024.11.1",
     "requests<3.0",
     "click>=8,<9",
+    "CacheControl[filecache]>=0.14,<0.15",
 ]
 
 [project.readme]
@@ -80,6 +81,12 @@ filterwarnings = [
     "error",
     # dateutil has a Python 3.12 compatibility issue.
     'ignore:datetime\.datetime\.utcfromtimestamp\(\) is deprecated:DeprecationWarning',
+    # CacheControl's CallbackFileWrapper (filewrapper.py) creates a NamedTemporaryFile
+    # in __init__ that is only closed when the response is fully read. When a
+    # requests.ConnectionError occurs, _close() is never called and the temp file
+    # leaks. This is a CacheControl limitation, not a check-jsonschema bug.
+    # See plan.md section "5. Pytest Warning Filter" for details.
+    'ignore:Exception ignored in.*FileIO.*:pytest.PytestUnraisableExceptionWarning',
 ]
 addopts = [
     "--color=yes",

diff --git a/src/check_jsonschema/cachedownloader.py b/src/check_jsonschema/cachedownloader.py
@@ -1,19 +1,19 @@
 from __future__ import annotations
 
-import calendar
 import contextlib
-import hashlib
+import functools
 import io
+import logging
 import os
 import platform
-import shutil
-import tempfile
-import time
 import typing as t
 
+import cachecontrol
 import requests
+from cachecontrol.caches.file_cache import FileCache
+from cachecontrol.controller import CacheController
 
-_LASTMOD_FMT = "%a, %d %b %Y %H:%M:%S %Z"
+log = logging.getLogger(__name__)
 
 
 def _base_cache_dir() -> str | None:
@@ -42,26 +42,22 @@ def _resolve_cache_dir(dirname: str) -> str | None:
     return cache_dir
 
 
-def _lastmod_from_response(response: requests.Response) -> float:
-    try:
-        return calendar.timegm(
-            time.strptime(response.headers["last-modified"], _LASTMOD_FMT)
-        )
-    # OverflowError: time outside of platform-specific bounds
-    # ValueError: malformed/unparseable
-    # LookupError: no such header
-    except (OverflowError, ValueError, LookupError):
-        return 0.0
-
-
 def _get_request(
-    file_url: str, *, response_ok: t.Callable[[requests.Response], bool]
+    session: requests.Session,
+    file_url: str,
+    *,
+    response_ok: t.Callable[[requests.Response], bool],
+    cache: FileCache | None = None,
 ) -> requests.Response:
     num_retries = 2
     r: requests.Response | None = None
     for _attempt in range(num_retries + 1):
+        # Before a retry, delete any cached entry for this URL so we fetch fresh data
+        if cache is not None and _attempt > 0:
+            cache_key = CacheController.cache_url(file_url)
+            cache.delete(cache_key)
         try:
-            r = requests.get(file_url, stream=True)
+            r = session.get(file_url)
         except requests.RequestException as e:
             if _attempt == num_retries:
                 raise FailedDownloadError("encountered error during download") from e
@@ -74,48 +70,6 @@ def _get_request(
     )
 
 
-def _atomic_write(dest: str, content: bytes) -> None:
-    # download to a temp file and then move to the dest
-    # this makes the download safe if run in parallel (parallel runs
-    # won't create a new empty file for writing and cause failures)
-    fp = tempfile.NamedTemporaryFile(mode="wb", delete=False)
-    fp.write(content)
-    fp.close()
-    shutil.copy(fp.name, dest)
-    os.remove(fp.name)
-
-
-def _cache_hit(cachefile: str, response: requests.Response) -> bool:
-    # no file? miss
-    if not os.path.exists(cachefile):
-        return False
-
-    # compare mtime on any cached file against the remote last-modified time
-    # it is considered a hit if the local file is at least as new as the remote file
-    local_mtime = os.path.getmtime(cachefile)
-    remote_mtime = _lastmod_from_response(response)
-    return local_mtime >= remote_mtime
-
-
-def url_to_cache_filename(ref_url: str) -> str:
-    """
-    Given a schema URL, convert it to a filename for caching in a cache dir.
-
-    Rules are as follows:
-    - the base filename is an sha256 hash of the URL
-    - if the filename ends in an extension (.json, .yaml, etc) that extension
-      is appended to the hash
-
-    Preserving file extensions preserves the extension-based logic used for parsing, and
-    it also helps a local editor (browsing the cache) identify filetypes.
-    """
-    filename = hashlib.sha256(ref_url.encode()).hexdigest()
-    if "." in (last_part := ref_url.rpartition("/")[-1]):
-        _, _, extension = last_part.rpartition(".")
-        filename = f"{filename}.{extension}"
-    return filename
-
-
 class FailedDownloadError(Exception):
     pass
 
@@ -125,58 +79,41 @@ def __init__(self, cache_dir: str, *, disable_cache: bool = False) -> None:
         self._cache_dir = _resolve_cache_dir(cache_dir)
         self._disable_cache = disable_cache
 
-    def _download(
-        self,
-        file_url: str,
-        filename: str,
-        response_ok: t.Callable[[requests.Response], bool],
-    ) -> str:
-        assert self._cache_dir is not None
-        os.makedirs(self._cache_dir, exist_ok=True)
-        dest = os.path.join(self._cache_dir, filename)
-
-        def check_response_for_download(r: requests.Response) -> bool:
-            # if the response indicates a cache hit, treat it as valid
-            # this ensures that we short-circuit any further evaluation immediately on
-            # a hit
-            if _cache_hit(dest, r):
-                return True
-            # we now know it's not a hit, so validate the content (forces download)
-            return response_ok(r)
-
-        response = _get_request(file_url, response_ok=check_response_for_download)
-        # check to see if we have a file which matches the connection
-        # only download if we do not (cache miss, vs hit)
-        if not _cache_hit(dest, response):
-            _atomic_write(dest, response.content)
-
-        return dest
+    @functools.cached_property
+    def _cache(self) -> FileCache | None:
+        if self._cache_dir and not self._disable_cache:
+            os.makedirs(self._cache_dir, exist_ok=True)
+            return FileCache(self._cache_dir)
+        return None
+
+    @functools.cached_property
+    def _session(self) -> requests.Session:
+        session = requests.Session()
+        if self._cache is not None:
+            log.debug("using cache dir: %s", self._cache_dir)
+            session = cachecontrol.CacheControl(session, cache=self._cache)
+        else:
+            log.debug("caching disabled")
+        return session
 
     @contextlib.contextmanager
     def open(
         self,
         file_url: str,
-        filename: str,
         validate_response: t.Callable[[requests.Response], bool],
     ) -> t.Iterator[t.IO[bytes]]:
-        if (not self._cache_dir) or self._disable_cache:
-            yield io.BytesIO(
-                _get_request(file_url, response_ok=validate_response).content
-            )
-        else:
-            with open(
-                self._download(file_url, filename, response_ok=validate_response), "rb"
-            ) as fp:
-                yield fp
+        response = _get_request(
+            self._session, file_url, response_ok=validate_response, cache=self._cache
+        )
+        yield io.BytesIO(response.content)
 
     def bind(
         self,
         file_url: str,
-        filename: str | None = None,
         validation_callback: t.Callable[[bytes], t.Any] | None = None,
     ) -> BoundCacheDownloader:
         return BoundCacheDownloader(
-            file_url, self, filename=filename, validation_callback=validation_callback
+            file_url, self, validation_callback=validation_callback
         )
 
 
@@ -186,27 +123,28 @@ def __init__(
         file_url: str,
         downloader: CacheDownloader,
         *,
-        filename: str | None = None,
         validation_callback: t.Callable[[bytes], t.Any] | None = None,
     ) -> None:
         self._file_url = file_url
-        self._filename = filename or url_to_cache_filename(file_url)
         self._downloader = downloader
         self._validation_callback = validation_callback
 
     @contextlib.contextmanager
     def open(self) -> t.Iterator[t.IO[bytes]]:
         with self._downloader.open(
             self._file_url,
-            self._filename,
             validate_response=self._validate_response,
         ) as fp:
             yield fp
 
     def _validate_response(self, response: requests.Response) -> bool:
         if not self._validation_callback:
             return True
-
+        # CacheControl sets from_cache=True on cache hits; skip re-validation.
+        # A plain requests.Session (used when caching is disabled or no cache
+        # dir is available) never sets this attribute, hence getattr's default.
+        if getattr(response, "from_cache", False):
+            return True
         try:
             self._validation_callback(response.content)
             return True

diff --git a/src/check_jsonschema/cli/main_command.py b/src/check_jsonschema/cli/main_command.py
@@ -1,5 +1,6 @@
 from __future__ import annotations
 
+import logging
 import os
 import textwrap
 import typing as t
@@ -43,6 +44,18 @@ def set_color_mode(ctx: click.Context, param: str, value: str) -> None:
         }[value]
 
 
+def configure_logging(
+    ctx: click.Context, param: click.Parameter, value: str | None
+) -> None:
+    if value is None:
+        return
+    level = getattr(logging, value.upper())
+    logging.basicConfig(
+        level=level,
+        format="%(name)s [%(levelname)s]: %(message)s",
+    )
+
+
 def pretty_helptext_list(values: list[str] | tuple[str, ...]) -> str:
     return textwrap.indent(
         "\n".join(
@@ -88,12 +101,21 @@ def pretty_helptext_list(values: list[str] | tuple[str, ...]) -> str:
 )
 @click.help_option("-h", "--help")
 @click.version_option()
+@click.option(
+    "--log-level",
+    hidden=True,
+    help="Set the log level for debug output (e.g., DEBUG, INFO, WARNING).",
+    type=click.Choice(["DEBUG", "INFO", "WARNING", "ERROR"], case_sensitive=False),
+    callback=configure_logging,
+    expose_value=False,
+    is_eager=True,
+)
 @click.option(
     "--schemafile",
     help=(
         "The path to a file containing the JSON Schema to use or an "
         "HTTP(S) URI for the schema. If a remote file is used, "
-        "it will be downloaded and cached locally based on mtime. "
+        "it will be downloaded and cached locally. "
         "Use '-' for stdin."
     ),
     metavar="[PATH|URI]",

diff --git a/tests/acceptance/test_nonjson_schema_handling.py b/tests/acceptance/test_nonjson_schema_handling.py
@@ -138,9 +138,12 @@ def test_can_load_remote_yaml_schema_ref(run_line, tmp_path, passing_data):
 
 
 def test_can_load_remote_yaml_schema_ref_from_cache(
-    run_line, inject_cached_ref, tmp_path
+    run_line, tmp_path, cacheable_headers
 ):
     retrieval_uri = "https://example.org/retrieval/schemas/main.yaml"
+    ref_loc = "https://example.org/retrieval/schemas/title_schema.yaml"
+
+    # First: add good responses with cache headers
     responses.add(
         "GET",
         retrieval_uri,
@@ -150,16 +153,21 @@ def test_can_load_remote_yaml_schema_ref_from_cache(
   "title": {"$ref": "./title_schema.yaml"}
 additionalProperties: false
 """,
+        headers=cacheable_headers,
     )
+    responses.add("GET", ref_loc, body="type: string", headers=cacheable_headers)
 
-    ref_loc = "https://example.org/retrieval/schemas/title_schema.yaml"
-    # populate a bad schema, but then "override" that with a good cache value
-    # this can only pass (in the success case) if the cache loading really works
-    responses.add("GET", ref_loc, body="false")
-    inject_cached_ref(ref_loc, "type: string")
+    # Then: add bad responses (used if cache doesn't work)
+    responses.add("GET", retrieval_uri, body="error", status=500)
+    responses.add("GET", ref_loc, body="error", status=500)
 
     doc = tmp_path / "doc.json"
     doc.write_text(json.dumps(PASSING_DOCUMENT))
 
-    result = run_line(["check-jsonschema", "--schemafile", retrieval_uri, str(doc)])
-    assert result.exit_code == 0
+    # First run: populates cache
+    result1 = run_line(["check-jsonschema", "--schemafile", retrieval_uri, str(doc)])
+    assert result1.exit_code == 0
+
+    # Second run: should use cached data (not the 500 errors)
+    result2 = run_line(["check-jsonschema", "--schemafile", retrieval_uri, str(doc)])
+    assert result2.exit_code == 0