Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
7 changes: 7 additions & 0 deletions pyproject.toml
Original file line number Diff line number Diff line change
Expand Up @@ -43,6 +43,7 @@ dependencies = [
"regress>=2024.11.1",
"requests<3.0",
"click>=8,<9",
"CacheControl[filecache]>=0.14,<0.15",
]

[project.readme]
Expand Down Expand Up @@ -80,6 +81,12 @@ filterwarnings = [
"error",
# dateutil has a Python 3.12 compatibility issue.
'ignore:datetime\.datetime\.utcfromtimestamp\(\) is deprecated:DeprecationWarning',
# CacheControl's CallbackFileWrapper (filewrapper.py) creates a NamedTemporaryFile
# in __init__ that is only closed when the response is fully read. When a
# requests.ConnectionError occurs, _close() is never called and the temp file
# leaks. This is a CacheControl limitation, not a check-jsonschema bug.
# See plan.md section "5. Pytest Warning Filter" for details.
'ignore:Exception ignored in.*FileIO.*:pytest.PytestUnraisableExceptionWarning',
]
addopts = [
"--color=yes",
Expand Down
146 changes: 42 additions & 104 deletions src/check_jsonschema/cachedownloader.py
Original file line number Diff line number Diff line change
@@ -1,19 +1,19 @@
from __future__ import annotations

import calendar
import contextlib
import hashlib
import functools
import io
import logging
import os
import platform
import shutil
import tempfile
import time
import typing as t

import cachecontrol
import requests
from cachecontrol.caches.file_cache import FileCache
from cachecontrol.controller import CacheController

_LASTMOD_FMT = "%a, %d %b %Y %H:%M:%S %Z"
log = logging.getLogger(__name__)


def _base_cache_dir() -> str | None:
Expand Down Expand Up @@ -42,26 +42,22 @@ def _resolve_cache_dir(dirname: str) -> str | None:
return cache_dir


def _lastmod_from_response(response: requests.Response) -> float:
    """
    Convert the response's ``last-modified`` header into seconds since the epoch.

    The header is parsed with ``_LASTMOD_FMT`` and converted via
    ``calendar.timegm``. If the header cannot be used, ``0.0`` is returned.
    """
    try:
        header_value = response.headers["last-modified"]
        parsed_time = time.strptime(header_value, _LASTMOD_FMT)
        return calendar.timegm(parsed_time)
    except (
        OverflowError,  # time outside of platform-specific bounds
        ValueError,  # malformed/unparseable
        LookupError,  # no such header
    ):
        return 0.0


def _get_request(
file_url: str, *, response_ok: t.Callable[[requests.Response], bool]
session: requests.Session,
file_url: str,
*,
response_ok: t.Callable[[requests.Response], bool],
cache: FileCache | None = None,
) -> requests.Response:
num_retries = 2
r: requests.Response | None = None
for _attempt in range(num_retries + 1):
# Delete bad cache entry before retry so we fetch fresh data
if cache is not None and _attempt > 0:
cache_key = CacheController.cache_url(file_url)
cache.delete(cache_key)
try:
r = requests.get(file_url, stream=True)
r = session.get(file_url)
except requests.RequestException as e:
if _attempt == num_retries:
raise FailedDownloadError("encountered error during download") from e
Expand All @@ -74,48 +70,6 @@ def _get_request(
)


def _atomic_write(dest: str, content: bytes) -> None:
    """
    Write ``content`` to ``dest`` so that ``dest`` is never seen half-written.

    The bytes are first written to a temporary file and that file is then
    moved onto ``dest`` with ``os.replace``. This keeps the download safe when
    run in parallel: concurrent runs never open ``dest`` itself for writing,
    so they cannot observe (or create) an empty or truncated file.

    :param dest: path of the file to create or overwrite
    :param content: the full payload to store
    :raises OSError: if the temporary file cannot be written or moved
    """
    # the temp file must live next to the destination: a rename is only
    # atomic (and only guaranteed to work) within a single filesystem
    dest_dir = os.path.dirname(dest) or "."
    fp = tempfile.NamedTemporaryFile(mode="wb", dir=dest_dir, delete=False)
    try:
        with fp:
            fp.write(content)
        os.replace(fp.name, dest)
    except BaseException:
        # do not leave the temp file behind when the write or move fails;
        # cleanup is best-effort and the original error is what gets raised
        try:
            os.remove(fp.name)
        except OSError:
            pass
        raise


def _cache_hit(cachefile: str, response: requests.Response) -> bool:
    """
    Report whether ``cachefile`` can be used in place of downloading ``response``.

    A missing file is always a miss. Otherwise the file's mtime is compared
    against the remote last-modified time, and it is a hit when the local file
    is at least as new as the remote one.
    """
    if os.path.exists(cachefile):
        return os.path.getmtime(cachefile) >= _lastmod_from_response(response)
    # no file? miss
    return False


def url_to_cache_filename(ref_url: str) -> str:
    """
    Given a schema URL, convert it to a filename for caching in a cache dir.

    Rules are as follows:
    - the base filename is an sha256 hash of the full URL (query string and
      fragment included, so distinct URLs get distinct names)
    - if the last path segment ends in an extension (.json, .yaml, etc) that
      extension is appended to the hash
    - any query string or fragment is ignored when looking for the extension,
      so ``main.json?v=1`` yields ``<hash>.json`` rather than ``<hash>.json?v=1``

    Preserving file extensions preserves the extension-based logic used for parsing, and
    it also helps a local editor (browsing the cache) identify filetypes.

    :param ref_url: the URL of the schema being cached
    :returns: the filename (not a full path) to use inside the cache dir
    """
    filename = hashlib.sha256(ref_url.encode()).hexdigest()

    # strip "#fragment" and "?query" so they cannot leak into the extension
    url_without_suffixes = ref_url.partition("#")[0].partition("?")[0]
    last_part = url_without_suffixes.rpartition("/")[-1]

    if "." in last_part:
        extension = last_part.rpartition(".")[-1]
        filename = f"{filename}.{extension}"
    return filename


class FailedDownloadError(Exception):
pass

Expand All @@ -125,58 +79,41 @@ def __init__(self, cache_dir: str, *, disable_cache: bool = False) -> None:
self._cache_dir = _resolve_cache_dir(cache_dir)
self._disable_cache = disable_cache

def _download(
self,
file_url: str,
filename: str,
response_ok: t.Callable[[requests.Response], bool],
) -> str:
assert self._cache_dir is not None
os.makedirs(self._cache_dir, exist_ok=True)
dest = os.path.join(self._cache_dir, filename)

def check_response_for_download(r: requests.Response) -> bool:
# if the response indicates a cache hit, treat it as valid
# this ensures that we short-circuit any further evaluation immediately on
# a hit
if _cache_hit(dest, r):
return True
# we now know it's not a hit, so validate the content (forces download)
return response_ok(r)

response = _get_request(file_url, response_ok=check_response_for_download)
# check to see if we have a file which matches the connection
# only download if we do not (cache miss, vs hit)
if not _cache_hit(dest, response):
_atomic_write(dest, response.content)

return dest
@functools.cached_property
def _cache(self) -> FileCache | None:
if self._cache_dir and not self._disable_cache:
os.makedirs(self._cache_dir, exist_ok=True)
return FileCache(self._cache_dir)
return None

@functools.cached_property
def _session(self) -> requests.Session:
session = requests.Session()
if self._cache is not None:
log.debug("using cache dir: %s", self._cache_dir)
session = cachecontrol.CacheControl(session, cache=self._cache)
else:
log.debug("caching disabled")
return session

@contextlib.contextmanager
def open(
self,
file_url: str,
filename: str,
validate_response: t.Callable[[requests.Response], bool],
) -> t.Iterator[t.IO[bytes]]:
if (not self._cache_dir) or self._disable_cache:
yield io.BytesIO(
_get_request(file_url, response_ok=validate_response).content
)
else:
with open(
self._download(file_url, filename, response_ok=validate_response), "rb"
) as fp:
yield fp
response = _get_request(
self._session, file_url, response_ok=validate_response, cache=self._cache
)
yield io.BytesIO(response.content)

def bind(
self,
file_url: str,
filename: str | None = None,
validation_callback: t.Callable[[bytes], t.Any] | None = None,
) -> BoundCacheDownloader:
return BoundCacheDownloader(
file_url, self, filename=filename, validation_callback=validation_callback
file_url, self, validation_callback=validation_callback
)


Expand All @@ -186,27 +123,28 @@ def __init__(
file_url: str,
downloader: CacheDownloader,
*,
filename: str | None = None,
validation_callback: t.Callable[[bytes], t.Any] | None = None,
) -> None:
self._file_url = file_url
self._filename = filename or url_to_cache_filename(file_url)
self._downloader = downloader
self._validation_callback = validation_callback

@contextlib.contextmanager
def open(self) -> t.Iterator[t.IO[bytes]]:
with self._downloader.open(
self._file_url,
self._filename,
validate_response=self._validate_response,
) as fp:
yield fp

def _validate_response(self, response: requests.Response) -> bool:
if not self._validation_callback:
return True

# CacheControl sets from_cache=True on cache hits; skip re-validation.
# Plain requests.Session (used when disable_cache=True) doesn't set this
# attribute at all, so we use getattr with a default.
if getattr(response, "from_cache", False):
return True
Copy link
Copy Markdown
Member

@sirosen sirosen Apr 11, 2026

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

This makes me nervous, and relates to my note above about bad data getting into the cache. Now the question I have is: how do we get bad data out of the cache?

There's only one bit of code which calls the CacheDownloader in practice.
In that code, the validation callback is "parse this data with our available parsers".
You can see it here, in the HttpSchemaReader.

Parsing is actually a dynamic problem! JSON5 parsers are used if available, but not if they aren't. And YAML parsing with ruamel.yaml is generally stable, but if we switched to ryaml or another implementation, that could change what's considered valid.
Also tomllib can be used, and tomllib on Python 3.15+ will support TOML 1.1 while earlier versions support TOML 1.0. 😵

In general, I think we need to be prepared for the possibility that data which passed validation yesterday might not be valid today.


I'm not quite sure what to do on this front. We can remove this from_cache check, but I think the underlying problem will remain, because we can't clear the cache imperatively in response to a failure.

I think we may need the ability to do that. It would mean reaching "down the stack", to session.adapters["https://"].cache.delete(). It's not documented publicly, but it seems too important to pass up.

I'm also going to ask on the PyPA discord (I see some of the (past?) maintainers of cachecontrol are people I know from over there) to see if anyone can confirm that this is okay to do.

Copy link
Copy Markdown
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Okay, I got two pieces of information from a quick exchange with one of the maintainers.

  1. cachecontrol is maintained but not getting new features -- they're not recommending it necessarily for new work
  2. BaseCache.delete() is considered part of the public API, so if we hold a reference to the cache object, we can safely call delete() on it as needed!

(2) means that this has a pretty simple resolution. Use cache.delete() whenever there's bad data in the cache. Yay! 😄

(1) is a suggestion, but I think cachecontrol is a pretty good fit here. And it's stable and battle-tested. If I come to regret the decision, I'll either try to get involved in cachecontrol maintenance (it's the kind of web programming I enjoy), or else swap out for something like requests-cache.

Copy link
Copy Markdown
Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

  1. oof...
  2. yay!

i'll continue to work on this to handle your comments at least. but, i can also make a similar pr for requests-cache if you would like to be able to compare them. with this pr as reference i'm guessing it wouldn't take much time. if nothing else, it would be a useful reference later in case a switch might happen.

Copy link
Copy Markdown
Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Implemented cache.delete() as you suggested. On retry attempts, we now explicitly delete the cache entry using CacheController.cache_url() to get the normalized key, then call cache.delete(). This replaces the no-cache header workaround.

The FileCache instance is stored as a @functools.cached_property on CacheDownloader (following the project's lazy initialization pattern) and passed to _get_request().

a2655eb

try:
self._validation_callback(response.content)
return True
Expand Down
24 changes: 23 additions & 1 deletion src/check_jsonschema/cli/main_command.py
Original file line number Diff line number Diff line change
@@ -1,5 +1,6 @@
from __future__ import annotations

import logging
import os
import textwrap
import typing as t
Expand Down Expand Up @@ -43,6 +44,18 @@ def set_color_mode(ctx: click.Context, param: str, value: str) -> None:
}[value]


def configure_logging(
    ctx: click.Context, param: click.Parameter, value: str | None
) -> None:
    """
    Click option callback which sets up Python logging from a level name.

    When no value was given, logging is left untouched. Otherwise the name is
    upper-cased, looked up as an attribute of the ``logging`` module, and
    passed to ``logging.basicConfig`` together with the message format.
    """
    if value is not None:
        selected_level = getattr(logging, value.upper())
        logging.basicConfig(
            format="%(name)s [%(levelname)s]: %(message)s",
            level=selected_level,
        )


def pretty_helptext_list(values: list[str] | tuple[str, ...]) -> str:
return textwrap.indent(
"\n".join(
Expand Down Expand Up @@ -88,12 +101,21 @@ def pretty_helptext_list(values: list[str] | tuple[str, ...]) -> str:
)
@click.help_option("-h", "--help")
@click.version_option()
@click.option(
"--log-level",
hidden=True,
help="Set the log level for debug output (e.g., DEBUG, INFO, WARNING).",
type=click.Choice(["DEBUG", "INFO", "WARNING", "ERROR"], case_sensitive=False),
callback=configure_logging,
expose_value=False,
is_eager=True,
)
Copy link
Copy Markdown
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I don't mind having an option for setting Python log-level, but I think we should make it hidden=True.
I worry about overwhelming users.

I've already exposed possibly a couple too many knobs for logging and output control -- we have -v, -q, and --traceback-mode. Plus, somewhat-orthogonal to those, --output-format and --color.

Copy link
Copy Markdown
Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Done — added hidden=True to the option.

4e3a59e

@click.option(
"--schemafile",
help=(
"The path to a file containing the JSON Schema to use or an "
"HTTP(S) URI for the schema. If a remote file is used, "
"it will be downloaded and cached locally based on mtime. "
"it will be downloaded and cached locally. "
"Use '-' for stdin."
),
metavar="[PATH|URI]",
Expand Down
24 changes: 16 additions & 8 deletions tests/acceptance/test_nonjson_schema_handling.py
Original file line number Diff line number Diff line change
Expand Up @@ -138,9 +138,12 @@ def test_can_load_remote_yaml_schema_ref(run_line, tmp_path, passing_data):


def test_can_load_remote_yaml_schema_ref_from_cache(
run_line, inject_cached_ref, tmp_path
run_line, tmp_path, cacheable_headers
):
retrieval_uri = "https://example.org/retrieval/schemas/main.yaml"
ref_loc = "https://example.org/retrieval/schemas/title_schema.yaml"

# First: add good responses with cache headers
responses.add(
"GET",
retrieval_uri,
Expand All @@ -150,16 +153,21 @@ def test_can_load_remote_yaml_schema_ref_from_cache(
"title": {"$ref": "./title_schema.yaml"}
additionalProperties: false
""",
headers=cacheable_headers,
)
responses.add("GET", ref_loc, body="type: string", headers=cacheable_headers)

ref_loc = "https://example.org/retrieval/schemas/title_schema.yaml"
# populate a bad schema, but then "override" that with a good cache value
# this can only pass (in the success case) if the cache loading really works
responses.add("GET", ref_loc, body="false")
inject_cached_ref(ref_loc, "type: string")
# Then: add bad responses (used if cache doesn't work)
responses.add("GET", retrieval_uri, body="error", status=500)
responses.add("GET", ref_loc, body="error", status=500)

doc = tmp_path / "doc.json"
doc.write_text(json.dumps(PASSING_DOCUMENT))

result = run_line(["check-jsonschema", "--schemafile", retrieval_uri, str(doc)])
assert result.exit_code == 0
# First run: populates cache
result1 = run_line(["check-jsonschema", "--schemafile", retrieval_uri, str(doc)])
assert result1.exit_code == 0

# Second run: should use cached data (not the 500 errors)
result2 = run_line(["check-jsonschema", "--schemafile", retrieval_uri, str(doc)])
assert result2.exit_code == 0
Loading