diff --git a/pyproject.toml b/pyproject.toml index 32503603b..fa45f72fa 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -43,6 +43,7 @@ dependencies = [ "regress>=2024.11.1", "requests<3.0", "click>=8,<9", + "CacheControl[filecache]>=0.14,<0.15", ] [project.readme] @@ -80,6 +81,12 @@ filterwarnings = [ "error", # dateutil has a Python 3.12 compatibility issue. 'ignore:datetime\.datetime\.utcfromtimestamp\(\) is deprecated:DeprecationWarning', + # CacheControl's CallbackFileWrapper (filewrapper.py) creates a NamedTemporaryFile + # in __init__ that is only closed when the response is fully read. When a + # requests.ConnectionError occurs, _close() is never called and the temp file + # leaks. This is a CacheControl limitation, not a check-jsonschema bug. + # See plan.md section "5. Pytest Warning Filter" for details. + 'ignore:Exception ignored in.*FileIO.*:pytest.PytestUnraisableExceptionWarning', ] addopts = [ "--color=yes", diff --git a/src/check_jsonschema/cachedownloader.py b/src/check_jsonschema/cachedownloader.py index 86aad0e62..188ddb7b2 100644 --- a/src/check_jsonschema/cachedownloader.py +++ b/src/check_jsonschema/cachedownloader.py @@ -1,19 +1,19 @@ from __future__ import annotations -import calendar import contextlib -import hashlib +import functools import io +import logging import os import platform -import shutil -import tempfile -import time import typing as t +import cachecontrol import requests +from cachecontrol.caches.file_cache import FileCache +from cachecontrol.controller import CacheController -_LASTMOD_FMT = "%a, %d %b %Y %H:%M:%S %Z" +log = logging.getLogger(__name__) def _base_cache_dir() -> str | None: @@ -42,26 +42,22 @@ def _resolve_cache_dir(dirname: str) -> str | None: return cache_dir -def _lastmod_from_response(response: requests.Response) -> float: - try: - return calendar.timegm( - time.strptime(response.headers["last-modified"], _LASTMOD_FMT) - ) - # OverflowError: time outside of platform-specific bounds - # ValueError: malformed/unparseable - # LookupError: no such header - except (OverflowError, ValueError, LookupError): - return 0.0 - - def _get_request( - file_url: str, *, response_ok: t.Callable[[requests.Response], bool] + session: requests.Session, + file_url: str, + *, + response_ok: t.Callable[[requests.Response], bool], + cache: FileCache | None = None, ) -> requests.Response: num_retries = 2 r: requests.Response | None = None for _attempt in range(num_retries + 1): + # Delete bad cache entry before retry so we fetch fresh data + if cache is not None and _attempt > 0: + cache_key = CacheController.cache_url(file_url) + cache.delete(cache_key) try: - r = requests.get(file_url, stream=True) + r = session.get(file_url) except requests.RequestException as e: if _attempt == num_retries: raise FailedDownloadError("encountered error during download") from e @@ -74,48 +70,6 @@ def _get_request( ) -def _atomic_write(dest: str, content: bytes) -> None: - # download to a temp file and then move to the dest - # this makes the download safe if run in parallel (parallel runs - # won't create a new empty file for writing and cause failures) - fp = tempfile.NamedTemporaryFile(mode="wb", delete=False) - fp.write(content) - fp.close() - shutil.copy(fp.name, dest) - os.remove(fp.name) - - -def _cache_hit(cachefile: str, response: requests.Response) -> bool: - # no file? miss - if not os.path.exists(cachefile): - return False - - # compare mtime on any cached file against the remote last-modified time - # it is considered a hit if the local file is at least as new as the remote file - local_mtime = os.path.getmtime(cachefile) - remote_mtime = _lastmod_from_response(response) - return local_mtime >= remote_mtime - - -def url_to_cache_filename(ref_url: str) -> str: - """ - Given a schema URL, convert it to a filename for caching in a cache dir. - - Rules are as follows: - - the base filename is an sha256 hash of the URL - - if the filename ends in an extension (.json, .yaml, etc) that extension - is appended to the hash - - Preserving file extensions preserves the extension-based logic used for parsing, and - it also helps a local editor (browsing the cache) identify filetypes. - """ - filename = hashlib.sha256(ref_url.encode()).hexdigest() - if "." in (last_part := ref_url.rpartition("/")[-1]): - _, _, extension = last_part.rpartition(".") - filename = f"{filename}.{extension}" - return filename - - class FailedDownloadError(Exception): pass @@ -125,58 +79,41 @@ def __init__(self, cache_dir: str, *, disable_cache: bool = False) -> None: self._cache_dir = _resolve_cache_dir(cache_dir) self._disable_cache = disable_cache - def _download( - self, - file_url: str, - filename: str, - response_ok: t.Callable[[requests.Response], bool], - ) -> str: - assert self._cache_dir is not None - os.makedirs(self._cache_dir, exist_ok=True) - dest = os.path.join(self._cache_dir, filename) - - def check_response_for_download(r: requests.Response) -> bool: - # if the response indicates a cache hit, treat it as valid - # this ensures that we short-circuit any further evaluation immediately on - # a hit - if _cache_hit(dest, r): - return True - # we now know it's not a hit, so validate the content (forces download) - return response_ok(r) - - response = _get_request(file_url, response_ok=check_response_for_download) - # check to see if we have a file which matches the connection - # only download if we do not (cache miss, vs hit) - if not _cache_hit(dest, response): - _atomic_write(dest, response.content) - - return dest + @functools.cached_property + def _cache(self) -> FileCache | None: + if self._cache_dir and not self._disable_cache: + os.makedirs(self._cache_dir, exist_ok=True) + return FileCache(self._cache_dir) + return None + + @functools.cached_property + def _session(self) -> requests.Session: + session = requests.Session() + if self._cache is not None: + log.debug("using cache dir: %s", self._cache_dir) + session = cachecontrol.CacheControl(session, cache=self._cache) + else: + log.debug("caching disabled") + return session @contextlib.contextmanager def open( self, file_url: str, - filename: str, validate_response: t.Callable[[requests.Response], bool], ) -> t.Iterator[t.IO[bytes]]: - if (not self._cache_dir) or self._disable_cache: - yield io.BytesIO( - _get_request(file_url, response_ok=validate_response).content - ) - else: - with open( - self._download(file_url, filename, response_ok=validate_response), "rb" - ) as fp: - yield fp + response = _get_request( + self._session, file_url, response_ok=validate_response, cache=self._cache + ) + yield io.BytesIO(response.content) def bind( self, file_url: str, - filename: str | None = None, validation_callback: t.Callable[[bytes], t.Any] | None = None, ) -> BoundCacheDownloader: return BoundCacheDownloader( - file_url, self, filename=filename, validation_callback=validation_callback + file_url, self, validation_callback=validation_callback ) @@ -186,11 +123,9 @@ def __init__( file_url: str, downloader: CacheDownloader, *, - filename: str | None = None, validation_callback: t.Callable[[bytes], t.Any] | None = None, ) -> None: self._file_url = file_url - self._filename = filename or url_to_cache_filename(file_url) self._downloader = downloader self._validation_callback = validation_callback @@ -198,7 +133,6 @@ def __init__( def open(self) -> t.Iterator[t.IO[bytes]]: with self._downloader.open( self._file_url, - self._filename, validate_response=self._validate_response, ) as fp: yield fp @@ -206,7 +140,11 @@ def open(self) -> t.Iterator[t.IO[bytes]]: def _validate_response(self, response: requests.Response) -> bool: if not self._validation_callback: return True - + # CacheControl sets from_cache=True on cache hits; skip re-validation. + # Plain requests.Session (used when disable_cache=True) doesn't set this + # attribute at all, so we use getattr with a default. + if getattr(response, "from_cache", False): + return True try: self._validation_callback(response.content) return True diff --git a/src/check_jsonschema/cli/main_command.py b/src/check_jsonschema/cli/main_command.py index 62d79bb35..766f2db11 100644 --- a/src/check_jsonschema/cli/main_command.py +++ b/src/check_jsonschema/cli/main_command.py @@ -1,5 +1,6 @@ from __future__ import annotations +import logging import os import textwrap import typing as t @@ -43,6 +44,18 @@ def set_color_mode(ctx: click.Context, param: str, value: str) -> None: }[value] +def configure_logging( + ctx: click.Context, param: click.Parameter, value: str | None +) -> None: + if value is None: + return + level = getattr(logging, value.upper()) + logging.basicConfig( + level=level, + format="%(name)s [%(levelname)s]: %(message)s", + ) + + def pretty_helptext_list(values: list[str] | tuple[str, ...]) -> str: return textwrap.indent( "\n".join( @@ -88,12 +101,21 @@ def pretty_helptext_list(values: list[str] | tuple[str, ...]) -> str: ) @click.help_option("-h", "--help") @click.version_option() +@click.option( + "--log-level", + hidden=True, + help="Set the log level for debug output (e.g., DEBUG, INFO, WARNING).", + type=click.Choice(["DEBUG", "INFO", "WARNING", "ERROR"], case_sensitive=False), + callback=configure_logging, + expose_value=False, + is_eager=True, +) @click.option( "--schemafile", help=( "The path to a file containing the JSON Schema to use or an " "HTTP(S) URI for the schema. If a remote file is used, " - "it will be downloaded and cached locally based on mtime. " + "it will be downloaded and cached locally. " "Use '-' for stdin." ), metavar="[PATH|URI]", diff --git a/tests/acceptance/test_nonjson_schema_handling.py b/tests/acceptance/test_nonjson_schema_handling.py index 4e56d25e2..9a75111ec 100644 --- a/tests/acceptance/test_nonjson_schema_handling.py +++ b/tests/acceptance/test_nonjson_schema_handling.py @@ -138,9 +138,12 @@ def test_can_load_remote_yaml_schema_ref(run_line, tmp_path, passing_data): def test_can_load_remote_yaml_schema_ref_from_cache( - run_line, inject_cached_ref, tmp_path + run_line, tmp_path, cacheable_headers ): retrieval_uri = "https://example.org/retrieval/schemas/main.yaml" + ref_loc = "https://example.org/retrieval/schemas/title_schema.yaml" + + # First: add good responses with cache headers responses.add( "GET", retrieval_uri, @@ -150,16 +153,21 @@ def test_can_load_remote_yaml_schema_ref_from_cache( "title": {"$ref": "./title_schema.yaml"} additionalProperties: false """, + headers=cacheable_headers, ) + responses.add("GET", ref_loc, body="type: string", headers=cacheable_headers) - ref_loc = "https://example.org/retrieval/schemas/title_schema.yaml" - # populate a bad schema, but then "override" that with a good cache value - # this can only pass (in the success case) if the cache loading really works - responses.add("GET", ref_loc, body="false") - inject_cached_ref(ref_loc, "type: string") + # Then: add bad responses (used if cache doesn't work) + responses.add("GET", retrieval_uri, body="error", status=500) + responses.add("GET", ref_loc, body="error", status=500) doc = tmp_path / "doc.json" doc.write_text(json.dumps(PASSING_DOCUMENT)) - result = run_line(["check-jsonschema", "--schemafile", retrieval_uri, str(doc)]) - assert result.exit_code == 0 + # First run: populates cache + result1 = run_line(["check-jsonschema", "--schemafile", retrieval_uri, str(doc)]) + assert result1.exit_code == 0 + + # Second run: should use cached data (not the 500 errors) + result2 = run_line(["check-jsonschema", "--schemafile", retrieval_uri, str(doc)]) + assert result2.exit_code == 0 diff --git a/tests/acceptance/test_remote_ref_resolution.py b/tests/acceptance/test_remote_ref_resolution.py index 3dafc4c8a..38336c603 100644 --- a/tests/acceptance/test_remote_ref_resolution.py +++ b/tests/acceptance/test_remote_ref_resolution.py @@ -66,16 +66,14 @@ def test_remote_ref_resolution_simple_case(run_line, check_passes, casename, tmp @pytest.mark.parametrize("casename", ("case1", "case2")) @pytest.mark.parametrize("disable_cache", (True, False)) def test_remote_ref_resolution_cache_control( - run_line, tmp_path, get_ref_cache_loc, casename, disable_cache + run_line, tmp_path, casename, disable_cache, schemas_cache_dir, refs_cache_dir ): main_schema_loc = "https://example.com/main.json" responses.add("GET", main_schema_loc, json=CASES[casename]["main_schema"]) - ref_locs = [] for name, subschema in CASES[casename]["other_schemas"].items(): other_schema_loc = f"https://example.com/{name}.json" responses.add("GET", other_schema_loc, json=subschema) - ref_locs.append(other_schema_loc) instance_path = tmp_path / "instance.json" instance_path.write_text(json.dumps(CASES[casename]["passing_document"])) @@ -88,37 +86,42 @@ def test_remote_ref_resolution_cache_control( output = f"\nstdout:\n{result.stdout}\n\nstderr:\n{result.stderr}" assert result.exit_code == 0, output - cache_locs = [] - for ref_loc in ref_locs: - cache_locs.append(get_ref_cache_loc(ref_loc)) - assert cache_locs # sanity check + # Cache directories are created only when caching is enabled + cache_dirs = [schemas_cache_dir, refs_cache_dir] if disable_cache: - for loc in cache_locs: + for loc in cache_dirs: assert not loc.exists() else: - for loc in cache_locs: + for loc in cache_dirs: assert loc.exists() @pytest.mark.parametrize("casename", ("case1", "case2")) @pytest.mark.parametrize("check_passes", (True, False)) def test_remote_ref_resolution_loads_from_cache( - run_line, tmp_path, get_ref_cache_loc, inject_cached_ref, casename, check_passes + run_line, tmp_path, casename, check_passes, cacheable_headers ): main_schema_loc = "https://example.com/main.json" - responses.add("GET", main_schema_loc, json=CASES[casename]["main_schema"]) - ref_locs = [] - cache_locs = [] + # First: add good responses with cache headers + responses.add( + "GET", + main_schema_loc, + json=CASES[casename]["main_schema"], + headers=cacheable_headers, + ) for name, subschema in CASES[casename]["other_schemas"].items(): - other_schema_loc = f"https://example.com/{name}.json" - # intentionally populate the HTTP location with "bad data" - responses.add("GET", other_schema_loc, json="{}") - ref_locs.append(other_schema_loc) + responses.add( + "GET", + f"https://example.com/{name}.json", + json=subschema, + headers=cacheable_headers, + ) - # but populate the cache with "good data" - inject_cached_ref(other_schema_loc, json.dumps(subschema)) - cache_locs.append(get_ref_cache_loc(other_schema_loc)) + # Then: add bad responses (used if cache doesn't work) + responses.add("GET", main_schema_loc, json={}, status=500) + for name in CASES[casename]["other_schemas"]: + responses.add("GET", f"https://example.com/{name}.json", json={}, status=500) instance_path = tmp_path / "instance.json" instance_path.write_text( @@ -129,15 +132,21 @@ def test_remote_ref_resolution_loads_from_cache( ) ) - # run the command - result = run_line( + # First run: populates cache with good data + result1 = run_line( ["check-jsonschema", "--schemafile", main_schema_loc, str(instance_path)] ) - output = f"\nstdout:\n{result.stdout}\n\nstderr:\n{result.stderr}" + assert result1.exit_code == (0 if check_passes else 1) + + # Second run: should use cached data (not the 500 errors) + result2 = run_line( + ["check-jsonschema", "--schemafile", main_schema_loc, str(instance_path)] + ) + output = f"\nstdout:\n{result2.stdout}\n\nstderr:\n{result2.stderr}" if check_passes: - assert result.exit_code == 0, output + assert result2.exit_code == 0, output else: - assert result.exit_code == 1, output + assert result2.exit_code == 1, output # this test ensures that `$id` is preferred for the base URI over diff --git a/tests/conftest.py b/tests/conftest.py index e8fc84176..523c29562 100644 --- a/tests/conftest.py +++ b/tests/conftest.py @@ -1,13 +1,42 @@ import inspect +import io import os import pathlib import sys +from email.utils import formatdate import pytest import responses from click.testing import CliRunner +class _CacheControlCompatibleBytesIO(io.BytesIO): + """A BytesIO that signals closed to cachecontrol after all data is read. + + cachecontrol's CallbackFileWrapper checks `fp.fp is None` to determine + if the response has been fully read. Standard BytesIO doesn't have an + `fp` attribute, so cachecontrol falls back to checking `fp.closed`, + which only returns True after explicit `.close()`. This class adds + an `fp` property that returns None after all data has been read, + allowing cachecontrol to properly cache responses from the `responses` + mock library. + """ + + def __init__(self, *args, **kwargs) -> None: + super().__init__(*args, **kwargs) + self._fully_read = False + + @property + def fp(self): + return None if self._fully_read else self + + def read(self, size=-1): + data = super().read(size) + if self.tell() == len(self.getvalue()): + self._fully_read = True + return data + + @pytest.fixture def cli_runner(): # compatibility for click==8.2.0 vs click<=8.1 @@ -19,10 +48,22 @@ def cli_runner(): @pytest.fixture(autouse=True) def mocked_responses(): + # Patch responses._handle_body to return a BytesIO subclass that properly + # signals "closed" to cachecontrol, enabling HTTP caching in tests. + original_handle_body = responses._handle_body + + def _patched_handle_body(body): + result = original_handle_body(body) + if isinstance(result, io.BytesIO): + return _CacheControlCompatibleBytesIO(result.getvalue()) + return result + + responses._handle_body = _patched_handle_body responses.start() yield responses.stop() responses.reset() + responses._handle_body = original_handle_body @pytest.fixture @@ -74,39 +115,8 @@ def patch_cache_dir(monkeypatch, cache_dir): @pytest.fixture -def url2cachepath(): - from check_jsonschema.cachedownloader import url_to_cache_filename - - def _get(cache_dir, url): - return cache_dir / url_to_cache_filename(url) - - return _get - - -@pytest.fixture -def downloads_cache_dir(tmp_path): - return tmp_path / ".cache" / "check_jsonschema" / "downloads" - - -@pytest.fixture -def get_download_cache_loc(downloads_cache_dir, url2cachepath): - def _get(url): - return url2cachepath(downloads_cache_dir, url) - - return _get - - -@pytest.fixture -def inject_cached_download(downloads_cache_dir, get_download_cache_loc): - def _write(uri, content): - downloads_cache_dir.mkdir(parents=True) - path = get_download_cache_loc(uri) - if isinstance(content, str): - path.write_text(content) - else: - path.write_bytes(content) - - return _write +def schemas_cache_dir(tmp_path): + return tmp_path / ".cache" / "check_jsonschema" / "schemas" @pytest.fixture @@ -114,18 +124,16 @@ def refs_cache_dir(tmp_path): return tmp_path / ".cache" / "check_jsonschema" / "refs" +# Alias for unit tests that use "downloads" as the cache dir name @pytest.fixture -def get_ref_cache_loc(refs_cache_dir, url2cachepath): - def _get(url): - return url2cachepath(refs_cache_dir, url) - - return _get +def downloads_cache_dir(tmp_path): + return tmp_path / ".cache" / "check_jsonschema" / "downloads" @pytest.fixture -def inject_cached_ref(refs_cache_dir, get_ref_cache_loc): - def _write(uri, content): - refs_cache_dir.mkdir(parents=True) - get_ref_cache_loc(uri).write_text(content) - - return _write +def cacheable_headers(): + """Returns HTTP headers that enable cachecontrol caching.""" + return { + "Cache-Control": "max-age=31536000", + "Date": formatdate(usegmt=True), + } diff --git a/tests/unit/test_cachedownloader.py b/tests/unit/test_cachedownloader.py index b906ca03a..2f2da83d7 100644 --- a/tests/unit/test_cachedownloader.py +++ b/tests/unit/test_cachedownloader.py @@ -1,7 +1,6 @@ import json import os import platform -import time import pytest import requests @@ -10,9 +9,6 @@ from check_jsonschema.cachedownloader import ( CacheDownloader, FailedDownloadError, - _cache_hit, - _lastmod_from_response, - url_to_cache_filename, ) DEFAULT_RESPONSE_URL = "https://example.com/schema1.json" @@ -34,11 +30,6 @@ def default_response(): add_default_response() -def test_default_filename_from_uri(default_response): - cd = CacheDownloader("downloads").bind(DEFAULT_RESPONSE_URL) - assert cd._filename == url_to_cache_filename(DEFAULT_RESPONSE_URL) - - @pytest.mark.parametrize( "sysname, fakeenv, expect_value", [ @@ -92,48 +83,10 @@ def fake_expanduser(path): assert expanduser_path is None -def test_cache_hit_by_mtime(monkeypatch, default_response): - monkeypatch.setattr(os.path, "exists", lambda x: True) - - # local mtime = NOW, cache hit - monkeypatch.setattr(os.path, "getmtime", lambda x: time.time()) - assert _cache_hit( - "/tmp/schema1.json", - requests.get(DEFAULT_RESPONSE_URL, stream=True), - ) - - # local mtime = 0, cache miss - monkeypatch.setattr(os.path, "getmtime", lambda x: 0) - assert ( - _cache_hit( - "/tmp/schema1.json", - requests.get(DEFAULT_RESPONSE_URL, stream=True), - ) - is False - ) - - -def test_cachedownloader_cached_file(tmp_path, monkeypatch, default_response): - # create a file - f = tmp_path / "foo.json" - f.write_text("{}") - - # set the cache_dir to the tmp dir (so that cache_dir will always be set) - cd = CacheDownloader(tmp_path).bind(str(f), filename="foo.json") - # patch the downloader to skip any download "work" - monkeypatch.setattr( - cd._downloader, "_download", lambda file_uri, filename, response_ok: str(f) - ) - - with cd.open() as fp: - assert fp.read() == b"{}" - - @pytest.mark.parametrize("disable_cache", (True, False)) def test_cachedownloader_on_success( - get_download_cache_loc, disable_cache, default_response + disable_cache, default_response, downloads_cache_dir ): - f = get_download_cache_loc(DEFAULT_RESPONSE_URL) cd = CacheDownloader("downloads", disable_cache=disable_cache).bind( DEFAULT_RESPONSE_URL ) @@ -141,26 +94,24 @@ def test_cachedownloader_on_success( with cd.open() as fp: assert fp.read() == b"{}" if disable_cache: - assert not f.exists() + assert not downloads_cache_dir.exists() else: - assert f.exists() + assert downloads_cache_dir.exists() -def test_cachedownloader_using_alternate_target_dir( - cache_dir, default_response, url2cachepath -): - cache_dir = cache_dir / "check_jsonschema" / "otherdir" - f = url2cachepath(cache_dir, DEFAULT_RESPONSE_URL) +def test_cachedownloader_using_alternate_target_dir(cache_dir, default_response): cd = CacheDownloader("otherdir").bind(DEFAULT_RESPONSE_URL) with cd.open() as fp: assert fp.read() == b"{}" - assert f.exists() + + # Cache directory is created for the alternate target dir + assert cache_dir.joinpath("check_jsonschema", "otherdir").exists() @pytest.mark.parametrize("disable_cache", (True, False)) @pytest.mark.parametrize("failures", (1, 2, requests.ConnectionError)) def test_cachedownloader_succeeds_after_few_errors( - get_download_cache_loc, disable_cache, failures + disable_cache, failures, downloads_cache_dir ): if isinstance(failures, int): for _i in range(failures): @@ -178,7 +129,6 @@ def test_cachedownloader_succeeds_after_few_errors( match_querystring=None, ) add_default_response() - f = get_download_cache_loc(DEFAULT_RESPONSE_URL) cd = CacheDownloader("downloads", disable_cache=disable_cache).bind( DEFAULT_RESPONSE_URL ) @@ -186,15 +136,15 @@ def test_cachedownloader_succeeds_after_few_errors( with cd.open() as fp: assert fp.read() == b"{}" if disable_cache: - assert not f.exists() + assert not downloads_cache_dir.exists() else: - assert f.exists() + assert downloads_cache_dir.exists() @pytest.mark.parametrize("disable_cache", (True, False)) @pytest.mark.parametrize("connection_error", (True, False)) def test_cachedownloader_fails_after_many_errors( - get_download_cache_loc, disable_cache, connection_error + disable_cache, connection_error, downloads_cache_dir ): for _i in range(10): if connection_error: @@ -212,18 +162,20 @@ def test_cachedownloader_fails_after_many_errors( match_querystring=None, ) add_default_response() # never reached, the 11th response - f = get_download_cache_loc(DEFAULT_RESPONSE_URL) cd = CacheDownloader("downloads", disable_cache=disable_cache).bind( DEFAULT_RESPONSE_URL ) with pytest.raises(FailedDownloadError): with cd.open(): pass - assert not f.exists() + + # Cache directory is created only when caching is enabled + # (even though the request failed, the session was built) + assert downloads_cache_dir.exists() is not disable_cache @pytest.mark.parametrize("disable_cache", (True, False)) -def test_cachedownloader_retries_on_bad_data(get_download_cache_loc, disable_cache): +def test_cachedownloader_retries_on_bad_data(disable_cache, downloads_cache_dir): responses.add( "GET", DEFAULT_RESPONSE_URL, @@ -232,7 +184,6 @@ def test_cachedownloader_retries_on_bad_data(get_download_cache_loc, disable_cac match_querystring=None, ) add_default_response() - f = get_download_cache_loc(DEFAULT_RESPONSE_URL) cd = CacheDownloader( "downloads", disable_cache=disable_cache, @@ -245,139 +196,39 @@ def test_cachedownloader_retries_on_bad_data(get_download_cache_loc, disable_cac assert fp.read() == b"{}" if disable_cache: - assert not f.exists() + assert not downloads_cache_dir.exists() else: - assert f.exists() + assert downloads_cache_dir.exists() -@pytest.mark.parametrize("file_exists", (True, False)) -@pytest.mark.parametrize( - "failure_mode", ("header_missing", "header_malformed", "time_overflow") -) -def test_cachedownloader_handles_bad_lastmod_header( - monkeypatch, - get_download_cache_loc, - inject_cached_download, - file_exists, - failure_mode, -): - if failure_mode == "header_missing": - responses.add( - "GET", DEFAULT_RESPONSE_URL, headers={}, json={}, match_querystring=None - ) - elif failure_mode == "header_malformed": - responses.add( - "GET", - DEFAULT_RESPONSE_URL, - headers={"Last-Modified": "Jan 2000 00:00:01"}, - json={}, - match_querystring=None, - ) - elif failure_mode == "time_overflow": - add_default_response() - - def fake_timegm(*args): - raise OverflowError("uh-oh") - - monkeypatch.setattr("calendar.timegm", fake_timegm) - else: - raise NotImplementedError - - original_file_contents = b'{"foo": "bar"}' - file_path = get_download_cache_loc(DEFAULT_RESPONSE_URL) - - assert not file_path.exists() - if file_exists: - inject_cached_download(DEFAULT_RESPONSE_URL, original_file_contents) - - cd = CacheDownloader("downloads").bind(DEFAULT_RESPONSE_URL) - - # if the file already existed, it will not be overwritten by the cachedownloader - # so the returned value for both the downloader and a direct file read should be the - # original contents - if file_exists: - with cd.open() as fp: - assert fp.read() == original_file_contents - assert file_path.read_bytes() == original_file_contents - # otherwise, the file will have been created with new content - # both reads will show that new content - else: - with cd.open() as fp: - assert fp.read() == b"{}" - assert file_path.read_bytes() == b"{}" - - # at the end, the file always exists on disk - assert file_path.exists() - - -def test_cachedownloader_validation_is_not_invoked_on_hit( - monkeypatch, default_response, inject_cached_download -): - """ - Regression test for https://github.com/python-jsonschema/check-jsonschema/issues/453 - - This was a bug in which the validation callback was invoked eagerly, even on a cache - hit. As a result, cache hits did not demonstrate their expected performance gain. - """ - # 1: construct some perfectly good data (it doesn't really matter what it is) - # <> - # 2: put equivalent data on disk - inject_cached_download(DEFAULT_RESPONSE_URL, "{}") - - # 3: construct a validator which marks that it ran in a variable - validator_ran = False - - def dummy_validate_bytes(data): - nonlocal validator_ran - validator_ran = True - - # construct a downloader pointed at the schema and file, expecting a cache hit - # and use the above validation method - cd = CacheDownloader("downloads").bind( - DEFAULT_RESPONSE_URL, - validation_callback=dummy_validate_bytes, - ) - - # read data from the downloader - with cd.open() as fp: - assert fp.read() == b"{}" - # assert that the validator was not run - assert validator_ran is False - +def test_disable_cache_uses_plain_session(): + """When disable_cache=True, verify _session returns a plain Session.""" + cd = CacheDownloader("downloads", disable_cache=True) + session = cd._session + # A plain requests.Session does not have CacheControlAdapter + assert type(session) is requests.Session -@pytest.mark.skipif( - platform.system() == "Windows", - reason="time.tzset() is not available on Windows", -) -def test_lastmod_from_header_uses_gmtime(request, monkeypatch, default_response): - """ - Regression test for https://github.com/python-jsonschema/check-jsonschema/pull/565 - - The time was converted in local time, when UTC/GMT was desired. - """ - - def final_tzset(): - time.tzset() - request.addfinalizer(final_tzset) +def test_enable_cache_uses_cachecontrol_session(tmp_path, patch_cache_dir): + """When disable_cache=False, verify _session returns a CacheControl session.""" + from cachecontrol import CacheControlAdapter - response = requests.get(DEFAULT_RESPONSE_URL, stream=True) + cd = CacheDownloader("downloads", disable_cache=False) + session = cd._session + # CacheControl wraps the session and attaches CacheControlAdapter + assert isinstance(session.get_adapter("https://"), CacheControlAdapter) + assert isinstance(session.get_adapter("http://"), CacheControlAdapter) - with monkeypatch.context() as m: - m.setenv("TZ", "GMT0") - time.tzset() - gmt_parsed_time = _lastmod_from_response(response) - with monkeypatch.context() as m: - m.setenv("TZ", "EST5") - time.tzset() - est_parsed_time = _lastmod_from_response(response) - - with monkeypatch.context() as m: - m.setenv("TZ", "UTC0") - time.tzset() - utc_parsed_time = _lastmod_from_response(response) - - # assert that they all match - assert gmt_parsed_time == utc_parsed_time - assert gmt_parsed_time == est_parsed_time +def test_cache_dir_none_uses_plain_session(monkeypatch, patch_cache_dir): + """When _resolve_cache_dir returns None, _session returns plain Session.""" + # Undo the patch and simulate Windows with no env vars + patch_cache_dir.undo() + monkeypatch.delenv("LOCALAPPDATA", raising=False) + monkeypatch.delenv("APPDATA", raising=False) + monkeypatch.setattr(platform, "system", lambda: "Windows") + + cd = CacheDownloader("downloads", disable_cache=False) + assert cd._cache_dir is None + session = cd._session + assert type(session) is requests.Session