Skip to content

Commit 52a5a7e

Browse files
committed
feat(core): brotli-compress .socket.facts.json on full-scan upload
Compress the reachability facts file to a `.socket.facts.json.br` multipart part before uploading it as part of a full scan. The Socket API transparently decompresses parts named exactly `.socket.facts.json.br` and stores plain JSON, so the stored result is unchanged while the on-the-wire payload shrinks by roughly 10-40x for typical facts files.

This keeps large tier-1 reachability facts files under the API's per-file upload size cap. Previously an oversized facts file made the full-scan upload fail (surfaced as an HTTP 4xx/502 with the scan stuck and no report produced).

- Compress at the upload boundary (Core.create_full_scan); the on-disk file is left untouched so local consumers still read plain .socket.facts.json.
- Only files whose basename is exactly .socket.facts.json are compressed (the API matches that exact name); a custom --reach-output-file name and empty placeholder files are left as plain uploads.
- Stream in 1 MiB chunks so large files aren't held fully in memory.
- Never blocks an upload: any compression failure falls back to the plain file, and a partially-written .socket.facts.json.br is removed rather than left behind in the target directory.
- Add brotli (CPython) / brotlicffi (PyPy) dependency.
1 parent cdd3bf6 commit 52a5a7e

6 files changed

Lines changed: 405 additions & 5 deletions

File tree

CHANGELOG.md

Lines changed: 27 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1,5 +1,32 @@
11
# Changelog
22

3+
## 2.3.1
4+
5+
### New: brotli-compressed `.socket.facts.json` upload
6+
7+
The reachability facts file (`.socket.facts.json`) is now brotli-compressed before it is
8+
uploaded as part of a full scan. The Socket API transparently decompresses any multipart
9+
part named exactly `.socket.facts.json.br` and stores it as plain `.socket.facts.json`, so
10+
the stored result is unchanged — but the on-the-wire payload shrinks dramatically (a
11+
~262 MB facts file compresses to roughly 15–30 MB).
12+
13+
This fixes large tier‑1 reachability scans that previously failed when the uncompressed
14+
facts file exceeded the API's per‑file upload size cap (surfaced to the CLI as an HTTP
15+
4xx/“502”, leaving the scan stuck with no report).
16+
17+
Details:
18+
19+
- Compression happens at the upload boundary (`Core.create_full_scan`); the file on disk is
20+
left untouched, so local consumers (SARIF/JSON output, tier‑1 finalize, alert selection)
21+
continue to read the plain `.socket.facts.json`.
22+
- Only a file whose basename is exactly `.socket.facts.json` is compressed (the API matches
23+
that exact name). A custom `--reach-output-file` name is uploaded uncompressed, as before.
24+
- Empty baseline-scan placeholder files are not compressed.
25+
- Compression never blocks an upload: if it fails for any reason it falls back to uploading
26+
the plain file, and a partially-written `.socket.facts.json.br` is removed rather than
27+
left behind in the target directory.
28+
- Adds a `brotli` (CPython) / `brotlicffi` (PyPy) dependency.
29+
330
## 2.3.0
431

532
### New: `--exit-code-on-api-error`

pyproject.toml

Lines changed: 3 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -6,7 +6,7 @@ build-backend = "hatchling.build"
66

77
[project]
88
name = "socketsecurity"
9-
version = "2.3.0"
9+
version = "2.3.1"
1010
requires-python = ">= 3.11"
1111
license = {"file" = "LICENSE"}
1212
dependencies = [
@@ -19,6 +19,8 @@ dependencies = [
1919
"socketdev>=3.0.33,<4.0.0",
2020
"bs4>=0.0.2",
2121
"markdown>=3.10",
22+
"brotli>=1.0.9; platform_python_implementation == 'CPython'",
23+
"brotlicffi>=1.0.9; platform_python_implementation != 'CPython'",
2224
]
2325
readme = "README.md"
2426
description = "Socket Security CLI for CI/CD"

socketsecurity/__init__.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1,3 +1,3 @@
11
__author__ = 'socket.dev'
2-
__version__ = '2.3.0'
2+
__version__ = '2.3.1'
33
USER_AGENT = f'SocketPythonCLI/{__version__}'

socketsecurity/core/__init__.py

Lines changed: 129 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -51,6 +51,26 @@
5151

5252
_HUMANIZE_BOUNDARY = re.compile(r"(?<=[a-z0-9])(?=[A-Z])|(?<=[A-Z])(?=[A-Z][a-z])")
5353

54+
# Reachability facts-file upload compression.
55+
#
56+
# The Socket full-scan endpoint transparently brotli-decompresses any multipart part
57+
# whose basename is exactly ``.socket.facts.json.br`` and stores it as plain
58+
# ``.socket.facts.json``. Compressing the facts file on upload keeps it well under the
59+
# server's per-file size cap (a ~262 MB facts file compresses to roughly 15-30 MB),
60+
# which is required for large reachability (tier 1) scans to succeed.
61+
#
62+
# The server matches the *exact* name ``.socket.facts.json.br``, so we only compress
63+
# files whose basename is exactly ``.socket.facts.json`` (a custom ``--reach-output-file``
64+
# name would not be decompressed server-side, so it is left as a plain upload).
65+
SOCKET_FACTS_FILENAME = ".socket.facts.json"
66+
SOCKET_FACTS_BROTLI_FILENAME = ".socket.facts.json.br"
67+
# Brotli quality (0-11); 5 is a good speed/ratio tradeoff for large JSON payloads.
68+
SOCKET_FACTS_BROTLI_QUALITY = 5
69+
# Largest brotli window (2**24 bytes); improves the ratio on large facts files.
70+
SOCKET_FACTS_BROTLI_LGWIN = 24
71+
# Stream the facts file in 1 MiB chunks so large files aren't held fully in memory.
72+
SOCKET_FACTS_BROTLI_CHUNK_SIZE = 1024 * 1024
73+
5474

5575
def _humanize_alert_type(alert_type: str) -> str:
5676
"""Convert a camelCase/PascalCase alert type into a Title-Cased label.
@@ -544,6 +564,102 @@ def finalize_tier1_scan(self, full_scan_id: str, facts_file_path: str) -> bool:
544564
log.debug(f"Unable to finalize tier 1 scan: {e}")
545565
return False
546566

567+
@staticmethod
def _compress_facts_file(source_path: str) -> str:
    """Write a brotli-compressed copy of a facts file next to the original.

    The output is always named ``.socket.facts.json.br`` and is created in the same
    directory as ``source_path``, replacing any file already at that path. The input is
    read in ``SOCKET_FACTS_BROTLI_CHUNK_SIZE`` pieces and pushed through a streaming
    compressor, so the whole file is never loaded into memory at once. The source file
    is only read, never modified.

    Args:
        source_path: Path to the plain ``.socket.facts.json`` file.

    Returns:
        Path to the compressed sibling file.

    Raises:
        BaseException: Anything raised while reading, compressing or writing is
            re-raised unchanged after a best-effort removal of the output file.
    """
    # Deferred import: brotli is only needed when a facts file is actually compressed.
    # brotlicffi offers the same API and is what non-CPython installs pull in.
    try:
        import brotli
    except ImportError:
        import brotlicffi as brotli

    out_path = os.path.join(os.path.dirname(source_path), SOCKET_FACTS_BROTLI_FILENAME)
    encoder = brotli.Compressor(
        quality=SOCKET_FACTS_BROTLI_QUALITY,
        lgwin=SOCKET_FACTS_BROTLI_LGWIN,
    )
    try:
        with open(source_path, "rb") as reader, open(out_path, "wb") as writer:
            while block := reader.read(SOCKET_FACTS_BROTLI_CHUNK_SIZE):
                encoded = encoder.process(block)
                # process() may buffer internally and hand back nothing for a chunk.
                if encoded:
                    writer.write(encoded)
            # Flush whatever the encoder still holds and terminate the stream.
            writer.write(encoder.finish())
    except BaseException:
        # The caller only learns the output path when this function returns, so a
        # half-written file has to be removed here before the error propagates.
        try:
            os.unlink(out_path)
        except OSError:
            pass
        raise
    return out_path
616+
617+
def _compress_facts_files_for_upload(self, files: List[str]) -> Tuple[List[str], List[str]]:
618+
"""Replace any ``.socket.facts.json`` upload entry with a brotli-compressed ``.br`` sibling.
619+
620+
The Socket full-scan endpoint transparently decompresses a multipart part named
621+
exactly ``.socket.facts.json.br``, so compressing here keeps a large facts file under
622+
the server's per-file size cap without changing the stored result. Files whose
623+
basename is not exactly ``.socket.facts.json`` are left untouched (the server only
624+
matches that exact name), as are empty placeholder files (e.g. baseline scans).
625+
626+
Compression never blocks an upload: if it fails for any reason (missing optional
627+
``brotli`` dependency, unwritable directory, etc.) the original plain file is used.
628+
629+
Args:
630+
files: The list of file paths about to be uploaded.
631+
632+
Returns:
633+
``(upload_files, temp_paths)`` where ``upload_files`` is the possibly-rewritten
634+
list to upload and ``temp_paths`` are compressed files the caller must delete
635+
once the upload completes.
636+
"""
637+
upload_files: List[str] = []
638+
temp_paths: List[str] = []
639+
for file_path in files:
640+
try:
641+
if (
642+
os.path.basename(file_path) == SOCKET_FACTS_FILENAME
643+
and os.path.isfile(file_path)
644+
and os.path.getsize(file_path) > 0
645+
):
646+
compressed_path = self._compress_facts_file(file_path)
647+
log.debug(
648+
f"Brotli-compressed {file_path} for upload: "
649+
f"{os.path.getsize(file_path)} -> {os.path.getsize(compressed_path)} bytes "
650+
f"(uploading as {SOCKET_FACTS_BROTLI_FILENAME})"
651+
)
652+
upload_files.append(compressed_path)
653+
temp_paths.append(compressed_path)
654+
continue
655+
except Exception as e:
656+
# Never let compression break an upload: fall back to the plain file.
657+
log.warning(
658+
f"Failed to brotli-compress facts file {file_path}, uploading uncompressed: {e}"
659+
)
660+
upload_files.append(file_path)
661+
return upload_files, temp_paths
662+
547663
def create_full_scan(self, files: List[str], params: FullScanParams, base_paths: Optional[List[str]] = None) -> FullScan:
548664
"""
549665
Creates a new full scan via the Socket API.
@@ -559,7 +675,19 @@ def create_full_scan(self, files: List[str], params: FullScanParams, base_paths:
559675
log.info("Creating new full scan")
560676
create_full_start = time.time()
561677

562-
res = self.sdk.fullscans.post(files, params, use_types=True, use_lazy_loading=True, max_open_files=50, base_paths=base_paths)
678+
# Brotli-compress the reachability facts file (if present) so it is uploaded as a
679+
# `.socket.facts.json.br` part. The API decompresses it server-side, keeping a large
680+
# facts file under the per-file upload size cap. See _compress_facts_files_for_upload.
681+
upload_files, compressed_temp_files = self._compress_facts_files_for_upload(files)
682+
try:
683+
res = self.sdk.fullscans.post(upload_files, params, use_types=True, use_lazy_loading=True, max_open_files=50, base_paths=base_paths)
684+
finally:
685+
for temp_file in compressed_temp_files:
686+
try:
687+
os.unlink(temp_file)
688+
log.debug(f"Cleaned up temporary compressed facts file: {temp_file}")
689+
except OSError as cleanup_error:
690+
log.debug(f"Failed to clean up temporary compressed facts file {temp_file}: {cleanup_error}")
563691
if not res.success:
564692
log.error(f"Error creating full scan: {res.message}, status: {res.status}")
565693
raise Exception(f"Error creating full scan: {res.message}, status: {res.status}")
Lines changed: 137 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,137 @@
1+
"""Tests for brotli compression of the reachability facts file on upload.
2+
3+
The Socket full-scan endpoint transparently decompresses a multipart part named exactly
4+
`.socket.facts.json.br`, so the CLI compresses the facts file before uploading it. These
5+
tests cover the helpers in `Core` that do that rewriting.
6+
"""
7+
import json
8+
import os
9+
10+
import pytest
11+
12+
try:
13+
import brotli
14+
except ImportError: # pragma: no cover - PyPy / non-CPython fallback
15+
import brotlicffi as brotli
16+
17+
from socketsecurity.core import (
18+
SOCKET_FACTS_BROTLI_FILENAME,
19+
SOCKET_FACTS_FILENAME,
20+
Core,
21+
)
22+
23+
24+
def _write(path, data: bytes):
    """Create (or overwrite) ``path`` with ``data`` and return ``path`` unchanged."""
    with open(path, mode="wb") as sink:
        sink.write(data)
    return path
28+
29+
30+
def test_compress_facts_file_roundtrips(tmp_path):
    """Decompressing the ``.br`` sibling yields the original bytes; the source is untouched."""
    original = json.dumps({"components": [{"id": str(i)} for i in range(1000)]}).encode()
    facts_path = tmp_path / SOCKET_FACTS_FILENAME
    _write(str(facts_path), original)

    br_path = Core._compress_facts_file(str(facts_path))

    # The output sits next to the source and carries the exact name the API matches on.
    assert br_path == str(tmp_path / SOCKET_FACTS_BROTLI_FILENAME)
    assert os.path.basename(br_path) == SOCKET_FACTS_BROTLI_FILENAME
    # Local consumers still read the plain file, so it must be byte-for-byte unchanged.
    assert facts_path.read_bytes() == original
    # Decompressing the sibling must give back the input exactly.
    with open(br_path, "rb") as fh:
        assert brotli.decompress(fh.read()) == original
46+
47+
48+
def test_compress_for_upload_rewrites_facts_entry(tmp_path):
    """Only the ``.socket.facts.json`` entry is swapped for its ``.br`` sibling."""
    # Core.__new__ skips __init__, which these helpers do not need.
    core = Core.__new__(Core)
    facts = _write(str(tmp_path / SOCKET_FACTS_FILENAME), b'{"a": 1}')
    manifest = _write(str(tmp_path / "package.json"), b"{}")
    br_path = str(tmp_path / SOCKET_FACTS_BROTLI_FILENAME)

    upload_files, temp_paths = core._compress_facts_files_for_upload([facts, manifest])

    # Order is preserved: the facts slot now holds the compressed file.
    assert upload_files == [br_path, manifest]
    # Only the compressed file is handed back for cleanup.
    assert temp_paths == [br_path]
    assert os.path.isfile(br_path)
    # The manifest passes through as-is.
    assert manifest in upload_files
62+
63+
64+
def test_compress_facts_file_removes_partial_output_on_failure(tmp_path, monkeypatch):
    """A compressor failure mid-stream must not leave a ``.br`` file behind."""

    class _FailingCompressor:
        """Stand-in compressor whose ``process`` call always raises."""

        def __init__(self, *_args, **_kwargs):
            pass

        def process(self, _chunk):
            raise RuntimeError("disk full")

        def finish(self):  # pragma: no cover - never reached
            return b""

    facts = _write(str(tmp_path / SOCKET_FACTS_FILENAME), b'{"a": 1}' * 1000)
    # The helper imports the same module object lazily, so patching it here takes effect
    # (brotli on CPython, brotlicffi elsewhere).
    monkeypatch.setattr(brotli, "Compressor", _FailingCompressor)

    with pytest.raises(RuntimeError, match="disk full"):
        Core._compress_facts_file(facts)

    # The output file was opened before the failure; it must have been removed again.
    leftover = tmp_path / SOCKET_FACTS_BROTLI_FILENAME
    assert not leftover.exists()
86+
87+
88+
def test_compress_for_upload_preserves_directory_prefix(tmp_path):
    """The ``.br`` file is created in the facts file's own directory, not elsewhere."""
    nested_dir = tmp_path / "nested"
    nested_dir.mkdir()
    facts = _write(str(nested_dir / SOCKET_FACTS_FILENAME), b'{"a": 1}')
    expected = [str(nested_dir / SOCKET_FACTS_BROTLI_FILENAME)]
    core = Core.__new__(Core)

    upload_files, temp_paths = core._compress_facts_files_for_upload([facts])

    assert upload_files == expected
    assert temp_paths == expected
99+
100+
101+
def test_empty_facts_file_is_not_compressed(tmp_path):
    """A zero-byte facts file (e.g. a baseline-scan placeholder) is passed through untouched."""
    placeholder = _write(str(tmp_path / SOCKET_FACTS_FILENAME), b"")
    core = Core.__new__(Core)

    upload_files, temp_paths = core._compress_facts_files_for_upload([placeholder])

    assert upload_files == [placeholder]
    assert temp_paths == []
    # No compressed sibling may have been created as a side effect.
    br_path = tmp_path / SOCKET_FACTS_BROTLI_FILENAME
    assert not br_path.exists()
111+
112+
113+
def test_custom_named_facts_file_is_not_compressed(tmp_path):
    """A facts file with any name other than ``.socket.facts.json`` is uploaded as-is."""
    renamed = _write(str(tmp_path / "custom.facts.json"), b'{"a": 1}')
    core = Core.__new__(Core)

    upload_files, temp_paths = core._compress_facts_files_for_upload([renamed])

    # Nothing rewritten and nothing for the caller to clean up.
    assert upload_files == [renamed]
    assert temp_paths == []
122+
123+
124+
def test_compression_failure_falls_back_to_plain_file(tmp_path, monkeypatch):
    """When the compression helper raises, the plain file is uploaded and nothing is tracked."""

    def _always_fails(_source_path):
        raise RuntimeError("brotli unavailable")

    # Swap the real helper for one that fails on every call.
    monkeypatch.setattr(Core, "_compress_facts_file", staticmethod(_always_fails))
    facts = _write(str(tmp_path / SOCKET_FACTS_FILENAME), b'{"a": 1}')
    core = Core.__new__(Core)

    upload_files, temp_paths = core._compress_facts_files_for_upload([facts])

    assert upload_files == [facts]
    assert temp_paths == []

0 commit comments

Comments
 (0)