5151
5252_HUMANIZE_BOUNDARY = re .compile (r"(?<=[a-z0-9])(?=[A-Z])|(?<=[A-Z])(?=[A-Z][a-z])" )
5353
# --- Brotli compression of the reachability facts file on upload ---
#
# When a multipart part's basename is exactly ``.socket.facts.json.br``, the Socket
# full-scan endpoint brotli-decompresses it transparently and stores the result as a
# plain ``.socket.facts.json``. Uploading the compressed form keeps the facts file
# well below the server's per-file size cap (roughly 15-30 MB for a ~262 MB facts
# file), which large reachability (tier 1) scans need in order to succeed.
#
# Because the server matches only that *exact* name, compression is applied solely to
# files whose basename is exactly ``.socket.facts.json``; a custom
# ``--reach-output-file`` name would not be decompressed server-side and is therefore
# uploaded as a plain file.
SOCKET_FACTS_FILENAME = ".socket.facts.json"
SOCKET_FACTS_BROTLI_FILENAME = SOCKET_FACTS_FILENAME + ".br"
# Quality ranges over 0-11; 5 balances speed against ratio well for large JSON payloads.
SOCKET_FACTS_BROTLI_QUALITY = 5
# lgwin=24 selects the largest brotli window (2**24 bytes), which improves the ratio
# on large facts files.
SOCKET_FACTS_BROTLI_LGWIN = 24
# Read the facts file 1 MiB at a time so a large file is never fully held in memory.
SOCKET_FACTS_BROTLI_CHUNK_SIZE = 1 << 20
73+
5474
5575def _humanize_alert_type (alert_type : str ) -> str :
5676 """Convert a camelCase/PascalCase alert type into a Title-Cased label.
@@ -544,6 +564,102 @@ def finalize_tier1_scan(self, full_scan_id: str, facts_file_path: str) -> bool:
544564 log .debug (f"Unable to finalize tier 1 scan: { e } " )
545565 return False
546566
@staticmethod
def _compress_facts_file(source_path: str) -> str:
    """Write a brotli-compressed ``.socket.facts.json.br`` next to ``source_path``.

    The input is read in fixed-size chunks and fed through a streaming compressor, so
    a facts file of hundreds of MB is never loaded into memory in one piece. The output
    lives in the same directory as the source, which keeps the directory prefix of the
    multipart key the SDK derives and changes only the basename to the ``.br`` name.

    An existing ``.socket.facts.json.br`` in that directory is overwritten. If
    anything goes wrong while compressing (for example the disk fills up mid-stream),
    the output file is deleted before the error propagates, so no partial ``.br`` is
    left behind.

    Args:
        source_path: Path to the plain ``.socket.facts.json`` file.

    Returns:
        Path to the compressed sibling file.

    Raises:
        ImportError: If neither ``brotli`` nor ``brotlicffi`` can be imported.
        Any error raised while reading, compressing or writing is re-raised after the
        output file has been removed.
    """
    # Lazy import: the dependency is only required when a facts file is actually being
    # uploaded. brotlicffi is the API-compatible fallback for PyPy / non-CPython.
    try:
        import brotli
    except ImportError:
        import brotlicffi as brotli

    parent_dir = os.path.dirname(source_path)
    compressed_path = os.path.join(parent_dir, SOCKET_FACTS_BROTLI_FILENAME)
    encoder = brotli.Compressor(
        quality=SOCKET_FACTS_BROTLI_QUALITY,
        lgwin=SOCKET_FACTS_BROTLI_LGWIN,
    )
    try:
        with open(source_path, "rb") as src, open(compressed_path, "wb") as dst:
            # iter(callable, sentinel) keeps reading until read() returns b"" at EOF.
            for block in iter(lambda: src.read(SOCKET_FACTS_BROTLI_CHUNK_SIZE), b""):
                encoded = encoder.process(block)
                # The encoder may buffer input and return nothing for a given chunk.
                if encoded:
                    dst.write(encoded)
            # Flush whatever the encoder still holds and terminate the stream.
            dst.write(encoder.finish())
    except BaseException:
        # The caller only learns the output path once this function returns, so it
        # cannot clean up a half-written file itself. Remove it here (best effort),
        # then re-raise so the caller can fall back to uploading the plain file.
        try:
            os.unlink(compressed_path)
        except OSError:
            pass
        raise
    return compressed_path
616+
def _compress_facts_files_for_upload(self, files: List[str]) -> Tuple[List[str], List[str]]:
    """Swap each ``.socket.facts.json`` upload entry for a brotli-compressed ``.br`` sibling.

    The Socket full-scan endpoint transparently decompresses a multipart part named
    exactly ``.socket.facts.json.br``, so compressing here keeps a large facts file under
    the server's per-file size cap without changing the stored result. Entries are left
    untouched when their basename is not exactly ``.socket.facts.json`` (the server only
    matches that exact name), when they are not regular files, or when they are empty
    placeholder files (e.g. baseline scans).

    Compression never blocks an upload: if it fails for any reason (missing optional
    ``brotli`` dependency, unwritable directory, etc.) a warning is logged and the
    original plain file is used instead.

    Once a compressed file exists it is recorded in ``temp_paths`` immediately, before
    any diagnostic logging, so a failure while reporting sizes cannot leave an
    untracked ``.br`` file behind.

    Args:
        files: The list of file paths about to be uploaded.

    Returns:
        ``(upload_files, temp_paths)`` where ``upload_files`` is the possibly-rewritten
        list to upload (same order as ``files``) and ``temp_paths`` are compressed files
        the caller must delete once the upload completes.
    """
    upload_files: List[str] = []
    temp_paths: List[str] = []
    for file_path in files:
        compressed_path = None
        try:
            if (
                os.path.basename(file_path) == SOCKET_FACTS_FILENAME
                and os.path.isfile(file_path)
                and os.path.getsize(file_path) > 0
            ):
                compressed_path = self._compress_facts_file(file_path)
        except Exception as e:
            # Never let compression break an upload: fall back to the plain file.
            log.warning(
                f"Failed to brotli-compress facts file {file_path}, uploading uncompressed: {e}"
            )

        if compressed_path is None:
            upload_files.append(file_path)
            continue

        # Track the compressed file before doing anything else that could raise, so
        # the caller's cleanup always sees it.
        upload_files.append(compressed_path)
        temp_paths.append(compressed_path)

        # The size report is purely informational; a stat failure here must neither
        # discard the successful compression nor abort the upload.
        try:
            log.debug(
                f"Brotli-compressed {file_path} for upload: "
                f"{os.path.getsize(file_path)} -> {os.path.getsize(compressed_path)} bytes "
                f"(uploading as {SOCKET_FACTS_BROTLI_FILENAME})"
            )
        except OSError as size_error:
            log.debug(f"Unable to report compressed size for {file_path}: {size_error}")
    return upload_files, temp_paths
662+
547663 def create_full_scan (self , files : List [str ], params : FullScanParams , base_paths : Optional [List [str ]] = None ) -> FullScan :
548664 """
549665 Creates a new full scan via the Socket API.
@@ -559,7 +675,19 @@ def create_full_scan(self, files: List[str], params: FullScanParams, base_paths:
559675 log .info ("Creating new full scan" )
560676 create_full_start = time .time ()
561677
562- res = self .sdk .fullscans .post (files , params , use_types = True , use_lazy_loading = True , max_open_files = 50 , base_paths = base_paths )
678+ # Brotli-compress the reachability facts file (if present) so it is uploaded as a
679+ # `.socket.facts.json.br` part. The API decompresses it server-side, keeping a large
680+ # facts file under the per-file upload size cap. See _compress_facts_files_for_upload.
681+ upload_files , compressed_temp_files = self ._compress_facts_files_for_upload (files )
682+ try :
683+ res = self .sdk .fullscans .post (upload_files , params , use_types = True , use_lazy_loading = True , max_open_files = 50 , base_paths = base_paths )
684+ finally :
685+ for temp_file in compressed_temp_files :
686+ try :
687+ os .unlink (temp_file )
688+ log .debug (f"Cleaned up temporary compressed facts file: { temp_file } " )
689+ except OSError as cleanup_error :
690+ log .debug (f"Failed to clean up temporary compressed facts file { temp_file } : { cleanup_error } " )
563691 if not res .success :
564692 log .error (f"Error creating full scan: { res .message } , status: { res .status } " )
565693 raise Exception (f"Error creating full scan: { res .message } , status: { res .status } " )
0 commit comments