5151
# Zero-width word boundaries inside camelCase/PascalCase identifiers: either a
# lowercase letter or digit followed by an uppercase letter ("fooBar" -> foo|Bar), or
# an uppercase letter followed by an uppercase+lowercase pair ("HTTPServer" -> HTTP|Server).
_HUMANIZE_BOUNDARY = re.compile(r"(?<=[a-z0-9])(?=[A-Z])|(?<=[A-Z])(?=[A-Z][a-z])")

# Reachability facts-file upload compression.
#
# The Socket full-scan endpoint transparently brotli-decompresses any multipart part
# whose basename is exactly ``.socket.facts.json.br`` and stores it as plain
# ``.socket.facts.json``. Compressing the facts file on upload keeps it well under the
# server's per-file size cap (a ~262 MB facts file compresses to roughly 15-30 MB),
# which is required for large reachability (tier 1) scans to succeed.
#
# The server matches the *exact* name ``.socket.facts.json.br``, so we only compress
# files whose basename is exactly ``.socket.facts.json`` (a custom ``--reach-output-file``
# name would not be decompressed server-side, so it is left as a plain upload).
SOCKET_FACTS_FILENAME = ".socket.facts.json"
# Derived from the plain name so the two can never drift apart: the server contract is
# "plain facts filename + .br".
SOCKET_FACTS_BROTLI_FILENAME = SOCKET_FACTS_FILENAME + ".br"
# Brotli quality (0-11); 5 is a good speed/ratio tradeoff for large JSON payloads.
SOCKET_FACTS_BROTLI_QUALITY = 5
# Largest brotli window (2**24 bytes); improves the ratio on large facts files.
SOCKET_FACTS_BROTLI_LGWIN = 24
71+
5472
5573def _humanize_alert_type (alert_type : str ) -> str :
5674 """Convert a camelCase/PascalCase alert type into a Title-Cased label.
@@ -544,6 +562,91 @@ def finalize_tier1_scan(self, full_scan_id: str, facts_file_path: str) -> bool:
544562 log .debug (f"Unable to finalize tier 1 scan: { e } " )
545563 return False
546564
@staticmethod
def _compress_facts_file(source_path: str) -> str:
    """Brotli-compress a ``.socket.facts.json`` file to a sibling ``.socket.facts.json.br``.

    The source is streamed in chunks so a large facts file (hundreds of MB) never has
    to be held in memory at once. The compressed file is written next to the source so
    that the multipart key the SDK derives keeps the same directory prefix, only with a
    ``.br`` basename.

    If reading or compressing fails part-way through, the partially written ``.br``
    file is removed before the error is re-raised, so a failed attempt never leaves a
    truncated compressed file behind in the scanned directory.

    Args:
        source_path: Path to the plain ``.socket.facts.json`` file.

    Returns:
        Path to the compressed sibling file.

    Raises:
        ImportError: If neither ``brotli`` nor ``brotlicffi`` is installed.
        OSError: If the source cannot be read or the sibling file cannot be written.
    """
    # Imported lazily so the dependency is only needed when actually uploading a facts
    # file. brotlicffi is the API-compatible fallback used on PyPy / non-CPython runtimes.
    try:
        import brotli
    except ImportError:
        import brotlicffi as brotli

    target_path = os.path.join(os.path.dirname(source_path), SOCKET_FACTS_BROTLI_FILENAME)
    compressor = brotli.Compressor(
        quality=SOCKET_FACTS_BROTLI_QUALITY,
        lgwin=SOCKET_FACTS_BROTLI_LGWIN,
    )
    chunk_size = 1024 * 1024  # 1 MiB

    # Open the source first: if it cannot be read, the target is never touched.
    with open(source_path, "rb") as src:
        try:
            with open(target_path, "wb") as dst:
                # read() returns b"" at EOF, which ends the iteration.
                for chunk in iter(lambda: src.read(chunk_size), b""):
                    compressed = compressor.process(chunk)
                    # process() may buffer internally and return nothing for a chunk.
                    if compressed:
                        dst.write(compressed)
                # finish() flushes whatever the compressor is still holding.
                dst.write(compressor.finish())
        except BaseException:
            # The target is closed by now (inner ``with`` has exited). Best-effort
            # removal of the truncated output; the original error is what matters.
            try:
                os.unlink(target_path)
            except OSError:
                pass
            raise
    return target_path
603+
def _compress_facts_files_for_upload(self, files: List[str]) -> Tuple[List[str], List[str]]:
    """Replace any ``.socket.facts.json`` upload entry with a brotli-compressed ``.br`` sibling.

    The Socket full-scan endpoint transparently decompresses a multipart part named
    exactly ``.socket.facts.json.br``, so compressing here keeps a large facts file under
    the server's per-file size cap without changing the stored result. Files whose
    basename is not exactly ``.socket.facts.json`` are left untouched (the server only
    matches that exact name), as are empty placeholder files (e.g. baseline scans).

    Compression never blocks an upload: if it fails for any reason (missing optional
    ``brotli`` dependency, unwritable directory, etc.) the original plain file is used.

    Args:
        files: The list of file paths about to be uploaded.

    Returns:
        ``(upload_files, temp_paths)`` where ``upload_files`` is the possibly-rewritten
        list to upload and ``temp_paths`` are compressed files the caller must delete
        once the upload completes.
    """
    upload_files: List[str] = []
    temp_paths: List[str] = []
    for file_path in files:
        compressed_path = None
        try:
            if (
                os.path.basename(file_path) == SOCKET_FACTS_FILENAME
                and os.path.isfile(file_path)
                and os.path.getsize(file_path) > 0
            ):
                compressed_path = self._compress_facts_file(file_path)
        except Exception as e:
            # Never let compression break an upload: fall back to the plain file.
            log.warning(
                f"Failed to brotli-compress facts file {file_path}, uploading uncompressed: {e}"
            )

        if compressed_path is None:
            upload_files.append(file_path)
            continue

        # Register the compressed file for upload and cleanup *before* doing anything
        # else that could fail, so it can never be created and then forgotten on disk.
        upload_files.append(compressed_path)
        temp_paths.append(compressed_path)
        try:
            log.debug(
                f"Brotli-compressed {file_path} for upload: "
                f"{os.path.getsize(file_path)} -> {os.path.getsize(compressed_path)} bytes "
                f"(uploading as {SOCKET_FACTS_BROTLI_FILENAME})"
            )
        except OSError as size_error:
            # Size reporting is purely informational; do not let it affect the upload.
            log.debug(
                f"Brotli-compressed {file_path} for upload (sizes unavailable: {size_error})"
            )
    return upload_files, temp_paths
649+
547650 def create_full_scan (self , files : List [str ], params : FullScanParams , base_paths : Optional [List [str ]] = None ) -> FullScan :
548651 """
549652 Creates a new full scan via the Socket API.
@@ -559,7 +662,19 @@ def create_full_scan(self, files: List[str], params: FullScanParams, base_paths:
559662 log .info ("Creating new full scan" )
560663 create_full_start = time .time ()
561664
562- res = self .sdk .fullscans .post (files , params , use_types = True , use_lazy_loading = True , max_open_files = 50 , base_paths = base_paths )
665+ # Brotli-compress the reachability facts file (if present) so it is uploaded as a
666+ # `.socket.facts.json.br` part. The API decompresses it server-side, keeping a large
667+ # facts file under the per-file upload size cap. See _compress_facts_files_for_upload.
668+ upload_files , compressed_temp_files = self ._compress_facts_files_for_upload (files )
669+ try :
670+ res = self .sdk .fullscans .post (upload_files , params , use_types = True , use_lazy_loading = True , max_open_files = 50 , base_paths = base_paths )
671+ finally :
672+ for temp_file in compressed_temp_files :
673+ try :
674+ os .unlink (temp_file )
675+ log .debug (f"Cleaned up temporary compressed facts file: { temp_file } " )
676+ except OSError as cleanup_error :
677+ log .debug (f"Failed to clean up temporary compressed facts file { temp_file } : { cleanup_error } " )
563678 if not res .success :
564679 log .error (f"Error creating full scan: { res .message } , status: { res .status } " )
565680 raise Exception (f"Error creating full scan: { res .message } , status: { res .status } " )
0 commit comments