From 4d7e4f97399a7d5225f67b57fd5908dd4283c072 Mon Sep 17 00:00:00 2001 From: lelia <2418071+lelia@users.noreply.github.com> Date: Tue, 2 Jun 2026 18:34:18 -0400 Subject: [PATCH] feat: add diff-only scan scoping Signed-off-by: lelia <2418071+lelia@users.noreply.github.com> --- CHANGELOG.md | 12 ++ action.yml | 22 +++ docs/github-action.md | 51 ++++++ docs/parameters.md | 25 ++- socket_basics/core/config.py | 158 ++++++++++++---- .../core/connector/opengrep/__init__.py | 7 + tests/test_changed_files_scope.py | 172 ++++++++++++++++++ 7 files changed, 413 insertions(+), 34 deletions(-) create mode 100644 tests/test_changed_files_scope.py diff --git a/CHANGELOG.md b/CHANGELOG.md index 057682e..ddc0828 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -8,6 +8,18 @@ The format is based on [Keep a Changelog](https://keepachangelog.com/en/1.1.0/). ## [Unreleased] +## [2.1.0] - 2026-06-02 + +### Added +- Diff-only scan scoping now applies to SAST/OpenGrep via `changed_files` and + `scan_files`. +- Added GitHub Action inputs for `changed_files` and `scan_files`. + +### Fixed +- Delete-only changed-file scans now skip instead of falling back to a full + workspace scan. +- Updated parameter docs to reflect SAST/OpenGrep diff-only scoping. + ## [2.0.3] - 2026-04-24 diff --git a/action.yml b/action.yml index aa3537b..2480e97 100644 --- a/action.yml +++ b/action.yml @@ -9,6 +9,9 @@ runs: # Core GitHub variables (these are automatically available, but we explicitly pass GITHUB_TOKEN) GITHUB_TOKEN: ${{ inputs.github_token }} INPUT_WORKSPACE: ${{ inputs.workspace }} + # Scan scope + INPUT_CHANGED_FILES: ${{ inputs.changed_files }} + INPUT_SCAN_FILES: ${{ inputs.scan_files }} # Input mappings for all parameters INPUT_ALL_LANGUAGES_ENABLED: ${{ inputs.all_languages_enabled }} INPUT_ALL_RULES_ENABLED: ${{ inputs.all_rules_enabled }} @@ -103,6 +106,25 @@ inputs: description: "Workspace directory to scan (defaults to GITHUB_WORKSPACE)" required: false default: "" + changed_files: + description: >- + Diff-only mode: scope every scanner (SAST/OpenGrep, secrets, containers) + to changed files only, instead of the whole repository. Accepts a + comma-separated file list, a commit hash, 'auto' (diffs against the PR + base branch in CI, else staged changes), 'pr' (diff against + GITHUB_BASE_REF), or 'current-commit'. For PR/'auto' modes, check out with + actions/checkout fetch-depth: 0 so the base branch is available. When the + diff resolves to no existing files (e.g. a delete-only PR) the scanners + are skipped rather than scanning the whole repo. + required: false + default: "" + scan_files: + description: >- + Explicit comma-separated list of files to scan. Scopes SAST/OpenGrep, + secret, and container scans to just these files. Used when changed_files + is not set; changed_files takes precedence when both are provided. + required: false + default: "" socket_org: description: "Socket organization slug (required for Enterprise features)" required: false diff --git a/docs/github-action.md b/docs/github-action.md index 33c6c49..2438b3c 100644 --- a/docs/github-action.md +++ b/docs/github-action.md @@ -271,6 +271,57 @@ Include these in your workflow's `jobs..permissions` section. verbose: 'true' ``` +## Diff-Only Mode (Changed Files) + +By default the scanners run against the **entire repository**, so every PR +re-reports the whole repo's existing findings. To report only on what the PR +changed — the way Socket SCA Pull Request alerts behave — use the +`changed_files` input. This scopes SAST/OpenGrep, secret, and container scans to +the changed files and dramatically reduces PR finding volume. + +```yaml +name: Socket Basics (PR diff-only) +on: + pull_request: + +jobs: + socket-basics: + permissions: + contents: read + pull-requests: write + runs-on: ubuntu-latest + steps: + - uses: actions/checkout@de0fac2e4500dabe0009e67214ff5f5447ce83dd # v6.0.2 + with: + # Required so the PR base branch is available for the diff + fetch-depth: 0 + + - name: Run Socket Basics (changed files only) + uses: SocketDev/socket-basics@v2.0.3 + env: + GITHUB_PR_NUMBER: ${{ github.event.pull_request.number }} + with: + github_token: ${{ secrets.GITHUB_TOKEN }} + # Diff-only: scope all scanners to files changed in this PR + changed_files: 'auto' + python_sast_enabled: 'true' + javascript_sast_enabled: 'true' + secret_scanning_enabled: 'true' +``` + +`changed_files` accepts: + +- `auto` — diff against the PR base branch in CI (`GITHUB_BASE_REF`), else staged changes +- `pr` — diff against the PR base branch (`GITHUB_BASE_REF`) +- a commit hash — files changed in that commit +- a comma-separated file list — e.g. `src/app.py,src/utils.js` + +> [!IMPORTANT] +> For `auto`/`pr` modes, check out with `fetch-depth: 0` so the base branch is +> available to diff against. Deletions are excluded, so a delete-only PR scans +> nothing rather than falling back to the whole repo. To scan an explicit file +> list regardless of git state, use the `scan_files` input instead. + ## PR Comment Customization Socket Basics automatically posts enhanced PR comments with **smart defaults that work out of the box** — clickable file links, collapsible sections, syntax highlighting, CVE links, CVSS scores, and auto-labels are all enabled by default. diff --git a/docs/parameters.md b/docs/parameters.md index 91c389e..e1ad6c6 100644 --- a/docs/parameters.md +++ b/docs/parameters.md @@ -92,7 +92,11 @@ socket-basics --committers "user1@example.com,user2@example.com" ``` ### `--scan-files SCAN_FILES` -Comma-separated list of files to scan. +Explicit comma-separated list of files to scan. Scopes **all** scanners — +SAST/OpenGrep, secrets, and container scanning — to just these files instead of +the whole workspace. Used when `--changed-files` is not set (`--changed-files` +takes precedence when both are provided). Paths that do not exist are skipped; +if none exist, the scanners are skipped rather than scanning the whole repo. **Example:** ```bash @@ -100,7 +104,21 @@ socket-basics --scan-files "src/app.py,src/utils.js" ``` ### `--changed-files CHANGED_FILES` -Comma-separated list of files to scan or 'auto' to detect changed files from git. +Diff-only mode: scope **all** scanners (SAST/OpenGrep, secrets, containers) to +changed files only, the way Socket SCA Pull Request alerts behave. Accepts: + +- a comma-separated file list (e.g. `src/app.py,src/utils.js`) +- a commit hash — files changed in that commit +- `auto` — the PR base-ref diff when running in a PR CI context + (`GITHUB_BASE_REF` is set), otherwise staged (`--cached`) changes +- `pr` — diff against the PR base branch (`GITHUB_BASE_REF`) +- `current-commit` — files in the `HEAD` commit + +Deletions are excluded from PR/`auto`/`pr` diffs so removed paths never become +scan targets. When the diff resolves to no existing files (e.g. a delete-only +PR), the scanners are skipped rather than falling back to scanning the whole +repository. For PR/`auto`/`pr` modes, check out with full history (e.g. +`actions/checkout` with `fetch-depth: 0`) so the base branch is available. **Example:** ```bash @@ -623,6 +641,9 @@ socket-basics \ ### CI/CD Scan (Changed Files Only) +Scope every scanner — SAST/OpenGrep included — to only the files the PR changed, +so each PR reports findings for its own changes rather than the whole repo: + ```bash socket-basics \ --changed-files auto \ diff --git a/socket_basics/core/config.py b/socket_basics/core/config.py index 55512cf..5f6a962 100644 --- a/socket_basics/core/config.py +++ b/socket_basics/core/config.py @@ -76,24 +76,58 @@ def _parse_scan_files(self, scan_files_str: str) -> List[str]: return [f.strip() for f in scan_files_str.split(',') if f.strip()] def get_scan_targets(self) -> List[str]: - """Determine files to scan based on configuration""" - # If explicit 'scan_all' set, return workspace directory + """Determine files to scan based on configuration. + + Precedence (highest to lowest): + 1. ``scan_all`` -> scan the entire workspace (explicit override). + 2. ``changed_files`` -> scope the scan to the PR/diff changed files + (diff-only mode; mirrors how Socket SCA Pull Request alerts behave). + 3. ``scan_files`` -> explicit user-provided file list. + 4. default -> scan the entire workspace. + + For the scoped modes (2 and 3) an empty list may be returned when none + of the requested paths exist (for example a delete-only PR). Callers + MUST treat an empty result as "nothing to scan" and skip the scanner + rather than falling back to scanning the whole workspace or their own + working directory. + """ + # Explicit "scan everything" override. if self.get('scan_all', False): return [str(self.workspace)] - # If user provided specific files to scan, validate their existence + # Diff-only mode: scope the scan to the files changed in the PR/commit. + # Keep honoring the scope when git resolves to zero files, e.g. a + # delete-only PR, so callers skip instead of scanning the workspace. + changed_files = self.get('changed_files', []) or [] + if changed_files or self.get('changed_files_scope_requested', False): + return self._resolve_file_targets(changed_files) + + # Explicit list of files to scan. if self.scan_files: - targets = [self.workspace / f for f in self.scan_files] - valid = [] - for t in targets: - if t.exists(): - valid.append(str(t)) - else: - logging.getLogger(__name__).warning("Scan target does not exist: %s", str(t)) - return valid - - # Default: scan the workspace itself + return self._resolve_file_targets(self.scan_files) + + # Default: scan the workspace itself. return [str(self.workspace)] + + def _resolve_file_targets(self, files: List[str]) -> List[str]: + """Resolve a list of file paths to absolute scan targets. + + Relative paths are resolved against the workspace; absolute paths are + used as-is. Paths that do not exist are skipped with a warning (a + common case for delete-only PRs). Returns an empty list when none of + the provided paths exist, signalling callers that there is nothing to + scan. + """ + valid: List[str] = [] + for f in files: + p = Path(f) + if not p.is_absolute(): + p = self.workspace / f + if p.exists(): + valid.append(str(p)) + else: + logging.getLogger(__name__).warning("Scan target does not exist: %s", str(p)) + return valid def get_action_for_severity(self, severity: str) -> str: """Map severity to action according to security policy""" @@ -1096,7 +1130,10 @@ def add_dynamic_cli_args(parser: argparse.ArgumentParser): # Add optional changed-files CLI argument to limit scans to changed files parser.add_argument('--changed-files', type=str, default='', - help="Comma-separated list of files to scan or 'auto' to detect changed files from git") + help="Scope all scanners (SAST/OpenGrep, secrets, containers) to changed " + "files only. Accepts a comma-separated file list, a commit hash, " + "'auto' (PR base-ref diff in CI, else staged changes), 'pr' (diff " + "against GITHUB_BASE_REF), or 'current-commit'.") # Also add CLI args for notification plugins declared in notifications.yaml try: @@ -1303,17 +1340,30 @@ def create_config_from_args(args) -> Config: except Exception: pass - # Handle changed-files: CLI overrides env/config. Accept 'auto' to detect via git + # Handle changed-files: CLI overrides env/config. Accept 'auto' to detect via git. + # When invoked via the GitHub Action (entrypoint passes no CLI args) the value + # arrives through the INPUT_CHANGED_FILES environment variable instead. changed_files_arg = getattr(args, 'changed_files', '') if args is not None else '' + if not changed_files_arg: + changed_files_arg = os.getenv('INPUT_CHANGED_FILES', '') if changed_files_arg: val = str(changed_files_arg).strip() - # 'auto' defaults to staged changes (--cached) + config_dict['changed_files_scope_requested'] = True + # 'auto' resolves to the PR base-ref diff in CI, else staged changes. if val.lower() == 'auto': try: - git_changed = _detect_git_changed_files(config_dict.get('workspace', os.getcwd()), mode='staged') + git_changed = _detect_git_changed_files(config_dict.get('workspace', os.getcwd()), mode='auto') config_dict['changed_files'] = git_changed except Exception as e: - logging.getLogger(__name__).warning("Warning: failed to detect git changed files (staged): %s", e) + logging.getLogger(__name__).warning("Warning: failed to detect git changed files (auto): %s", e) + config_dict['changed_files'] = [] + elif val.lower() == 'pr': + # Explicit PR diff against the base branch (GITHUB_BASE_REF). + try: + git_changed = _detect_git_changed_files(config_dict.get('workspace', os.getcwd()), mode='pr') + config_dict['changed_files'] = git_changed + except Exception as e: + logging.getLogger(__name__).warning("Warning: failed to detect git changed files (pr): %s", e) config_dict['changed_files'] = [] elif val.lower() in ('current-commit', 'current_commit'): try: @@ -1370,27 +1420,34 @@ def create_config_from_args(args) -> Config: return Config(config_dict) -def _detect_git_changed_files(workspace_path: str, mode: str = 'staged', commit: str | None = None) -> List[str]: +def _detect_git_changed_files(workspace_path: str, mode: str = 'staged', commit: str | None = None, base_ref: str | None = None) -> List[str]: """Detect changed files in a git repository. mode: - - 'staged' -> files staged for commit (git diff --name-only --cached) - - 'current-commit' -> files included in HEAD commit - - 'commit' -> files included in the given commit hash (commit param required) - - Returns a list of file paths relative to the workspace root. If not a git repo or detection fails, returns []. + - 'staged' -> files staged for commit (git diff --name-only --cached) + - 'current-commit' -> files included in the HEAD commit + - 'commit' -> files included in the given commit hash (commit param required) + - 'pr' -> files changed relative to a base ref (a GitHub PR). + Uses ``base_ref`` or ``GITHUB_BASE_REF`` and excludes + deletions so removed paths never become scan targets. + - 'auto' -> the PR base-ref diff when running in a PR CI context + (``GITHUB_BASE_REF`` is set), otherwise staged changes. + This is what ``--changed-files auto`` resolves to. + + Returns a list of file paths relative to the workspace root. If not a git + repo or detection fails, returns []. """ try: from subprocess import check_output, CalledProcessError import subprocess - + # Prefer GITHUB_WORKSPACE if set (GitHub Actions environment) # Otherwise use the provided workspace_path if os.environ.get('GITHUB_WORKSPACE'): ws = Path(os.environ['GITHUB_WORKSPACE']) else: ws = Path(workspace_path) if workspace_path else Path.cwd() - + if not ws.exists(): return [] @@ -1404,24 +1461,61 @@ def _detect_git_changed_files(workspace_path: str, mode: str = 'staged', commit: original_cwd = os.getcwd() try: os.chdir(str(ws)) - - if mode == 'staged': + + def _split(out: str) -> List[str]: + return [line.strip() for line in out.splitlines() if line.strip()] + + def _diff_against_base(ref: str) -> Optional[List[str]]: + """Diff changed files (excluding deletions) against a base ref. + + Tries the remote-tracking ref (``origin/``) first, then the + bare ref. Returns None when neither ref can be resolved so the + caller can fall back to another detection strategy. The + ``--diff-filter=ACMR`` excludes deleted paths so they never + become scan targets. + """ + if not ref: + return None + for candidate in (f'origin/{ref}', ref): + try: + out = check_output( + ['git', 'diff', '--name-only', '--diff-filter=ACMR', f'{candidate}...HEAD'], + text=True, stderr=subprocess.DEVNULL, + ) + return _split(out) + except CalledProcessError: + continue + return None + + if mode == 'auto': + # Prefer the PR base-ref diff in CI; fall back to staged changes + # for local/pre-commit use. + base = base_ref or os.environ.get('GITHUB_BASE_REF', '') + pr_files = _diff_against_base(base) + if pr_files is not None: + return pr_files + out = check_output(['git', 'diff', '--name-only', '--cached'], text=True, stderr=subprocess.DEVNULL) + return _split(out) + elif mode == 'pr': + base = base_ref or os.environ.get('GITHUB_BASE_REF', '') + return _diff_against_base(base) or [] + elif mode == 'staged': # staged but not yet committed out = check_output(['git', 'diff', '--name-only', '--cached'], text=True, stderr=subprocess.DEVNULL) + return _split(out) elif mode == 'current-commit': # files that are part of HEAD commit out = check_output(['git', 'diff-tree', '--no-commit-id', '--name-only', '-r', 'HEAD'], text=True, stderr=subprocess.DEVNULL) + return _split(out) elif mode == 'commit' and commit: out = check_output(['git', 'diff-tree', '--no-commit-id', '--name-only', '-r', commit], text=True, stderr=subprocess.DEVNULL) + return _split(out) else: return [] - - files = [line.strip() for line in out.splitlines() if line.strip()] - return files finally: # Always restore original working directory os.chdir(original_cwd) - + except CalledProcessError: return [] except Exception: diff --git a/socket_basics/core/connector/opengrep/__init__.py b/socket_basics/core/connector/opengrep/__init__.py index 0cf4135..1125d11 100644 --- a/socket_basics/core/connector/opengrep/__init__.py +++ b/socket_basics/core/connector/opengrep/__init__.py @@ -48,6 +48,13 @@ def scan(self) -> Dict[str, Any]: targets = self.config.get_scan_targets() + # Diff-only / explicit file scoping resolved to no existing files (for + # example a delete-only PR). Running OpenGrep without any target makes it + # scan its own working directory and emit spurious findings, so skip. + if not targets: + logger.info('No scan targets to analyze (scoped scan matched no existing files); skipping OpenGrep') + return {} + # Check if custom rules mode is enabled custom_rules_path = self.config.get_custom_rules_path() custom_rule_files: Dict[str, Path] = {} diff --git a/tests/test_changed_files_scope.py b/tests/test_changed_files_scope.py new file mode 100644 index 0000000..3eda06d --- /dev/null +++ b/tests/test_changed_files_scope.py @@ -0,0 +1,172 @@ +"""Tests for diff-only (changed-files) scan scoping. + +SAST/OpenGrep (and the other connectors that call ``get_scan_targets``) must +honor ``changed_files`` so PRs report only on what the PR changed, instead of +re-scanning the whole repository. +""" + +import os +import subprocess +from argparse import Namespace + +import pytest + +from socket_basics.core.config import Config, _detect_git_changed_files, create_config_from_args + + +def _make_config(workspace, **overrides): + cfg = {"workspace": str(workspace)} + cfg.update(overrides) + return Config(cfg) + + +class TestGetScanTargets: + """Precedence and scoping behaviour of Config.get_scan_targets().""" + + def test_default_scans_whole_workspace(self, tmp_path): + (tmp_path / "a.py").write_text("x = 1") + assert _make_config(tmp_path).get_scan_targets() == [str(tmp_path)] + + def test_scan_all_returns_workspace(self, tmp_path): + (tmp_path / "a.py").write_text("x = 1") + cfg = _make_config(tmp_path, scan_all=True, changed_files=["a.py"]) + # scan_all is an explicit override and wins over changed_files + assert cfg.get_scan_targets() == [str(tmp_path)] + + def test_changed_files_scopes_to_existing_files(self, tmp_path): + (tmp_path / "a.py").write_text("x = 1") + (tmp_path / "b.py").write_text("y = 2") + cfg = _make_config(tmp_path, changed_files=["a.py"]) + assert cfg.get_scan_targets() == [str(tmp_path / "a.py")] + + def test_changed_files_skips_missing_paths(self, tmp_path): + (tmp_path / "a.py").write_text("x = 1") + cfg = _make_config(tmp_path, changed_files=["a.py", "gone.py"]) + assert cfg.get_scan_targets() == [str(tmp_path / "a.py")] + + def test_delete_only_pr_returns_empty(self, tmp_path): + # All changed paths were deleted -> nothing to scan. Must NOT fall back + # to scanning the whole workspace/cwd (the footgun this fixes). + cfg = _make_config(tmp_path, changed_files=["gone.py"]) + assert cfg.get_scan_targets() == [] + + def test_changed_files_takes_precedence_over_scan_files(self, tmp_path): + (tmp_path / "a.py").write_text("x = 1") + (tmp_path / "b.py").write_text("y = 2") + cfg = _make_config(tmp_path, scan_files="a.py", changed_files=["b.py"]) + assert cfg.get_scan_targets() == [str(tmp_path / "b.py")] + + def test_scan_files_used_when_no_changed_files(self, tmp_path): + (tmp_path / "a.py").write_text("x = 1") + cfg = _make_config(tmp_path, scan_files="a.py") + assert cfg.get_scan_targets() == [str(tmp_path / "a.py")] + + def test_absolute_changed_file_path_preserved(self, tmp_path): + abs_path = tmp_path / "a.py" + abs_path.write_text("x = 1") + cfg = _make_config(tmp_path, changed_files=[str(abs_path)]) + assert cfg.get_scan_targets() == [str(abs_path)] + + +def _git(repo, *args): + env = { + **os.environ, + "GIT_AUTHOR_NAME": "t", + "GIT_AUTHOR_EMAIL": "t@example.com", + "GIT_COMMITTER_NAME": "t", + "GIT_COMMITTER_EMAIL": "t@example.com", + } + return subprocess.run( + ["git", "-C", str(repo), *args], capture_output=True, text=True, env=env + ) + + +def _config_args(workspace, changed_files): + return Namespace( + config=None, + workspace=str(workspace), + scan_files=None, + console_tabular_enabled=False, + output_console_enabled=False, + console_json_enabled=False, + output_json_enabled=False, + verbose=False, + repo="test/repo", + branch="feature", + default_branch=False, + commit_message=None, + pull_request=None, + committers=None, + enable_s3_upload=False, + output=".socket.facts.json", + changed_files=changed_files, + ) + + +@pytest.fixture +def pr_repo(tmp_path, monkeypatch): + """A git repo with a 'main' base and a 'feature' branch ahead of it.""" + # _detect_git_changed_files prefers GITHUB_WORKSPACE; clear it so the + # explicit workspace path is used. + monkeypatch.delenv("GITHUB_WORKSPACE", raising=False) + monkeypatch.delenv("GITHUB_BASE_REF", raising=False) + + _git(tmp_path, "init", "-b", "main") + (tmp_path / "base.py").write_text("base = 1") + (tmp_path / "old.py").write_text("old = 1") + _git(tmp_path, "add", ".") + _git(tmp_path, "commit", "-m", "base") + + _git(tmp_path, "checkout", "-b", "feature") + (tmp_path / "feat.py").write_text("feat = 1") + (tmp_path / "base.py").write_text("base = 2") # modify + (tmp_path / "old.py").unlink() # delete + _git(tmp_path, "add", "-A") + _git(tmp_path, "commit", "-m", "feature") + return tmp_path + + +class TestDetectGitChangedFiles: + + def test_pr_mode_lists_added_and_modified(self, pr_repo): + files = _detect_git_changed_files(str(pr_repo), mode="pr", base_ref="main") + assert sorted(files) == ["base.py", "feat.py"] + + def test_pr_mode_excludes_deletions(self, pr_repo): + files = _detect_git_changed_files(str(pr_repo), mode="pr", base_ref="main") + assert "old.py" not in files + + def test_auto_uses_base_ref_env(self, pr_repo, monkeypatch): + monkeypatch.setenv("GITHUB_BASE_REF", "main") + files = _detect_git_changed_files(str(pr_repo), mode="auto") + assert sorted(files) == ["base.py", "feat.py"] + + def test_auto_falls_back_to_staged_without_base_ref(self, pr_repo): + # No GITHUB_BASE_REF and no base_ref -> staged changes (none staged here) + (pr_repo / "staged.py").write_text("s = 1") + _git(pr_repo, "add", "staged.py") + files = _detect_git_changed_files(str(pr_repo), mode="auto") + assert files == ["staged.py"] + + def test_non_git_dir_returns_empty(self, tmp_path, monkeypatch): + monkeypatch.delenv("GITHUB_WORKSPACE", raising=False) + assert _detect_git_changed_files(str(tmp_path), mode="pr", base_ref="main") == [] + + def test_delete_only_pr_config_creation_keeps_empty_scope(self, tmp_path, monkeypatch): + monkeypatch.delenv("GITHUB_WORKSPACE", raising=False) + monkeypatch.setenv("GITHUB_BASE_REF", "main") + + _git(tmp_path, "init", "-b", "main") + (tmp_path / "old.py").write_text("old = 1") + _git(tmp_path, "add", ".") + _git(tmp_path, "commit", "-m", "base") + + _git(tmp_path, "checkout", "-b", "feature") + (tmp_path / "old.py").unlink() + _git(tmp_path, "add", "-A") + _git(tmp_path, "commit", "-m", "delete old") + + cfg = create_config_from_args(_config_args(tmp_path, "pr")) + + assert cfg.get("changed_files") == [] + assert cfg.get_scan_targets() == []