diff --git a/README.md b/README.md index 52dbf6e..869095c 100644 --- a/README.md +++ b/README.md @@ -102,6 +102,7 @@ Note that there are other parameters that can also be added to the config but no * `metrics`: Streams performance metrics to the console. * `shutdown`: Shutdown a model by providing its Slurm job ID. * `list`: List all available model names, or view the default/cached configuration of a specific model, `--json-mode` supported. +* `cleanup`: Remove old log directories. You can filter by `--model-family`, `--model-name`, `--job-id`, and/or `--before-job-id`. Use `--dry-run` to preview what would be deleted. For more details on the usage of these commands, refer to the [User Guide](https://vectorinstitute.github.io/vector-inference/user_guide/) diff --git a/tests/vec_inf/cli/test_cli.py b/tests/vec_inf/cli/test_cli.py index 155b091..ed7b721 100644 --- a/tests/vec_inf/cli/test_cli.py +++ b/tests/vec_inf/cli/test_cli.py @@ -531,3 +531,77 @@ def test_metrics_command_request_failed( in result.output ) assert "Connection refused" in result.output + + +def test_cli_cleanup_logs_dry_run(runner, tmp_path): + """Test CLI cleanup command in dry-run mode.""" + model_dir = tmp_path / "fam_a" / "model_a.123" + model_dir.mkdir(parents=True) + + result = runner.invoke( + cli, + [ + "cleanup", + "--log-dir", + str(tmp_path), + "--model-family", + "fam_a", + "--model-name", + "model_a", + "--dry-run", + ], + ) + + assert result.exit_code == 0 + assert "would be deleted" in result.output + assert "model_a.123" in result.output + + +def test_cli_cleanup_logs_delete(tmp_path): + """Test cleanup_logs CLI deletes matching directories when not in dry-run mode.""" + fam_dir = tmp_path / "fam_a" + fam_dir.mkdir() + (fam_dir / "model_a.1").mkdir() + + runner = CliRunner() + result = runner.invoke( + cli, + [ + "cleanup", + "--log-dir", + str(tmp_path), + "--model-family", + "fam_a", + "--model-name", + "model_a", + "--job-id", + "1", + ], + ) + + assert result.exit_code == 0 + assert "Deleted 1 log directory" in result.output + assert not (fam_dir / "model_a.1").exists() + + +def test_cli_cleanup_logs_no_match(tmp_path): + """Test cleanup_logs CLI when no directories match the filters.""" + fam_dir = tmp_path / "fam_a" + fam_dir.mkdir() + (fam_dir / "model_a.1").mkdir() + + runner = CliRunner() + result = runner.invoke( + cli, + [ + "cleanup", + "--log-dir", + str(tmp_path), + "--model-family", + "fam_b", + ], + ) + + assert result.exit_code == 0 + assert "No matching log directories were deleted." in result.output + assert (fam_dir / "model_a.1").exists() diff --git a/tests/vec_inf/client/test_api.py b/tests/vec_inf/client/test_api.py index 74dc398..3cdb2c1 100644 --- a/tests/vec_inf/client/test_api.py +++ b/tests/vec_inf/client/test_api.py @@ -128,3 +128,74 @@ def test_wait_until_ready(): assert result.server_status == ModelStatus.READY assert result.base_url == "http://gpu123:8080/v1" assert mock_status.call_count == 2 + + +def test_cleanup_logs_no_match(tmp_path): + """Test when cleanup_logs returns empty list.""" + fam_a = tmp_path / "fam_a" + model_a = fam_a / "model_a.999" + model_a.mkdir(parents=True) + + client = VecInfClient() + deleted = client.cleanup_logs( + log_dir=tmp_path, + model_family="fam_b", + dry_run=False, + ) + + assert deleted == [] + assert fam_a.exists() + assert model_a.exists() + + +def test_cleanup_logs_deletes_matching_dirs(tmp_path): + """Test that cleanup_logs deletes model directories matching filters.""" + fam_a = tmp_path / "fam_a" + fam_a.mkdir() + + model_a_1 = fam_a / "model_a.10" + model_a_2 = fam_a / "model_a.20" + model_b = fam_a / "model_b.30" + + model_a_1.mkdir() + model_a_2.mkdir() + model_b.mkdir() + + client = VecInfClient() + deleted = client.cleanup_logs( + log_dir=tmp_path, + model_family="fam_a", + model_name="model_a", + before_job_id=15, + dry_run=False, + ) + + assert deleted == [model_a_1] + assert not model_a_1.exists() + assert model_a_2.exists() + assert model_b.exists() + + +def test_cleanup_logs_matching_dirs_dry_run(tmp_path): + """Test that cleanup_logs find model directories matching filters.""" + fam_a = tmp_path / "fam_a" + fam_a.mkdir() + + model_a_1 = fam_a / "model_a.10" + model_a_2 = fam_a / "model_a.20" + + model_a_1.mkdir() + model_a_2.mkdir() + + client = VecInfClient() + deleted = client.cleanup_logs( + log_dir=tmp_path, + model_family="fam_a", + model_name="model_a", + before_job_id=15, + dry_run=True, + ) + + assert deleted == [model_a_1] + assert model_a_1.exists() + assert model_a_2.exists() diff --git a/tests/vec_inf/client/test_utils.py b/tests/vec_inf/client/test_utils.py index 930d6a1..f13baa1 100644 --- a/tests/vec_inf/client/test_utils.py +++ b/tests/vec_inf/client/test_utils.py @@ -8,6 +8,7 @@ from vec_inf.client._utils import ( MODEL_READY_SIGNATURE, + find_matching_dirs, get_base_url, is_server_running, load_config, @@ -208,3 +209,135 @@ def test_load_config_invalid_user_model(tmp_path): assert "validation error" in str(excinfo.value).lower() assert "model_type" in str(excinfo.value) assert "num_gpus" in str(excinfo.value) + + +def test_find_matching_dirs_only_model_family(tmp_path): + """Return model_family directory when only model_family is provided.""" + fam_dir = tmp_path / "fam_a" + fam_dir.mkdir() + (fam_dir / "model_a.1").mkdir() + (fam_dir / "model_b.2").mkdir() + + other_dir = tmp_path / "fam_b" + other_dir.mkdir() + (other_dir / "model_c.3").mkdir() + + matches = find_matching_dirs(log_dir=tmp_path, model_family="fam_a") + assert len(matches) == 1 + assert matches[0].name == "fam_a" + + +def test_find_matching_dirs_only_model_name(tmp_path): + """Return directories matching when only model_name is provided.""" + fam_a = tmp_path / "fam_a" + fam_a.mkdir() + (fam_a / "target.1").mkdir() + (fam_a / "other.2").mkdir() + + fam_b = tmp_path / "fam_b" + fam_b.mkdir() + (fam_b / "different.3").mkdir() + + matches = find_matching_dirs(log_dir=tmp_path, model_name="target") + result_names = [p.name for p in matches] + + assert "target.1" in result_names + assert "other.2" not in result_names + assert "different.3" not in result_names + + +def test_find_matching_dirs_only_job_id(tmp_path): + """Return directories matching exact job_id.""" + fam_dir = tmp_path / "fam" + fam_dir.mkdir() + (fam_dir / "model_a.10").mkdir() + (fam_dir / "model_b.20").mkdir() + (fam_dir / "model_c.30").mkdir() + + matches = find_matching_dirs(log_dir=tmp_path, job_id=10) + result_names = [p.name for p in matches] + + assert "model_a.10" in result_names + assert "model_b.20" not in result_names + assert "model_c.30" not in result_names + + +def test_find_matching_dirs_only_before_job_id(tmp_path): + """Return directories with job_id < before_job_id.""" + fam_dir = tmp_path / "fam_a" + fam_dir.mkdir() + (fam_dir / "model_a.1").mkdir() + (fam_dir / "model_a.5").mkdir() + (fam_dir / "model_a.100").mkdir() + + fam_dir = tmp_path / "fam_b" + fam_dir.mkdir() + (fam_dir / "model_b.30").mkdir() + + matches = find_matching_dirs(log_dir=tmp_path, before_job_id=50) + result_names = [p.name for p in matches] + + assert "model_a.1" in result_names + assert "model_a.5" in result_names + assert "model_a.100" not in result_names + assert "model_b.30" in result_names + + +def test_find_matching_dirs_family_and_before_job_id(tmp_path): + """Return directories under a given family with job IDs less than before_job_id.""" + fam_dir = tmp_path / "targetfam" + fam_dir.mkdir() + (fam_dir / "model_a.10").mkdir() + (fam_dir / "model_a.20").mkdir() + (fam_dir / "model_a.99").mkdir() + (fam_dir / "model_a.150").mkdir() + + other_fam = tmp_path / "otherfam" + other_fam.mkdir() + (other_fam / "model_b.5").mkdir() + (other_fam / "model_b.10").mkdir() + (other_fam / "model_b.100").mkdir() + + matches = find_matching_dirs( + log_dir=tmp_path, + model_family="targetfam", + before_job_id=100, + ) + + result_names = [p.name for p in matches] + + assert "model_a.10" in result_names + assert "model_a.20" in result_names + assert "model_a.99" in result_names + assert "model_a.150" not in result_names + assert all("otherfam" not in str(p) for p in matches) + + +def test_find_matching_dirs_with_family_model_name_and_before_job_id(tmp_path): + """Return matching dirs with model_family, model_name, and before_job_id filters.""" + fam_dir = tmp_path / "targetfam" + fam_dir.mkdir() + (fam_dir / "model_a.1").mkdir() + (fam_dir / "model_a.50").mkdir() + (fam_dir / "model_a.150").mkdir() + (fam_dir / "model_b.40").mkdir() + + other_fam = tmp_path / "otherfam" + other_fam.mkdir() + (other_fam / "model_c.20").mkdir() + + matches = find_matching_dirs( + log_dir=tmp_path, + model_family="targetfam", + model_name="model_a", + before_job_id=100, + ) + + result_names = [p.name for p in matches] + + assert "model_a.1" in result_names + assert "model_a.50" in result_names + assert "model_a.150" not in result_names + assert "model_b.40" not in result_names + assert all("model_b" not in p for p in result_names) + assert all("otherfam" not in str(p) for p in matches) diff --git a/vec_inf/README.md b/vec_inf/README.md index a7cceea..9b9b816 100644 --- a/vec_inf/README.md +++ b/vec_inf/README.md @@ -5,5 +5,6 @@ * `metrics`: Streams performance metrics to the console. * `shutdown`: Shutdown a model by providing its Slurm job ID. * `list`: List all available model names, or view the default/cached configuration of a specific model, `--json-mode` supported. +* `cleanup`: Remove old log directories. You can filter by `--model-family`, `--model-name`, `--job-id`, and/or `--before-job-id`. Use `--dry-run` to preview what would be deleted. Use `--help` to see all available options diff --git a/vec_inf/cli/_cli.py b/vec_inf/cli/_cli.py index af102f0..d4649dd 100644 --- a/vec_inf/cli/_cli.py +++ b/vec_inf/cli/_cli.py @@ -336,5 +336,69 @@ def metrics(slurm_job_id: int, log_dir: Optional[str] = None) -> None: raise click.ClickException(f"Metrics check failed: {str(e)}") from e +@cli.command("cleanup") +@click.option("--log-dir", type=str, help="Path to SLURM log directory") +@click.option("--model-family", type=str, help="Filter by model family") +@click.option("--model-name", type=str, help="Filter by model name") +@click.option( + "--job-id", type=int, help="Only remove logs with this exact SLURM job ID" +) +@click.option( + "--before-job-id", + type=int, + help="Remove logs with job ID less than this value", +) +@click.option("--dry-run", is_flag=True, help="List matching logs without deleting") +def cleanup_logs_cli( + log_dir: Optional[str], + model_family: Optional[str], + model_name: Optional[str], + job_id: Optional[int], + before_job_id: Optional[int], + dry_run: bool, +) -> None: + """Clean up log files based on optional filters. + + Parameters + ---------- + log_dir : str or Path, optional + Root directory containing log files. Defaults to ~/.vec-inf-logs. + model_family : str, optional + Only delete logs for this model family. + model_name : str, optional + Only delete logs for this model name. + job_id : int, optional + If provided, only match directories with this exact SLURM job ID. + before_job_id : int, optional + If provided, only delete logs with job ID less than this value. + dry_run : bool + If True, return matching files without deleting them. + """ + try: + client = VecInfClient() + matched = client.cleanup_logs( + log_dir=log_dir, + model_family=model_family, + model_name=model_name, + job_id=job_id, + before_job_id=before_job_id, + dry_run=dry_run, + ) + + if not matched: + if dry_run: + click.echo("Dry run: no matching log directories found.") + else: + click.echo("No matching log directories were deleted.") + elif dry_run: + click.echo(f"Dry run: {len(matched)} directories would be deleted:") + for f in matched: + click.echo(f" - {f}") + else: + click.echo(f"Deleted {len(matched)} log directory(ies).") + except Exception as e: + raise click.ClickException(f"Cleanup failed: {str(e)}") from e + + if __name__ == "__main__": cli() diff --git a/vec_inf/client/_utils.py b/vec_inf/client/_utils.py index 64f7879..6bb4b85 100644 --- a/vec_inf/client/_utils.py +++ b/vec_inf/client/_utils.py @@ -285,3 +285,70 @@ def parse_launch_output(output: str) -> tuple[str, dict[str, str]]: config_dict[key.lower().replace(" ", "_")] = value return slurm_job_id, config_dict + + +def find_matching_dirs( + log_dir: Path, + model_family: Optional[str] = None, + model_name: Optional[str] = None, + job_id: Optional[int] = None, + before_job_id: Optional[int] = None, +) -> list[Path]: + """ + Find log directories based on filtering criteria. + + Parameters + ---------- + log_dir : Path + The base directory containing model family directories. + model_family : str, optional + Filter to only search inside this family. + model_name : str, optional + Filter to only match model names. + job_id : int, optional + Filter to only match this exact SLURM job ID. + before_job_id : int, optional + Filter to only include job IDs less than this value. + + Returns + ------- + list[Path] + List of directories that match the criteria and can be deleted. + """ + matched = [] + + if not log_dir.exists() or not log_dir.is_dir(): + raise FileNotFoundError(f"Log directory does not exist: {log_dir}") + + if not model_family and not model_name and not job_id and not before_job_id: + return [log_dir] + + for family_dir in log_dir.iterdir(): + if not family_dir.is_dir(): + continue + if model_family and family_dir.name != model_family: + continue + + if model_family and not model_name and not job_id and not before_job_id: + return [family_dir] + + for job_dir in family_dir.iterdir(): + if not job_dir.is_dir(): + continue + + try: + name_part, id_part = job_dir.name.rsplit(".", 1) + parsed_id = int(id_part) + except ValueError: + continue + + if model_name and name_part != model_name: + continue + if job_id is not None and parsed_id != job_id: + continue + if before_job_id is not None and parsed_id >= before_job_id: + continue + + matched.append(job_dir) + + return matched diff --git a/vec_inf/client/api.py b/vec_inf/client/api.py index 3073888..fa05d2d 100644 --- a/vec_inf/client/api.py +++ b/vec_inf/client/api.py @@ -10,8 +10,10 @@ vec_inf.client.models : Data models for API responses """ +import shutil import time import warnings +from pathlib import Path from typing import Any, Optional, Union from vec_inf.client._exceptions import ( @@ -24,7 +26,7 @@ ModelStatusMonitor, PerformanceMetricsCollector, ) -from vec_inf.client._utils import run_bash_command +from vec_inf.client._utils import find_matching_dirs, run_bash_command from vec_inf.client.config import ModelConfig from vec_inf.client.models import ( LaunchOptions, @@ -60,6 +62,9 @@ class VecInfClient: wait_until_ready(slurm_job_id, timeout_seconds, poll_interval_seconds, log_dir) Wait for a model to become ready + cleanup_logs(log_dir, model_name, model_family, job_id, dry_run) + Remove logs from the log directory. + Examples -------- >>> from vec_inf.api import VecInfClient @@ -300,3 +305,51 @@ def wait_until_ready( # Wait before checking again time.sleep(poll_interval_seconds) + + def cleanup_logs( + self, + log_dir: Optional[Union[str, Path]] = None, + model_family: Optional[str] = None, + model_name: Optional[str] = None, + job_id: Optional[int] = None, + before_job_id: Optional[int] = None, + dry_run: bool = False, + ) -> list[Path]: + """Remove logs from the log directory. + + Parameters + ---------- + log_dir : str or Path, optional + Root directory containing log files. Defaults to ~/.vec-inf-logs. + model_family : str, optional + Only delete logs for this model family. + model_name : str, optional + Only delete logs for this model name. + job_id : int, optional + If provided, only match directories with this exact SLURM job ID. + before_job_id : int, optional + If provided, only delete logs with job ID less than this value. + dry_run : bool + If True, return matching files without deleting them. + + Returns + ------- + list[Path] + List of deleted (or matched if dry_run) log file paths. + """ + log_root = Path(log_dir) if log_dir else Path.home() / ".vec-inf-logs" + matched = find_matching_dirs( + log_dir=log_root, + model_family=model_family, + model_name=model_name, + job_id=job_id, + before_job_id=before_job_id, + ) + + if dry_run: + return matched + + for path in matched: + shutil.rmtree(path) + + return matched