Skip to content

added cleanup command functionality to cli. #106

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Open
wants to merge 4 commits into
base: main
Choose a base branch
from
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
1 change: 1 addition & 0 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -102,6 +102,7 @@ Note that there are other parameters that can also be added to the config but no
* `metrics`: Streams performance metrics to the console.
* `shutdown`: Shutdown a model by providing its Slurm job ID.
* `list`: List all available model names, or view the default/cached configuration of a specific model, `--json-mode` supported.
* `cleanup`: Remove old log directories. You can filter by `--model-family`, `--model-name`, `--job-id`, and/or `--before-job-id`. Use `--dry-run` to preview what would be deleted.

For more details on the usage of these commands, refer to the [User Guide](https://vectorinstitute.github.io/vector-inference/user_guide/)

Expand Down
74 changes: 74 additions & 0 deletions tests/vec_inf/cli/test_cli.py
Original file line number Diff line number Diff line change
Expand Up @@ -531,3 +531,77 @@ def test_metrics_command_request_failed(
in result.output
)
assert "Connection refused" in result.output


def test_cli_cleanup_logs_dry_run(runner, tmp_path):
"""Test CLI cleanup command in dry-run mode."""
model_dir = tmp_path / "fam_a" / "model_a.123"
model_dir.mkdir(parents=True)

result = runner.invoke(
cli,
[
"cleanup",
"--log-dir",
str(tmp_path),
"--model-family",
"fam_a",
"--model-name",
"model_a",
"--dry-run",
],
)

assert result.exit_code == 0
assert "would be deleted" in result.output
assert "model_a.123" in result.output


def test_cli_cleanup_logs_delete(tmp_path):
"""Test cleanup_logs CLI deletes matching directories when not in dry-run mode."""
fam_dir = tmp_path / "fam_a"
fam_dir.mkdir()
(fam_dir / "model_a.1").mkdir()

runner = CliRunner()
result = runner.invoke(
cli,
[
"cleanup",
"--log-dir",
str(tmp_path),
"--model-family",
"fam_a",
"--model-name",
"model_a",
"--job-id",
"1",
],
)

assert result.exit_code == 0
assert "Deleted 1 log directory" in result.output
assert not (fam_dir / "model_a.1").exists()


def test_cli_cleanup_logs_no_match(tmp_path):
"""Test cleanup_logs CLI when no directories match the filters."""
fam_dir = tmp_path / "fam_a"
fam_dir.mkdir()
(fam_dir / "model_a.1").mkdir()

runner = CliRunner()
result = runner.invoke(
cli,
[
"cleanup",
"--log-dir",
str(tmp_path),
"--model-family",
"fam_b",
],
)

assert result.exit_code == 0
assert "No matching log directories were deleted." in result.output
assert (fam_dir / "model_a.1").exists()
71 changes: 71 additions & 0 deletions tests/vec_inf/client/test_api.py
Original file line number Diff line number Diff line change
Expand Up @@ -128,3 +128,74 @@ def test_wait_until_ready():
assert result.server_status == ModelStatus.READY
assert result.base_url == "http://gpu123:8080/v1"
assert mock_status.call_count == 2


def test_cleanup_logs_no_match(tmp_path):
"""Test when cleanup_logs returns empty list."""
fam_a = tmp_path / "fam_a"
model_a = fam_a / "model_a.999"
model_a.mkdir(parents=True)

client = VecInfClient()
deleted = client.cleanup_logs(
log_dir=tmp_path,
model_family="fam_b",
dry_run=False,
)

assert deleted == []
assert fam_a.exists()
assert model_a.exists()


def test_cleanup_logs_deletes_matching_dirs(tmp_path):
"""Test that cleanup_logs deletes model directories matching filters."""
fam_a = tmp_path / "fam_a"
fam_a.mkdir()

model_a_1 = fam_a / "model_a.10"
model_a_2 = fam_a / "model_a.20"
model_b = fam_a / "model_b.30"

model_a_1.mkdir()
model_a_2.mkdir()
model_b.mkdir()

client = VecInfClient()
deleted = client.cleanup_logs(
log_dir=tmp_path,
model_family="fam_a",
model_name="model_a",
before_job_id=15,
dry_run=False,
)

assert deleted == [model_a_1]
assert not model_a_1.exists()
assert model_a_2.exists()
assert model_b.exists()


def test_cleanup_logs_matching_dirs_dry_run(tmp_path):
"""Test that cleanup_logs find model directories matching filters."""
fam_a = tmp_path / "fam_a"
fam_a.mkdir()

model_a_1 = fam_a / "model_a.10"
model_a_2 = fam_a / "model_a.20"

model_a_1.mkdir()
model_a_2.mkdir()

client = VecInfClient()
deleted = client.cleanup_logs(
log_dir=tmp_path,
model_family="fam_a",
model_name="model_a",
before_job_id=15,
dry_run=True,
)

assert deleted == [model_a_1]
assert model_a_1.exists()
assert model_a_2.exists()
133 changes: 133 additions & 0 deletions tests/vec_inf/client/test_utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -8,6 +8,7 @@

from vec_inf.client._utils import (
MODEL_READY_SIGNATURE,
find_matching_dirs,
get_base_url,
is_server_running,
load_config,
Expand Down Expand Up @@ -208,3 +209,135 @@ def test_load_config_invalid_user_model(tmp_path):
assert "validation error" in str(excinfo.value).lower()
assert "model_type" in str(excinfo.value)
assert "num_gpus" in str(excinfo.value)


def test_find_matching_dirs_only_model_family(tmp_path):
"""Return model_family directory when only model_family is provided."""
fam_dir = tmp_path / "fam_a"
fam_dir.mkdir()
(fam_dir / "model_a.1").mkdir()
(fam_dir / "model_b.2").mkdir()

other_dir = tmp_path / "fam_b"
other_dir.mkdir()
(other_dir / "model_c.3").mkdir()

matches = find_matching_dirs(log_dir=tmp_path, model_family="fam_a")
assert len(matches) == 1
assert matches[0].name == "fam_a"


def test_find_matching_dirs_only_model_name(tmp_path):
"""Return directories matching when only model_name is provided."""
fam_a = tmp_path / "fam_a"
fam_a.mkdir()
(fam_a / "target.1").mkdir()
(fam_a / "other.2").mkdir()

fam_b = tmp_path / "fam_b"
fam_b.mkdir()
(fam_b / "different.3").mkdir()

matches = find_matching_dirs(log_dir=tmp_path, model_name="target")
result_names = [p.name for p in matches]

assert "target.1" in result_names
assert "other.2" not in result_names
assert "different.3" not in result_names


def test_find_matching_dirs_only_job_id(tmp_path):
"""Return directories matching exact job_id."""
fam_dir = tmp_path / "fam"
fam_dir.mkdir()
(fam_dir / "model_a.10").mkdir()
(fam_dir / "model_b.20").mkdir()
(fam_dir / "model_c.30").mkdir()

matches = find_matching_dirs(log_dir=tmp_path, job_id=10)
result_names = [p.name for p in matches]

assert "model_a.10" in result_names
assert "model_b.20" not in result_names
assert "model_c.30" not in result_names


def test_find_matching_dirs_only_before_job_id(tmp_path):
"""Return directories with job_id < before_job_id."""
fam_dir = tmp_path / "fam_a"
fam_dir.mkdir()
(fam_dir / "model_a.1").mkdir()
(fam_dir / "model_a.5").mkdir()
(fam_dir / "model_a.100").mkdir()

fam_dir = tmp_path / "fam_b"
fam_dir.mkdir()
(fam_dir / "model_b.30").mkdir()

matches = find_matching_dirs(log_dir=tmp_path, before_job_id=50)
result_names = [p.name for p in matches]

assert "model_a.1" in result_names
assert "model_a.5" in result_names
assert "model_a.100" not in result_names
assert "model_b.30" in result_names


def test_find_matching_dirs_family_and_before_job_id(tmp_path):
"""Return directories under a given family with job IDs less than before_job_id."""
fam_dir = tmp_path / "targetfam"
fam_dir.mkdir()
(fam_dir / "model_a.10").mkdir()
(fam_dir / "model_a.20").mkdir()
(fam_dir / "model_a.99").mkdir()
(fam_dir / "model_a.150").mkdir()

other_fam = tmp_path / "otherfam"
other_fam.mkdir()
(other_fam / "model_b.5").mkdir()
(other_fam / "model_b.10").mkdir()
(other_fam / "model_b.100").mkdir()

matches = find_matching_dirs(
log_dir=tmp_path,
model_family="targetfam",
before_job_id=100,
)

result_names = [p.name for p in matches]

assert "model_a.10" in result_names
assert "model_a.20" in result_names
assert "model_a.99" in result_names
assert "model_a.150" not in result_names
assert all("otherfam" not in str(p) for p in matches)


def test_find_matching_dirs_with_family_model_name_and_before_job_id(tmp_path):
"""Return matching dirs with model_family, model_name, and before_job_id filters."""
fam_dir = tmp_path / "targetfam"
fam_dir.mkdir()
(fam_dir / "model_a.1").mkdir()
(fam_dir / "model_a.50").mkdir()
(fam_dir / "model_a.150").mkdir()
(fam_dir / "model_b.40").mkdir()

other_fam = tmp_path / "otherfam"
other_fam.mkdir()
(other_fam / "model_c.20").mkdir()

matches = find_matching_dirs(
log_dir=tmp_path,
model_family="targetfam",
model_name="model_a",
before_job_id=100,
)

result_names = [p.name for p in matches]

assert "model_a.1" in result_names
assert "model_a.50" in result_names
assert "model_a.150" not in result_names
assert "model_b.40" not in result_names
assert all("model_b" not in p for p in result_names)
assert all("otherfam" not in str(p) for p in matches)
1 change: 1 addition & 0 deletions vec_inf/README.md
Original file line number Diff line number Diff line change
Expand Up @@ -5,5 +5,6 @@
* `metrics`: Streams performance metrics to the console.
* `shutdown`: Shutdown a model by providing its Slurm job ID.
* `list`: List all available model names, or view the default/cached configuration of a specific model, `--json-mode` supported.
* `cleanup`: Remove old log directories. You can filter by `--model-family`, `--model-name`, `--job-id`, and/or `--before-job-id`. Use `--dry-run` to preview what would be deleted.

Use `--help` to see all available options
64 changes: 64 additions & 0 deletions vec_inf/cli/_cli.py
Original file line number Diff line number Diff line change
Expand Up @@ -336,5 +336,69 @@ def metrics(slurm_job_id: int, log_dir: Optional[str] = None) -> None:
raise click.ClickException(f"Metrics check failed: {str(e)}") from e


@cli.command("cleanup")
@click.option("--log-dir", type=str, help="Path to SLURM log directory")
@click.option("--model-family", type=str, help="Filter by model family")
@click.option("--model-name", type=str, help="Filter by model name")
@click.option(
"--job-id", type=int, help="Only remove logs with this exact SLURM job ID"
)
@click.option(
"--before-job-id",
type=int,
help="Remove logs with job ID less than this value",
)
@click.option("--dry-run", is_flag=True, help="List matching logs without deleting")
def cleanup_logs_cli(
log_dir: Optional[str],
model_family: Optional[str],
model_name: Optional[str],
job_id: Optional[int],
before_job_id: Optional[int],
dry_run: bool,
) -> None:
"""Clean up log files based on optional filters.

Parameters
----------
log_dir : str or Path, optional
Root directory containing log files. Defaults to ~/.vec-inf-logs.
model_family : str, optional
Only delete logs for this model family.
model_name : str, optional
Only delete logs for this model name.
job_id : int, optional
If provided, only match directories with this exact SLURM job ID.
before_job_id : int, optional
If provided, only delete logs with job ID less than this value.
dry_run : bool
If True, return matching files without deleting them.
"""
try:
client = VecInfClient()
matched = client.cleanup_logs(
log_dir=log_dir,
model_family=model_family,
model_name=model_name,
job_id=job_id,
before_job_id=before_job_id,
dry_run=dry_run,
)

if not matched:
if dry_run:
click.echo("Dry run: no matching log directories found.")
else:
click.echo("No matching log directories were deleted.")
elif dry_run:
click.echo(f"Dry run: {len(matched)} directories would be deleted:")
for f in matched:
click.echo(f" - {f}")
else:
click.echo(f"Deleted {len(matched)} log directory(ies).")
except Exception as e:
raise click.ClickException(f"Cleanup failed: {str(e)}") from e


if __name__ == "__main__":
cli()
Loading
Loading