From 4cc3700b52ffa76ba4c77c2ed70f5a064befe3f9 Mon Sep 17 00:00:00 2001 From: Bartosz Burda Date: Sun, 26 Apr 2026 17:47:09 +0200 Subject: [PATCH 01/52] feat(demos/ota): scaffold pack_artifact CLI --- demos/ota_nav2_sensor_fix/scripts/.gitignore | 4 ++++ demos/ota_nav2_sensor_fix/scripts/conftest.py | 7 +++++++ .../scripts/pack_artifact.py | 21 +++++++++++++++++++ .../scripts/test_pack_artifact.py | 14 +++++++++++++ 4 files changed, 46 insertions(+) create mode 100644 demos/ota_nav2_sensor_fix/scripts/.gitignore create mode 100644 demos/ota_nav2_sensor_fix/scripts/conftest.py create mode 100644 demos/ota_nav2_sensor_fix/scripts/pack_artifact.py create mode 100644 demos/ota_nav2_sensor_fix/scripts/test_pack_artifact.py diff --git a/demos/ota_nav2_sensor_fix/scripts/.gitignore b/demos/ota_nav2_sensor_fix/scripts/.gitignore new file mode 100644 index 0000000..17cd2fc --- /dev/null +++ b/demos/ota_nav2_sensor_fix/scripts/.gitignore @@ -0,0 +1,4 @@ +.venv/ +__pycache__/ +*.pyc +.pytest_cache/ diff --git a/demos/ota_nav2_sensor_fix/scripts/conftest.py b/demos/ota_nav2_sensor_fix/scripts/conftest.py new file mode 100644 index 0000000..85d7c2d --- /dev/null +++ b/demos/ota_nav2_sensor_fix/scripts/conftest.py @@ -0,0 +1,7 @@ +"""Pytest fixtures for pack_artifact tests.""" +from __future__ import annotations + +import sys +from pathlib import Path + +sys.path.insert(0, str(Path(__file__).parent)) diff --git a/demos/ota_nav2_sensor_fix/scripts/pack_artifact.py b/demos/ota_nav2_sensor_fix/scripts/pack_artifact.py new file mode 100644 index 0000000..700153e --- /dev/null +++ b/demos/ota_nav2_sensor_fix/scripts/pack_artifact.py @@ -0,0 +1,21 @@ +#!/usr/bin/env python3 +"""Pack a ROS 2 package into an OTA artifact + SOVD-shaped catalog entry.""" +from __future__ import annotations + +import argparse +import json +import subprocess +import sys +import tarfile +from pathlib import Path +from typing import Literal + +Kind = Literal["update", "install", "uninstall"] + + +def 
main(argv: list[str] | None = None) -> int: + raise NotImplementedError + + +if __name__ == "__main__": + sys.exit(main()) diff --git a/demos/ota_nav2_sensor_fix/scripts/test_pack_artifact.py b/demos/ota_nav2_sensor_fix/scripts/test_pack_artifact.py new file mode 100644 index 0000000..0ea65a2 --- /dev/null +++ b/demos/ota_nav2_sensor_fix/scripts/test_pack_artifact.py @@ -0,0 +1,14 @@ +"""Tests for pack_artifact.py.""" +from __future__ import annotations + +import json +import tarfile +from pathlib import Path + +import pytest + +import pack_artifact + + +def test_imports(): + assert hasattr(pack_artifact, "main") From 0a3ddafb8ad7324a482e808573f15ddad2bc9cca Mon Sep 17 00:00:00 2001 From: Bartosz Burda Date: Sun, 26 Apr 2026 17:47:41 +0200 Subject: [PATCH 02/52] feat(demos/ota): pack_artifact argparse + dispatcher signature --- .../scripts/pack_artifact.py | 64 ++++++++++++++++++- .../scripts/test_pack_artifact.py | 36 +++++++++++ 2 files changed, 99 insertions(+), 1 deletion(-) diff --git a/demos/ota_nav2_sensor_fix/scripts/pack_artifact.py b/demos/ota_nav2_sensor_fix/scripts/pack_artifact.py index 700153e..ae5e591 100644 --- a/demos/ota_nav2_sensor_fix/scripts/pack_artifact.py +++ b/demos/ota_nav2_sensor_fix/scripts/pack_artifact.py @@ -13,9 +13,71 @@ Kind = Literal["update", "install", "uninstall"] -def main(argv: list[str] | None = None) -> int: +def build_parser() -> argparse.ArgumentParser: + parser = argparse.ArgumentParser( + description="Pack a ROS 2 package into an OTA artifact + SOVD catalog entry.", + ) + parser.add_argument("--package", required=True, help="ROS 2 package name to pack.") + parser.add_argument( + "--version", + default="", + help="Semantic version of the artifact (omit for uninstall).", + ) + parser.add_argument( + "--kind", + required=True, + choices=["update", "install", "uninstall"], + help="Catalog entry kind.", + ) + parser.add_argument( + "--target-component", + required=True, + help="SOVD component the entry targets.", + ) + 
parser.add_argument( + "--executable", + default="", + help="Executable name inside install//lib (required for install).", + ) + parser.add_argument("--notes", default="", help="Free-text notes for the catalog entry.") + parser.add_argument( + "--duration", + type=int, + default=10, + help="Estimated install duration in seconds.", + ) + parser.add_argument( + "--out-dir", + default="artifacts", + help="Output directory for tarballs.", + ) + parser.add_argument( + "--catalog", + default="artifacts/catalog.json", + help="Path to the SOVD catalog JSON file.", + ) + parser.add_argument( + "--skip-build", + action="store_true", + help="Skip running colcon build; reuse existing install/ tree.", + ) + parser.add_argument( + "--workspace", + default=".", + help="Path to the colcon workspace root.", + ) + return parser + + +def run(**kwargs) -> int: raise NotImplementedError +def main(argv: list[str] | None = None) -> int: + parser = build_parser() + args = parser.parse_args(argv) + return run(**vars(args)) + + if __name__ == "__main__": sys.exit(main()) diff --git a/demos/ota_nav2_sensor_fix/scripts/test_pack_artifact.py b/demos/ota_nav2_sensor_fix/scripts/test_pack_artifact.py index 0ea65a2..6ba155b 100644 --- a/demos/ota_nav2_sensor_fix/scripts/test_pack_artifact.py +++ b/demos/ota_nav2_sensor_fix/scripts/test_pack_artifact.py @@ -12,3 +12,39 @@ def test_imports(): assert hasattr(pack_artifact, "main") + + +def test_main_requires_package(): + with pytest.raises(SystemExit): + pack_artifact.main([]) + + +def test_main_parses_basic_args(monkeypatch, tmp_path): + captured = {} + + def fake_run(**kwargs): + captured.update(kwargs) + return 0 + + monkeypatch.setattr(pack_artifact, "run", fake_run) + rc = pack_artifact.main( + [ + "--package", "fixed_lidar", + "--version", "2.1.0", + "--kind", "update", + "--target-component", "scan_sensor_node", + "--executable", "fixed_lidar_node", + "--notes", "noise filter fix", + "--out-dir", str(tmp_path / "artifacts"), + "--catalog", 
str(tmp_path / "artifacts" / "catalog.json"), + "--skip-build", + ] + ) + assert rc == 0 + assert captured["package"] == "fixed_lidar" + assert captured["version"] == "2.1.0" + assert captured["kind"] == "update" + assert captured["target_component"] == "scan_sensor_node" + assert captured["executable"] == "fixed_lidar_node" + assert captured["notes"] == "noise filter fix" + assert captured["skip_build"] is True From 3a5b41e809a46f802d2312092029e304b77d5b8b Mon Sep 17 00:00:00 2001 From: Bartosz Burda Date: Sun, 26 Apr 2026 17:48:06 +0200 Subject: [PATCH 03/52] feat(demos/ota): build SOVD-shaped catalog entry --- .../scripts/pack_artifact.py | 46 ++++++++++++++ .../scripts/test_pack_artifact.py | 61 +++++++++++++++++++ 2 files changed, 107 insertions(+) diff --git a/demos/ota_nav2_sensor_fix/scripts/pack_artifact.py b/demos/ota_nav2_sensor_fix/scripts/pack_artifact.py index ae5e591..c8dfc0e 100644 --- a/demos/ota_nav2_sensor_fix/scripts/pack_artifact.py +++ b/demos/ota_nav2_sensor_fix/scripts/pack_artifact.py @@ -69,6 +69,52 @@ def build_parser() -> argparse.ArgumentParser: return parser +def slug(package: str, version: str) -> str: + return f"{package}_{version.replace('.', '_')}" if version else package + + +def build_entry( + *, + package: str, + version: str, + kind: Kind, + target_component: str, + executable: str, + notes: str, + duration: int, + size_bytes: int, +) -> dict: + entry: dict = { + "id": slug(package, version) if kind != "uninstall" else f"{package}_remove", + "name": f"{package} {version}".strip(), + "automated": False, + "origins": ["remote"], + "notes": notes, + "duration": duration, + } + if version: + entry["version"] = version + if size_bytes > 0: + entry["size"] = max(1, size_bytes // 1024) + + if kind == "update": + entry["updated_components"] = [target_component] + elif kind == "install": + entry["added_components"] = [target_component] + else: # uninstall + entry["removed_components"] = [target_component] + + if kind != "uninstall": + 
entry["x_medkit_artifact_url"] = f"/artifacts/{package}-{version}.tar.gz" + entry["x_medkit_target_package"] = package + if executable: + entry["x_medkit_executable"] = executable + else: + entry["x_medkit_target_package"] = package + + return entry + + def run(**kwargs) -> int: raise NotImplementedError diff --git a/demos/ota_nav2_sensor_fix/scripts/test_pack_artifact.py b/demos/ota_nav2_sensor_fix/scripts/test_pack_artifact.py index 6ba155b..20208a8 100644 --- a/demos/ota_nav2_sensor_fix/scripts/test_pack_artifact.py +++ b/demos/ota_nav2_sensor_fix/scripts/test_pack_artifact.py @@ -48,3 +48,64 @@ def fake_run(**kwargs): assert captured["executable"] == "fixed_lidar_node" assert captured["notes"] == "noise filter fix" assert captured["skip_build"] is True + + +def test_build_entry_update_kind(): + entry = pack_artifact.build_entry( + package="fixed_lidar", + version="2.1.0", + kind="update", + target_component="scan_sensor_node", + executable="fixed_lidar_node", + notes="fix noise", + duration=10, + size_bytes=2048, + ) + assert entry["id"] == "fixed_lidar_2_1_0" + assert entry["name"] == "fixed_lidar 2.1.0" + assert entry["version"] == "2.1.0" + assert entry["automated"] is False + assert entry["origins"] == ["remote"] + assert entry["notes"] == "fix noise" + assert entry["size"] == 2 # KB rounded + assert entry["duration"] == 10 + assert entry["updated_components"] == ["scan_sensor_node"] + assert "added_components" not in entry + assert "removed_components" not in entry + assert entry["x_medkit_target_package"] == "fixed_lidar" + assert entry["x_medkit_executable"] == "fixed_lidar_node" + assert entry["x_medkit_artifact_url"] == "/artifacts/fixed_lidar-2.1.0.tar.gz" + + +def test_build_entry_install_kind(): + entry = pack_artifact.build_entry( + package="obstacle_classifier_v2", + version="1.0.0", + kind="install", + target_component="obstacle_classifier", + executable="obstacle_classifier_node", + notes="extra safety", + duration=15, + size_bytes=4096, + ) + 
assert entry["added_components"] == ["obstacle_classifier"] + assert "updated_components" not in entry + assert "removed_components" not in entry + + +def test_build_entry_uninstall_kind(): + entry = pack_artifact.build_entry( + package="broken_lidar_legacy", + version="", + kind="uninstall", + target_component="broken_lidar_legacy", + executable="", + notes="cleanup", + duration=5, + size_bytes=0, + ) + assert entry["removed_components"] == ["broken_lidar_legacy"] + assert "added_components" not in entry + assert "updated_components" not in entry + assert "x_medkit_artifact_url" not in entry + assert "x_medkit_executable" not in entry From 3d22b162bee28e1ae7c776df9bfb33af481f3bea Mon Sep 17 00:00:00 2001 From: Bartosz Burda Date: Sun, 26 Apr 2026 17:48:28 +0200 Subject: [PATCH 04/52] feat(demos/ota): merge_catalog with id-based replace --- .../scripts/pack_artifact.py | 12 +++++++++ .../scripts/test_pack_artifact.py | 26 +++++++++++++++++++ 2 files changed, 38 insertions(+) diff --git a/demos/ota_nav2_sensor_fix/scripts/pack_artifact.py b/demos/ota_nav2_sensor_fix/scripts/pack_artifact.py index c8dfc0e..d26bf74 100644 --- a/demos/ota_nav2_sensor_fix/scripts/pack_artifact.py +++ b/demos/ota_nav2_sensor_fix/scripts/pack_artifact.py @@ -115,6 +115,18 @@ def build_entry( return entry +def merge_catalog(catalog_path: Path, entry: dict) -> None: + catalog_path = Path(catalog_path) + catalog_path.parent.mkdir(parents=True, exist_ok=True) + if catalog_path.exists(): + data = json.loads(catalog_path.read_text()) + else: + data = [] + data = [e for e in data if e.get("id") != entry["id"]] + data.append(entry) + catalog_path.write_text(json.dumps(data, indent=2) + "\n") + + def run(**kwargs) -> int: raise NotImplementedError diff --git a/demos/ota_nav2_sensor_fix/scripts/test_pack_artifact.py b/demos/ota_nav2_sensor_fix/scripts/test_pack_artifact.py index 20208a8..f2d9e5d 100644 --- a/demos/ota_nav2_sensor_fix/scripts/test_pack_artifact.py +++ 
b/demos/ota_nav2_sensor_fix/scripts/test_pack_artifact.py @@ -109,3 +109,29 @@ def test_build_entry_uninstall_kind(): assert "updated_components" not in entry assert "x_medkit_artifact_url" not in entry assert "x_medkit_executable" not in entry + + +def test_merge_catalog_creates_file(tmp_path): + catalog = tmp_path / "catalog.json" + entry = {"id": "a", "name": "a"} + pack_artifact.merge_catalog(catalog, entry) + data = json.loads(catalog.read_text()) + assert data == [entry] + + +def test_merge_catalog_appends(tmp_path): + catalog = tmp_path / "catalog.json" + catalog.write_text(json.dumps([{"id": "a", "name": "a"}])) + entry = {"id": "b", "name": "b"} + pack_artifact.merge_catalog(catalog, entry) + data = json.loads(catalog.read_text()) + assert [e["id"] for e in data] == ["a", "b"] + + +def test_merge_catalog_replaces_same_id(tmp_path): + catalog = tmp_path / "catalog.json" + catalog.write_text(json.dumps([{"id": "a", "name": "old"}])) + entry = {"id": "a", "name": "new"} + pack_artifact.merge_catalog(catalog, entry) + data = json.loads(catalog.read_text()) + assert data == [entry] From 876b960c85a538299c895532767bd47a8697b5b0 Mon Sep 17 00:00:00 2001 From: Bartosz Burda Date: Sun, 26 Apr 2026 17:49:03 +0200 Subject: [PATCH 05/52] feat(demos/ota): tarball creation from install dir --- .../scripts/pack_artifact.py | 18 ++++++++++++++++++ .../scripts/test_pack_artifact.py | 18 ++++++++++++++++++ 2 files changed, 36 insertions(+) diff --git a/demos/ota_nav2_sensor_fix/scripts/pack_artifact.py b/demos/ota_nav2_sensor_fix/scripts/pack_artifact.py index d26bf74..719b589 100644 --- a/demos/ota_nav2_sensor_fix/scripts/pack_artifact.py +++ b/demos/ota_nav2_sensor_fix/scripts/pack_artifact.py @@ -127,6 +127,24 @@ def merge_catalog(catalog_path: Path, entry: dict) -> None: catalog_path.write_text(json.dumps(data, indent=2) + "\n") +def create_tarball( + *, + package: str, + version: str, + install_dir: Path, + out_dir: Path, +) -> Path: + install_dir = Path(install_dir) + 
if not install_dir.exists(): + raise FileNotFoundError(f"install dir does not exist: {install_dir}") + out_dir = Path(out_dir) + out_dir.mkdir(parents=True, exist_ok=True) + out_path = out_dir / f"{package}-{version}.tar.gz" + with tarfile.open(out_path, "w:gz") as tf: + tf.add(install_dir, arcname=package) + return out_path + + def run(**kwargs) -> int: raise NotImplementedError diff --git a/demos/ota_nav2_sensor_fix/scripts/test_pack_artifact.py b/demos/ota_nav2_sensor_fix/scripts/test_pack_artifact.py index f2d9e5d..e0aa942 100644 --- a/demos/ota_nav2_sensor_fix/scripts/test_pack_artifact.py +++ b/demos/ota_nav2_sensor_fix/scripts/test_pack_artifact.py @@ -135,3 +135,21 @@ def test_merge_catalog_replaces_same_id(tmp_path): pack_artifact.merge_catalog(catalog, entry) data = json.loads(catalog.read_text()) assert data == [entry] + + +def test_create_tarball(tmp_path): + install = tmp_path / "install" / "fixed_lidar" + (install / "lib").mkdir(parents=True) + (install / "lib" / "fixed_lidar_node").write_text("binary") + out_dir = tmp_path / "artifacts" + out_path = pack_artifact.create_tarball( + package="fixed_lidar", + version="2.1.0", + install_dir=install, + out_dir=out_dir, + ) + assert out_path == out_dir / "fixed_lidar-2.1.0.tar.gz" + assert out_path.exists() + with tarfile.open(out_path) as tf: + names = tf.getnames() + assert "fixed_lidar/lib/fixed_lidar_node" in names From f86c7658cd7861f091b40c05b1685486330631dc Mon Sep 17 00:00:00 2001 From: Bartosz Burda Date: Sun, 26 Apr 2026 17:49:37 +0200 Subject: [PATCH 06/52] feat(demos/ota): pack_artifact end-to-end run() with kind dispatch --- .../scripts/pack_artifact.py | 60 +++++++++++++++- .../scripts/test_pack_artifact.py | 72 +++++++++++++++++++ 2 files changed, 130 insertions(+), 2 deletions(-) diff --git a/demos/ota_nav2_sensor_fix/scripts/pack_artifact.py b/demos/ota_nav2_sensor_fix/scripts/pack_artifact.py index 719b589..c8d663c 100644 --- a/demos/ota_nav2_sensor_fix/scripts/pack_artifact.py +++ 
b/demos/ota_nav2_sensor_fix/scripts/pack_artifact.py @@ -145,8 +145,64 @@ def create_tarball( return out_path -def run(**kwargs) -> int: - raise NotImplementedError +def colcon_build(workspace: Path, package: str) -> None: + cmd = ["colcon", "build", "--packages-select", package, "--symlink-install"] + completed = subprocess.run(cmd, cwd=workspace, check=False) + if completed.returncode != 0: + raise SystemExit(f"colcon build failed for {package}") + + +def run( + *, + package: str, + version: str, + kind: Kind, + target_component: str, + executable: str, + notes: str, + duration: int, + out_dir: str, + catalog: str, + skip_build: bool, + workspace: str, +) -> int: + if kind == "install" and not executable: + sys.stderr.write("--executable is required for install\n") + raise SystemExit(2) + + out_dir_p = Path(out_dir) + catalog_p = Path(catalog) + workspace_p = Path(workspace) + + size_bytes = 0 + if kind != "uninstall": + if not skip_build: + colcon_build(workspace_p, package) + install_dir = workspace_p / "install" / package + if not install_dir.exists(): + sys.stderr.write(f"install dir missing: {install_dir}\n") + raise SystemExit(3) + tarball = create_tarball( + package=package, + version=version, + install_dir=install_dir, + out_dir=out_dir_p, + ) + size_bytes = tarball.stat().st_size + + entry = build_entry( + package=package, + version=version, + kind=kind, + target_component=target_component, + executable=executable, + notes=notes, + duration=duration, + size_bytes=size_bytes, + ) + merge_catalog(catalog_p, entry) + print(f"packed {entry['id']}") + return 0 def main(argv: list[str] | None = None) -> int: diff --git a/demos/ota_nav2_sensor_fix/scripts/test_pack_artifact.py b/demos/ota_nav2_sensor_fix/scripts/test_pack_artifact.py index e0aa942..871f5cd 100644 --- a/demos/ota_nav2_sensor_fix/scripts/test_pack_artifact.py +++ b/demos/ota_nav2_sensor_fix/scripts/test_pack_artifact.py @@ -153,3 +153,75 @@ def test_create_tarball(tmp_path): with 
tarfile.open(out_path) as tf: names = tf.getnames() assert "fixed_lidar/lib/fixed_lidar_node" in names + + +def test_run_update_kind_e2e(tmp_path, monkeypatch): + workspace = tmp_path / "ws" + install = workspace / "install" / "fixed_lidar" / "lib" + install.mkdir(parents=True) + (install / "fixed_lidar_node").write_text("bin") + out_dir = tmp_path / "artifacts" + catalog = out_dir / "catalog.json" + + rc = pack_artifact.run( + package="fixed_lidar", + version="2.1.0", + kind="update", + target_component="scan_sensor_node", + executable="fixed_lidar_node", + notes="fix", + duration=10, + out_dir=str(out_dir), + catalog=str(catalog), + skip_build=True, + workspace=str(workspace), + ) + + assert rc == 0 + assert (out_dir / "fixed_lidar-2.1.0.tar.gz").exists() + data = json.loads(catalog.read_text()) + assert data[0]["id"] == "fixed_lidar_2_1_0" + assert data[0]["updated_components"] == ["scan_sensor_node"] + + +def test_run_uninstall_skips_tarball(tmp_path): + workspace = tmp_path / "ws" + workspace.mkdir() + out_dir = tmp_path / "artifacts" + catalog = out_dir / "catalog.json" + + rc = pack_artifact.run( + package="broken_lidar_legacy", + version="", + kind="uninstall", + target_component="broken_lidar_legacy", + executable="", + notes="cleanup", + duration=5, + out_dir=str(out_dir), + catalog=str(catalog), + skip_build=True, + workspace=str(workspace), + ) + + assert rc == 0 + assert not list(out_dir.glob("*.tar.gz")) + data = json.loads(catalog.read_text()) + assert data[0]["removed_components"] == ["broken_lidar_legacy"] + + +def test_run_install_requires_executable(tmp_path): + with pytest.raises(SystemExit): + pack_artifact.run( + package="obstacle_classifier_v2", + version="1.0.0", + kind="install", + target_component="obstacle_classifier", + executable="", + notes="", + duration=10, + out_dir=str(tmp_path / "out"), + catalog=str(tmp_path / "out" / "catalog.json"), + skip_build=True, + workspace=str(tmp_path / "ws"), + ) From 
161b707935ba326a415db0a869aa7a6cbf411d7e Mon Sep 17 00:00:00 2001 From: Bartosz Burda Date: Sun, 26 Apr 2026 17:53:58 +0200 Subject: [PATCH 07/52] fix(demos/ota): version default to 0.0.0 + cleanup unused symbols + pyright config --- demos/ota_nav2_sensor_fix/scripts/pack_artifact.py | 4 ++-- demos/ota_nav2_sensor_fix/scripts/pyrightconfig.json | 5 +++++ demos/ota_nav2_sensor_fix/scripts/test_pack_artifact.py | 3 +-- 3 files changed, 8 insertions(+), 4 deletions(-) create mode 100644 demos/ota_nav2_sensor_fix/scripts/pyrightconfig.json diff --git a/demos/ota_nav2_sensor_fix/scripts/pack_artifact.py b/demos/ota_nav2_sensor_fix/scripts/pack_artifact.py index c8d663c..4413524 100644 --- a/demos/ota_nav2_sensor_fix/scripts/pack_artifact.py +++ b/demos/ota_nav2_sensor_fix/scripts/pack_artifact.py @@ -20,8 +20,8 @@ def build_parser() -> argparse.ArgumentParser: parser.add_argument("--package", required=True, help="ROS 2 package name to pack.") parser.add_argument( "--version", - default="", - help="Semantic version of the artifact (omit for uninstall).", + default="0.0.0", + help="Semantic version of the artifact (pass '' for uninstall).", ) parser.add_argument( "--kind", diff --git a/demos/ota_nav2_sensor_fix/scripts/pyrightconfig.json b/demos/ota_nav2_sensor_fix/scripts/pyrightconfig.json new file mode 100644 index 0000000..5b3e8d0 --- /dev/null +++ b/demos/ota_nav2_sensor_fix/scripts/pyrightconfig.json @@ -0,0 +1,5 @@ +{ + "extraPaths": ["."], + "venvPath": ".", + "venv": ".venv" +} diff --git a/demos/ota_nav2_sensor_fix/scripts/test_pack_artifact.py b/demos/ota_nav2_sensor_fix/scripts/test_pack_artifact.py index 871f5cd..6b3f49f 100644 --- a/demos/ota_nav2_sensor_fix/scripts/test_pack_artifact.py +++ b/demos/ota_nav2_sensor_fix/scripts/test_pack_artifact.py @@ -3,7 +3,6 @@ import json import tarfile -from pathlib import Path import pytest @@ -155,7 +154,7 @@ def test_create_tarball(tmp_path): assert "fixed_lidar/lib/fixed_lidar_node" in names -def 
test_run_update_kind_e2e(tmp_path, monkeypatch): +def test_run_update_kind_e2e(tmp_path): workspace = tmp_path / "ws" install = workspace / "install" / "fixed_lidar" / "lib" install.mkdir(parents=True) From 398842bda46f585c0c6c43104d7a4aa6df1545d1 Mon Sep 17 00:00:00 2001 From: Bartosz Burda Date: Sun, 26 Apr 2026 17:57:24 +0200 Subject: [PATCH 08/52] test(demos/ota): cover colcon_build, install e2e, version-required guard --- .../scripts/pack_artifact.py | 3 + .../scripts/test_pack_artifact.py | 84 ++++++++++++++++++- 2 files changed, 83 insertions(+), 4 deletions(-) diff --git a/demos/ota_nav2_sensor_fix/scripts/pack_artifact.py b/demos/ota_nav2_sensor_fix/scripts/pack_artifact.py index 4413524..06f3df1 100644 --- a/demos/ota_nav2_sensor_fix/scripts/pack_artifact.py +++ b/demos/ota_nav2_sensor_fix/scripts/pack_artifact.py @@ -169,6 +169,9 @@ def run( if kind == "install" and not executable: sys.stderr.write("--executable is required for install\n") raise SystemExit(2) + if kind != "uninstall" and not version: + sys.stderr.write(f"--version is required for kind={kind}\n") + raise SystemExit(2) out_dir_p = Path(out_dir) catalog_p = Path(catalog) diff --git a/demos/ota_nav2_sensor_fix/scripts/test_pack_artifact.py b/demos/ota_nav2_sensor_fix/scripts/test_pack_artifact.py index 6b3f49f..bced302 100644 --- a/demos/ota_nav2_sensor_fix/scripts/test_pack_artifact.py +++ b/demos/ota_nav2_sensor_fix/scripts/test_pack_artifact.py @@ -9,10 +9,6 @@ import pack_artifact -def test_imports(): - assert hasattr(pack_artifact, "main") - - def test_main_requires_package(): with pytest.raises(SystemExit): pack_artifact.main([]) @@ -224,3 +220,83 @@ def test_run_install_requires_executable(tmp_path): skip_build=True, workspace=str(tmp_path / "ws"), ) + + +def test_run_update_requires_version(tmp_path): + with pytest.raises(SystemExit): + pack_artifact.run( + package="fixed_lidar", + version="", + kind="update", + target_component="scan_sensor_node", + executable="fixed_lidar_node", + 
notes="", + duration=10, + out_dir=str(tmp_path / "out"), + catalog=str(tmp_path / "out" / "catalog.json"), + skip_build=True, + workspace=str(tmp_path / "ws"), + ) + + +def test_run_install_kind_e2e(tmp_path): + workspace = tmp_path / "ws" + install = workspace / "install" / "obstacle_classifier_v2" / "lib" + install.mkdir(parents=True) + (install / "obstacle_classifier_node").write_text("bin") + out_dir = tmp_path / "artifacts" + catalog = out_dir / "catalog.json" + + rc = pack_artifact.run( + package="obstacle_classifier_v2", + version="1.0.0", + kind="install", + target_component="obstacle_classifier", + executable="obstacle_classifier_node", + notes="extra safety", + duration=15, + out_dir=str(out_dir), + catalog=str(catalog), + skip_build=True, + workspace=str(workspace), + ) + + assert rc == 0 + assert (out_dir / "obstacle_classifier_v2-1.0.0.tar.gz").exists() + data = json.loads(catalog.read_text()) + assert data[0]["id"] == "obstacle_classifier_v2_1_0_0" + assert data[0]["added_components"] == ["obstacle_classifier"] + assert data[0]["x_medkit_executable"] == "obstacle_classifier_node" + + +def test_colcon_build_invokes_subprocess(tmp_path, monkeypatch): + captured = {} + + class FakeCompleted: + returncode = 0 + + def fake_run(cmd, cwd, check): + captured["cmd"] = cmd + captured["cwd"] = cwd + captured["check"] = check + return FakeCompleted() + + monkeypatch.setattr(pack_artifact.subprocess, "run", fake_run) + pack_artifact.colcon_build(tmp_path, "broken_lidar") + + assert captured["cmd"] == [ + "colcon", "build", "--packages-select", "broken_lidar", "--symlink-install" + ] + assert captured["cwd"] == tmp_path + assert captured["check"] is False + + +def test_colcon_build_raises_on_nonzero(tmp_path, monkeypatch): + class FakeCompleted: + returncode = 1 + + monkeypatch.setattr( + pack_artifact.subprocess, "run", lambda *_args, **_kwargs: FakeCompleted() + ) + with pytest.raises(SystemExit): + pack_artifact.colcon_build(tmp_path, "broken_lidar") From 
216acc75dfac67bc33399d6c39ae5e93688993d3 Mon Sep 17 00:00:00 2001 From: Bartosz Burda Date: Sun, 26 Apr 2026 18:00:23 +0200 Subject: [PATCH 09/52] feat(demos/ota): ota_update_server scaffold --- .../ota_update_server/.gitignore | 3 ++ .../ota_update_server/__init__.py | 3 ++ .../ota_update_server/main.py | 43 +++++++++++++++++++ .../ota_update_server/pyproject.toml | 22 ++++++++++ .../ota_update_server/tests/__init__.py | 0 .../ota_update_server/tests/test_main.py | 20 +++++++++ 6 files changed, 91 insertions(+) create mode 100644 demos/ota_nav2_sensor_fix/ota_update_server/.gitignore create mode 100644 demos/ota_nav2_sensor_fix/ota_update_server/ota_update_server/__init__.py create mode 100644 demos/ota_nav2_sensor_fix/ota_update_server/ota_update_server/main.py create mode 100644 demos/ota_nav2_sensor_fix/ota_update_server/pyproject.toml create mode 100644 demos/ota_nav2_sensor_fix/ota_update_server/tests/__init__.py create mode 100644 demos/ota_nav2_sensor_fix/ota_update_server/tests/test_main.py diff --git a/demos/ota_nav2_sensor_fix/ota_update_server/.gitignore b/demos/ota_nav2_sensor_fix/ota_update_server/.gitignore new file mode 100644 index 0000000..d0ad410 --- /dev/null +++ b/demos/ota_nav2_sensor_fix/ota_update_server/.gitignore @@ -0,0 +1,3 @@ +.venv/ +*.egg-info/ +__pycache__/ diff --git a/demos/ota_nav2_sensor_fix/ota_update_server/ota_update_server/__init__.py b/demos/ota_nav2_sensor_fix/ota_update_server/ota_update_server/__init__.py new file mode 100644 index 0000000..813f14c --- /dev/null +++ b/demos/ota_nav2_sensor_fix/ota_update_server/ota_update_server/__init__.py @@ -0,0 +1,3 @@ +from .main import create_app + +__all__ = ["create_app"] diff --git a/demos/ota_nav2_sensor_fix/ota_update_server/ota_update_server/main.py b/demos/ota_nav2_sensor_fix/ota_update_server/ota_update_server/main.py new file mode 100644 index 0000000..23d1d44 --- /dev/null +++ b/demos/ota_nav2_sensor_fix/ota_update_server/ota_update_server/main.py @@ -0,0 +1,43 @@ 
+"""Minimal FastAPI artifact host for the OTA demo.""" +from __future__ import annotations + +import json +import os +from pathlib import Path + +from fastapi import FastAPI, HTTPException +from fastapi.responses import FileResponse + + +def create_app(artifacts_dir: Path) -> FastAPI: + app = FastAPI(title="OTA Update Server") + artifacts_dir = Path(artifacts_dir) + + @app.get("/catalog") + def catalog() -> list[dict]: + catalog_file = artifacts_dir / "catalog.json" + if not catalog_file.exists(): + return [] + return json.loads(catalog_file.read_text()) + + @app.get("/artifacts/{filename}") + def artifact(filename: str) -> FileResponse: + if "/" in filename or ".." in filename: + raise HTTPException(status_code=400, detail="invalid filename") + path = artifacts_dir / filename + if not path.exists(): + raise HTTPException(status_code=404, detail="not found") + return FileResponse(path, media_type="application/gzip", filename=filename) + + return app + + +def run() -> None: + import uvicorn + artifacts_dir = Path(os.environ.get("OTA_ARTIFACTS_DIR", "/artifacts")) + host = os.environ.get("OTA_HOST", "0.0.0.0") + port = int(os.environ.get("OTA_PORT", "9000")) + uvicorn.run(create_app(artifacts_dir), host=host, port=port) + + +__all__ = ["create_app", "run"] diff --git a/demos/ota_nav2_sensor_fix/ota_update_server/pyproject.toml b/demos/ota_nav2_sensor_fix/ota_update_server/pyproject.toml new file mode 100644 index 0000000..3479fa6 --- /dev/null +++ b/demos/ota_nav2_sensor_fix/ota_update_server/pyproject.toml @@ -0,0 +1,22 @@ +[project] +name = "ota_update_server" +version = "0.1.0" +description = "Minimal FastAPI artifact host for OTA demo" +requires-python = ">=3.11" +dependencies = [ + "fastapi>=0.110", + "uvicorn[standard]>=0.29", +] + +[project.optional-dependencies] +dev = [ + "pytest>=8", + "httpx>=0.27", +] + +[project.scripts] +ota-update-server = "ota_update_server.main:run" + +[build-system] +requires = ["hatchling"] +build-backend = "hatchling.build" diff 
--git a/demos/ota_nav2_sensor_fix/ota_update_server/tests/__init__.py b/demos/ota_nav2_sensor_fix/ota_update_server/tests/__init__.py new file mode 100644 index 0000000..e69de29 diff --git a/demos/ota_nav2_sensor_fix/ota_update_server/tests/test_main.py b/demos/ota_nav2_sensor_fix/ota_update_server/tests/test_main.py new file mode 100644 index 0000000..367f485 --- /dev/null +++ b/demos/ota_nav2_sensor_fix/ota_update_server/tests/test_main.py @@ -0,0 +1,20 @@ +"""Tests for the FastAPI update server.""" +from __future__ import annotations + +import json +from pathlib import Path + +import pytest +from fastapi.testclient import TestClient + +from ota_update_server import create_app + + +@pytest.fixture +def artifacts_dir(tmp_path) -> Path: + return tmp_path + + +@pytest.fixture +def client(artifacts_dir): + return TestClient(create_app(artifacts_dir)) From e42af7e6d101b6316a30d6bbbf40de5e2f1ebab6 Mon Sep 17 00:00:00 2001 From: Bartosz Burda Date: Sun, 26 Apr 2026 18:00:44 +0200 Subject: [PATCH 10/52] test(ota_server): /catalog endpoint coverage --- .../ota_update_server/tests/test_main.py | 17 +++++++++++++++++ 1 file changed, 17 insertions(+) diff --git a/demos/ota_nav2_sensor_fix/ota_update_server/tests/test_main.py b/demos/ota_nav2_sensor_fix/ota_update_server/tests/test_main.py index 367f485..c02066f 100644 --- a/demos/ota_nav2_sensor_fix/ota_update_server/tests/test_main.py +++ b/demos/ota_nav2_sensor_fix/ota_update_server/tests/test_main.py @@ -18,3 +18,20 @@ def artifacts_dir(tmp_path) -> Path: @pytest.fixture def client(artifacts_dir): return TestClient(create_app(artifacts_dir)) + + +def test_catalog_empty_when_missing(client): + resp = client.get("/catalog") + assert resp.status_code == 200 + assert resp.json() == [] + + +def test_catalog_returns_file_contents(client, artifacts_dir): + payload = [ + {"id": "fixed_lidar_2_1_0", "updated_components": ["scan_sensor_node"]}, + {"id": "obstacle_classifier_v2_install", "added_components": ["obstacle_classifier"]}, 
+ ] + (artifacts_dir / "catalog.json").write_text(json.dumps(payload)) + resp = client.get("/catalog") + assert resp.status_code == 200 + assert resp.json() == payload From ab10c2987ff9ef337243d8d1530bcdd71b67d3aa Mon Sep 17 00:00:00 2001 From: Bartosz Burda Date: Sun, 26 Apr 2026 18:01:04 +0200 Subject: [PATCH 11/52] test(ota_server): /artifacts endpoint + path traversal guard --- .../ota_update_server/tests/test_main.py | 18 ++++++++++++++++++ 1 file changed, 18 insertions(+) diff --git a/demos/ota_nav2_sensor_fix/ota_update_server/tests/test_main.py b/demos/ota_nav2_sensor_fix/ota_update_server/tests/test_main.py index c02066f..5414d0e 100644 --- a/demos/ota_nav2_sensor_fix/ota_update_server/tests/test_main.py +++ b/demos/ota_nav2_sensor_fix/ota_update_server/tests/test_main.py @@ -35,3 +35,21 @@ def test_catalog_returns_file_contents(client, artifacts_dir): resp = client.get("/catalog") assert resp.status_code == 200 assert resp.json() == payload + + +def test_artifact_returns_file(client, artifacts_dir): + (artifacts_dir / "fixed_lidar-2.1.0.tar.gz").write_bytes(b"BIN") + resp = client.get("/artifacts/fixed_lidar-2.1.0.tar.gz") + assert resp.status_code == 200 + assert resp.content == b"BIN" + + +def test_artifact_404_when_missing(client): + resp = client.get("/artifacts/missing.tar.gz") + assert resp.status_code == 404 + + +def test_artifact_rejects_path_traversal(client, artifacts_dir): + (artifacts_dir.parent / "secret.txt").write_text("hush") + resp = client.get("/artifacts/..%2Fsecret.txt") + assert resp.status_code in (400, 404) From 9b71e5a86ff0cbdb2148791a094a7e3365bc57c8 Mon Sep 17 00:00:00 2001 From: Bartosz Burda Date: Sun, 26 Apr 2026 18:03:29 +0200 Subject: [PATCH 12/52] feat(demos/ota): ota_update_server Dockerfile --- .../ota_update_server/Dockerfile | 13 +++++++++++++ 1 file changed, 13 insertions(+) create mode 100644 demos/ota_nav2_sensor_fix/ota_update_server/Dockerfile diff --git a/demos/ota_nav2_sensor_fix/ota_update_server/Dockerfile 
b/demos/ota_nav2_sensor_fix/ota_update_server/Dockerfile new file mode 100644 index 0000000..e5a59af --- /dev/null +++ b/demos/ota_nav2_sensor_fix/ota_update_server/Dockerfile @@ -0,0 +1,13 @@ +FROM python:3.11-slim + +WORKDIR /app +COPY pyproject.toml ./ +COPY ota_update_server ./ota_update_server +RUN pip install --no-cache-dir . + +ENV OTA_ARTIFACTS_DIR=/artifacts +ENV OTA_HOST=0.0.0.0 +ENV OTA_PORT=9000 +EXPOSE 9000 + +CMD ["ota-update-server"] From 1f838deba4ad5bc76308e24e80fe8495414c6814 Mon Sep 17 00:00:00 2001 From: Bartosz Burda Date: Sun, 26 Apr 2026 18:04:26 +0200 Subject: [PATCH 13/52] chore(ota_server): pyright config to silence venv import warnings --- .../ota_update_server/pyrightconfig.json | 6 ++++++ 1 file changed, 6 insertions(+) create mode 100644 demos/ota_nav2_sensor_fix/ota_update_server/pyrightconfig.json diff --git a/demos/ota_nav2_sensor_fix/ota_update_server/pyrightconfig.json b/demos/ota_nav2_sensor_fix/ota_update_server/pyrightconfig.json new file mode 100644 index 0000000..49f6c16 --- /dev/null +++ b/demos/ota_nav2_sensor_fix/ota_update_server/pyrightconfig.json @@ -0,0 +1,6 @@ +{ + "include": ["ota_update_server", "tests"], + "venvPath": ".", + "venv": ".venv", + "reportUnusedFunction": "none" +} From 653902be34b8d443881c50d61a9d0d5c127b0991 Mon Sep 17 00:00:00 2001 From: Bartosz Burda Date: Sun, 26 Apr 2026 18:07:07 +0200 Subject: [PATCH 14/52] fix(ota_server): mark /artifacts route response_class=FileResponse --- .../ota_update_server/ota_update_server/main.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/demos/ota_nav2_sensor_fix/ota_update_server/ota_update_server/main.py b/demos/ota_nav2_sensor_fix/ota_update_server/ota_update_server/main.py index 23d1d44..159580c 100644 --- a/demos/ota_nav2_sensor_fix/ota_update_server/ota_update_server/main.py +++ b/demos/ota_nav2_sensor_fix/ota_update_server/ota_update_server/main.py @@ -20,7 +20,7 @@ def catalog() -> list[dict]: return [] return 
json.loads(catalog_file.read_text()) - @app.get("/artifacts/{filename}") + @app.get("/artifacts/{filename}", response_class=FileResponse) def artifact(filename: str) -> FileResponse: if "/" in filename or ".." in filename: raise HTTPException(status_code=400, detail="invalid filename") From 248ef325376e8385d91c8a9060f5e4b2be1773ac Mon Sep 17 00:00:00 2001 From: Bartosz Burda Date: Sun, 26 Apr 2026 18:12:20 +0200 Subject: [PATCH 15/52] feat(demos/ota): broken_lidar node with phantom /scan return --- .../ros2_packages/broken_lidar/CMakeLists.txt | 21 +++++++++ .../ros2_packages/broken_lidar/package.xml | 17 +++++++ .../broken_lidar/src/broken_lidar_node.cpp | 44 +++++++++++++++++++ 3 files changed, 82 insertions(+) create mode 100644 demos/ota_nav2_sensor_fix/ros2_packages/broken_lidar/CMakeLists.txt create mode 100644 demos/ota_nav2_sensor_fix/ros2_packages/broken_lidar/package.xml create mode 100644 demos/ota_nav2_sensor_fix/ros2_packages/broken_lidar/src/broken_lidar_node.cpp diff --git a/demos/ota_nav2_sensor_fix/ros2_packages/broken_lidar/CMakeLists.txt b/demos/ota_nav2_sensor_fix/ros2_packages/broken_lidar/CMakeLists.txt new file mode 100644 index 0000000..9ecd54e --- /dev/null +++ b/demos/ota_nav2_sensor_fix/ros2_packages/broken_lidar/CMakeLists.txt @@ -0,0 +1,21 @@ +cmake_minimum_required(VERSION 3.16) +project(broken_lidar) + +if(NOT CMAKE_CXX_STANDARD) + set(CMAKE_CXX_STANDARD 17) +endif() + +if(CMAKE_COMPILER_IS_GNUCXX OR CMAKE_CXX_COMPILER_ID MATCHES "Clang") + add_compile_options(-Wall -Wextra -Wpedantic) +endif() + +find_package(ament_cmake REQUIRED) +find_package(rclcpp REQUIRED) +find_package(sensor_msgs REQUIRED) + +add_executable(broken_lidar_node src/broken_lidar_node.cpp) +ament_target_dependencies(broken_lidar_node rclcpp sensor_msgs) + +install(TARGETS broken_lidar_node DESTINATION lib/${PROJECT_NAME}) + +ament_package() diff --git a/demos/ota_nav2_sensor_fix/ros2_packages/broken_lidar/package.xml 
b/demos/ota_nav2_sensor_fix/ros2_packages/broken_lidar/package.xml new file mode 100644 index 0000000..1b4ab49 --- /dev/null +++ b/demos/ota_nav2_sensor_fix/ros2_packages/broken_lidar/package.xml @@ -0,0 +1,17 @@ + + + broken_lidar + 1.0.0 + Broken lidar node that publishes /scan with a phantom obstacle (demo target of OTA update). + bburda + Apache-2.0 + + ament_cmake + + rclcpp + sensor_msgs + + + ament_cmake + + diff --git a/demos/ota_nav2_sensor_fix/ros2_packages/broken_lidar/src/broken_lidar_node.cpp b/demos/ota_nav2_sensor_fix/ros2_packages/broken_lidar/src/broken_lidar_node.cpp new file mode 100644 index 0000000..856f760 --- /dev/null +++ b/demos/ota_nav2_sensor_fix/ros2_packages/broken_lidar/src/broken_lidar_node.cpp @@ -0,0 +1,44 @@ +// Copyright 2026 bburda. Apache-2.0. +#include +#include +#include + +#include +#include + +using std::chrono_literals::operator""ms; + +class BrokenLidarNode : public rclcpp::Node { + public: + BrokenLidarNode() : Node("scan_sensor_node") { + pub_ = create_publisher("scan", 10); + timer_ = create_wall_timer(100ms, [this]() { publish_scan(); }); + } + + private: + void publish_scan() { + sensor_msgs::msg::LaserScan msg; + msg.header.stamp = now(); + msg.header.frame_id = "base_scan"; + msg.angle_min = -static_cast(M_PI); + msg.angle_max = static_cast(M_PI); + msg.angle_increment = static_cast(M_PI / 180.0); + msg.range_min = 0.05f; + msg.range_max = 10.0f; + constexpr int kRays = 360; + msg.ranges.assign(kRays, msg.range_max); + // Inject a 1 m phantom return at angle 0 (straight ahead, ray index 180) + msg.ranges[180] = 1.0f; + pub_->publish(msg); + } + + rclcpp::Publisher::SharedPtr pub_; + rclcpp::TimerBase::SharedPtr timer_; +}; + +int main(int argc, char ** argv) { + rclcpp::init(argc, argv); + rclcpp::spin(std::make_shared()); + rclcpp::shutdown(); + return 0; +} From c8e10eb1348240304ae46b30b9b5e8f12a989865 Mon Sep 17 00:00:00 2001 From: Bartosz Burda Date: Sun, 26 Apr 2026 18:13:10 +0200 Subject: [PATCH 16/52] 
feat(demos/ota): fixed_lidar (clean /scan, no phantom) --- .../ros2_packages/fixed_lidar/CMakeLists.txt | 21 ++++++++++ .../ros2_packages/fixed_lidar/package.xml | 17 ++++++++ .../fixed_lidar/src/fixed_lidar_node.cpp | 42 +++++++++++++++++++ 3 files changed, 80 insertions(+) create mode 100644 demos/ota_nav2_sensor_fix/ros2_packages/fixed_lidar/CMakeLists.txt create mode 100644 demos/ota_nav2_sensor_fix/ros2_packages/fixed_lidar/package.xml create mode 100644 demos/ota_nav2_sensor_fix/ros2_packages/fixed_lidar/src/fixed_lidar_node.cpp diff --git a/demos/ota_nav2_sensor_fix/ros2_packages/fixed_lidar/CMakeLists.txt b/demos/ota_nav2_sensor_fix/ros2_packages/fixed_lidar/CMakeLists.txt new file mode 100644 index 0000000..d08580f --- /dev/null +++ b/demos/ota_nav2_sensor_fix/ros2_packages/fixed_lidar/CMakeLists.txt @@ -0,0 +1,21 @@ +cmake_minimum_required(VERSION 3.16) +project(fixed_lidar) + +if(NOT CMAKE_CXX_STANDARD) + set(CMAKE_CXX_STANDARD 17) +endif() + +if(CMAKE_COMPILER_IS_GNUCXX OR CMAKE_CXX_COMPILER_ID MATCHES "Clang") + add_compile_options(-Wall -Wextra -Wpedantic) +endif() + +find_package(ament_cmake REQUIRED) +find_package(rclcpp REQUIRED) +find_package(sensor_msgs REQUIRED) + +add_executable(fixed_lidar_node src/fixed_lidar_node.cpp) +ament_target_dependencies(fixed_lidar_node rclcpp sensor_msgs) + +install(TARGETS fixed_lidar_node DESTINATION lib/${PROJECT_NAME}) + +ament_package() diff --git a/demos/ota_nav2_sensor_fix/ros2_packages/fixed_lidar/package.xml b/demos/ota_nav2_sensor_fix/ros2_packages/fixed_lidar/package.xml new file mode 100644 index 0000000..d0315d7 --- /dev/null +++ b/demos/ota_nav2_sensor_fix/ros2_packages/fixed_lidar/package.xml @@ -0,0 +1,17 @@ + + + fixed_lidar + 2.1.0 + Fixed lidar node that publishes clean /scan. 
+ bburda + Apache-2.0 + + ament_cmake + + rclcpp + sensor_msgs + + + ament_cmake + + diff --git a/demos/ota_nav2_sensor_fix/ros2_packages/fixed_lidar/src/fixed_lidar_node.cpp b/demos/ota_nav2_sensor_fix/ros2_packages/fixed_lidar/src/fixed_lidar_node.cpp new file mode 100644 index 0000000..587b97c --- /dev/null +++ b/demos/ota_nav2_sensor_fix/ros2_packages/fixed_lidar/src/fixed_lidar_node.cpp @@ -0,0 +1,42 @@ +// Copyright 2026 bburda. Apache-2.0. +#include +#include +#include + +#include +#include + +using std::chrono_literals::operator""ms; + +class FixedLidarNode : public rclcpp::Node { + public: + FixedLidarNode() : Node("scan_sensor_node") { + pub_ = create_publisher("scan", 10); + timer_ = create_wall_timer(100ms, [this]() { publish_scan(); }); + } + + private: + void publish_scan() { + sensor_msgs::msg::LaserScan msg; + msg.header.stamp = now(); + msg.header.frame_id = "base_scan"; + msg.angle_min = -static_cast(M_PI); + msg.angle_max = static_cast(M_PI); + msg.angle_increment = static_cast(M_PI / 180.0); + msg.range_min = 0.05f; + msg.range_max = 10.0f; + constexpr int kRays = 360; + msg.ranges.assign(kRays, msg.range_max); + pub_->publish(msg); + } + + rclcpp::Publisher::SharedPtr pub_; + rclcpp::TimerBase::SharedPtr timer_; +}; + +int main(int argc, char ** argv) { + rclcpp::init(argc, argv); + rclcpp::spin(std::make_shared()); + rclcpp::shutdown(); + return 0; +} From a14243b30a8ba6d0f3e5f109b5db72f6c2c231a8 Mon Sep 17 00:00:00 2001 From: Bartosz Burda Date: Sun, 26 Apr 2026 18:13:54 +0200 Subject: [PATCH 17/52] feat(demos/ota): broken_lidar_legacy do-nothing node (uninstall target) --- .../broken_lidar_legacy/CMakeLists.txt | 20 ++++++++++++++++ .../broken_lidar_legacy/package.xml | 16 +++++++++++++ .../broken_lidar_legacy/src/legacy_node.cpp | 24 +++++++++++++++++++ 3 files changed, 60 insertions(+) create mode 100644 demos/ota_nav2_sensor_fix/ros2_packages/broken_lidar_legacy/CMakeLists.txt create mode 100644 
demos/ota_nav2_sensor_fix/ros2_packages/broken_lidar_legacy/package.xml create mode 100644 demos/ota_nav2_sensor_fix/ros2_packages/broken_lidar_legacy/src/legacy_node.cpp diff --git a/demos/ota_nav2_sensor_fix/ros2_packages/broken_lidar_legacy/CMakeLists.txt b/demos/ota_nav2_sensor_fix/ros2_packages/broken_lidar_legacy/CMakeLists.txt new file mode 100644 index 0000000..eda65f7 --- /dev/null +++ b/demos/ota_nav2_sensor_fix/ros2_packages/broken_lidar_legacy/CMakeLists.txt @@ -0,0 +1,20 @@ +cmake_minimum_required(VERSION 3.16) +project(broken_lidar_legacy) + +if(NOT CMAKE_CXX_STANDARD) + set(CMAKE_CXX_STANDARD 17) +endif() + +if(CMAKE_COMPILER_IS_GNUCXX OR CMAKE_CXX_COMPILER_ID MATCHES "Clang") + add_compile_options(-Wall -Wextra -Wpedantic) +endif() + +find_package(ament_cmake REQUIRED) +find_package(rclcpp REQUIRED) + +add_executable(broken_lidar_legacy src/legacy_node.cpp) +ament_target_dependencies(broken_lidar_legacy rclcpp) + +install(TARGETS broken_lidar_legacy DESTINATION lib/${PROJECT_NAME}) + +ament_package() diff --git a/demos/ota_nav2_sensor_fix/ros2_packages/broken_lidar_legacy/package.xml b/demos/ota_nav2_sensor_fix/ros2_packages/broken_lidar_legacy/package.xml new file mode 100644 index 0000000..2cd0f61 --- /dev/null +++ b/demos/ota_nav2_sensor_fix/ros2_packages/broken_lidar_legacy/package.xml @@ -0,0 +1,16 @@ + + + broken_lidar_legacy + 1.0.0 + Do-nothing legacy package, target of uninstall demo scene. + bburda + Apache-2.0 + + ament_cmake + + rclcpp + + + ament_cmake + + diff --git a/demos/ota_nav2_sensor_fix/ros2_packages/broken_lidar_legacy/src/legacy_node.cpp b/demos/ota_nav2_sensor_fix/ros2_packages/broken_lidar_legacy/src/legacy_node.cpp new file mode 100644 index 0000000..144b7b4 --- /dev/null +++ b/demos/ota_nav2_sensor_fix/ros2_packages/broken_lidar_legacy/src/legacy_node.cpp @@ -0,0 +1,24 @@ +// Copyright 2026 bburda. Apache-2.0. 
+#include +#include + +#include + +using std::chrono_literals::operator""s; + +class LegacyNode : public rclcpp::Node { + public: + LegacyNode() : Node("broken_lidar_legacy") { + timer_ = create_wall_timer(5s, []() {}); + } + + private: + rclcpp::TimerBase::SharedPtr timer_; +}; + +int main(int argc, char ** argv) { + rclcpp::init(argc, argv); + rclcpp::spin(std::make_shared()); + rclcpp::shutdown(); + return 0; +} From 01e5fc84e89c4e5d29ab5c54da6fb847e4664139 Mon Sep 17 00:00:00 2001 From: Bartosz Burda Date: Sun, 26 Apr 2026 18:14:45 +0200 Subject: [PATCH 18/52] feat(demos/ota): obstacle_classifier_v2 (install target, /scan -> /safety_overlay) --- .../obstacle_classifier_v2/CMakeLists.txt | 22 ++++++++ .../obstacle_classifier_v2/package.xml | 18 +++++++ .../src/obstacle_classifier_node.cpp | 50 +++++++++++++++++++ 3 files changed, 90 insertions(+) create mode 100644 demos/ota_nav2_sensor_fix/ros2_packages/obstacle_classifier_v2/CMakeLists.txt create mode 100644 demos/ota_nav2_sensor_fix/ros2_packages/obstacle_classifier_v2/package.xml create mode 100644 demos/ota_nav2_sensor_fix/ros2_packages/obstacle_classifier_v2/src/obstacle_classifier_node.cpp diff --git a/demos/ota_nav2_sensor_fix/ros2_packages/obstacle_classifier_v2/CMakeLists.txt b/demos/ota_nav2_sensor_fix/ros2_packages/obstacle_classifier_v2/CMakeLists.txt new file mode 100644 index 0000000..cda1852 --- /dev/null +++ b/demos/ota_nav2_sensor_fix/ros2_packages/obstacle_classifier_v2/CMakeLists.txt @@ -0,0 +1,22 @@ +cmake_minimum_required(VERSION 3.16) +project(obstacle_classifier_v2) + +if(NOT CMAKE_CXX_STANDARD) + set(CMAKE_CXX_STANDARD 17) +endif() + +if(CMAKE_COMPILER_IS_GNUCXX OR CMAKE_CXX_COMPILER_ID MATCHES "Clang") + add_compile_options(-Wall -Wextra -Wpedantic) +endif() + +find_package(ament_cmake REQUIRED) +find_package(rclcpp REQUIRED) +find_package(sensor_msgs REQUIRED) +find_package(visualization_msgs REQUIRED) + +add_executable(obstacle_classifier_node src/obstacle_classifier_node.cpp) 
+ament_target_dependencies(obstacle_classifier_node rclcpp sensor_msgs visualization_msgs) + +install(TARGETS obstacle_classifier_node DESTINATION lib/${PROJECT_NAME}) + +ament_package() diff --git a/demos/ota_nav2_sensor_fix/ros2_packages/obstacle_classifier_v2/package.xml b/demos/ota_nav2_sensor_fix/ros2_packages/obstacle_classifier_v2/package.xml new file mode 100644 index 0000000..6d214bd --- /dev/null +++ b/demos/ota_nav2_sensor_fix/ros2_packages/obstacle_classifier_v2/package.xml @@ -0,0 +1,18 @@ + + + obstacle_classifier_v2 + 1.0.0 + Extra safety layer for nav2 (target of install demo scene). + bburda + Apache-2.0 + + ament_cmake + + rclcpp + sensor_msgs + visualization_msgs + + + ament_cmake + + diff --git a/demos/ota_nav2_sensor_fix/ros2_packages/obstacle_classifier_v2/src/obstacle_classifier_node.cpp b/demos/ota_nav2_sensor_fix/ros2_packages/obstacle_classifier_v2/src/obstacle_classifier_node.cpp new file mode 100644 index 0000000..914c689 --- /dev/null +++ b/demos/ota_nav2_sensor_fix/ros2_packages/obstacle_classifier_v2/src/obstacle_classifier_node.cpp @@ -0,0 +1,50 @@ +// Copyright 2026 bburda. Apache-2.0. 
+#include + +#include +#include +#include + +class ObstacleClassifierNode : public rclcpp::Node { + public: + ObstacleClassifierNode() : Node("obstacle_classifier") { + pub_ = create_publisher("safety_overlay", 10); + sub_ = create_subscription( + "scan", 10, + [this](sensor_msgs::msg::LaserScan::SharedPtr msg) { on_scan(*msg); }); + } + + private: + void on_scan(const sensor_msgs::msg::LaserScan & scan) { + visualization_msgs::msg::MarkerArray markers; + visualization_msgs::msg::Marker m; + m.header = scan.header; + m.ns = "safety_overlay"; + m.id = 0; + m.type = visualization_msgs::msg::Marker::CYLINDER; + m.action = visualization_msgs::msg::Marker::ADD; + m.scale.x = 0.4; + m.scale.y = 0.4; + m.scale.z = 0.4; + m.color.r = 0.0f; + m.color.g = 1.0f; + m.color.b = 0.4f; + m.color.a = 0.6f; + m.pose.position.x = 0.0; + m.pose.position.y = 0.0; + m.pose.position.z = 0.2; + m.pose.orientation.w = 1.0; + markers.markers.push_back(m); + pub_->publish(markers); + } + + rclcpp::Publisher::SharedPtr pub_; + rclcpp::Subscription::SharedPtr sub_; +}; + +int main(int argc, char ** argv) { + rclcpp::init(argc, argv); + rclcpp::spin(std::make_shared()); + rclcpp::shutdown(); + return 0; +} From 9defc1a1b84f2ad54ba0e88708ec49079acc00d4 Mon Sep 17 00:00:00 2001 From: Bartosz Burda Date: Sun, 26 Apr 2026 18:15:59 +0200 Subject: [PATCH 19/52] feat(demos/ota): build_artifacts.sh + gitignore generated tarballs --- .../ota_nav2_sensor_fix/artifacts/.gitignore | 2 + demos/ota_nav2_sensor_fix/ros2_ws/.gitignore | 4 ++ .../scripts/build_artifacts.sh | 51 +++++++++++++++++++ 3 files changed, 57 insertions(+) create mode 100644 demos/ota_nav2_sensor_fix/artifacts/.gitignore create mode 100644 demos/ota_nav2_sensor_fix/ros2_ws/.gitignore create mode 100755 demos/ota_nav2_sensor_fix/scripts/build_artifacts.sh diff --git a/demos/ota_nav2_sensor_fix/artifacts/.gitignore b/demos/ota_nav2_sensor_fix/artifacts/.gitignore new file mode 100644 index 0000000..ae78201 --- /dev/null +++ 
#!/usr/bin/env bash
# Build the demo ROS 2 packages with colcon, then pack them into OTA artifacts
# and regenerate artifacts/catalog.json from scratch.
set -eo pipefail
SCRIPT_DIR="$(cd "$(dirname "$0")" && pwd)"
DEMO_DIR="$(dirname "$SCRIPT_DIR")"
WS="$DEMO_DIR/ros2_ws"
ARTIFACTS="$DEMO_DIR/artifacts"

# Kept as an array so paths containing spaces survive word splitting.
PACK=("$SCRIPT_DIR/.venv/bin/python" "$SCRIPT_DIR/pack_artifact.py")
if [[ ! -x "${PACK[0]}" ]]; then
  # Fail before the colcon build rather than after it.
  echo "error: ${PACK[0]} not found or not executable; create the venv first" >&2
  exit 1
fi

# nounset is enabled only after sourcing the ROS setup script
# (presumably because setup.bash reads variables that may be unset).
# shellcheck disable=SC1091
source /opt/ros/jazzy/setup.bash
set -u

# Run pack_artifact.py in a scrubbed environment (only PATH and HOME are
# passed), appending the options shared by every catalog entry.
pack() {
  env -i PATH=/usr/bin:/bin HOME="$HOME" "${PACK[@]}" "$@" \
    --skip-build --workspace "$WS" \
    --out-dir "$ARTIFACTS" --catalog "$ARTIFACTS/catalog.json"
}

mkdir -p "$WS/src"
for pkg in broken_lidar fixed_lidar broken_lidar_legacy obstacle_classifier_v2; do
  ln -sfn "$DEMO_DIR/ros2_packages/$pkg" "$WS/src/$pkg"
done

# Only the packages that ship an artifact are built here.
(cd "$WS" && colcon build --packages-select fixed_lidar obstacle_classifier_v2)

mkdir -p "$ARTIFACTS"
# Each pack call below is given the same catalog file; start from a clean one.
rm -f "$ARTIFACTS/catalog.json"

pack \
  --package fixed_lidar --version 2.1.0 \
  --kind update --target-component scan_sensor_node \
  --executable fixed_lidar_node \
  --notes "Fix /scan noise filter"

pack \
  --package obstacle_classifier_v2 --version 1.0.0 \
  --kind install --target-component obstacle_classifier \
  --executable obstacle_classifier_node \
  --notes "Extra safety layer for nav2"

pack \
  --package broken_lidar_legacy --version "" \
  --kind uninstall --target-component broken_lidar_legacy \
  --notes "Clean up deprecated package"

if command -v jq >/dev/null 2>&1; then
  echo "Built catalog with $(jq length "$ARTIFACTS/catalog.json") entries"
else
  echo "Built catalog: $(wc -l < "$ARTIFACTS/catalog.json") lines"
fi
-env -i PATH=/usr/bin:/bin HOME="$HOME" $PACK \ +env -i PATH=/usr/bin:/bin HOME="$HOME" "${PACK[@]}" \ --package broken_lidar_legacy --version "" \ --kind uninstall --target-component broken_lidar_legacy \ --notes "Clean up deprecated package" \ From 1a9f20a147bf12a740a75a9470861c9b60930b0d Mon Sep 17 00:00:00 2001 From: Bartosz Burda Date: Sun, 26 Apr 2026 18:32:43 +0200 Subject: [PATCH 21/52] feat(demos/ota): ota_update_plugin C++ gateway plugin Implements GatewayPlugin + UpdateProvider for the OTA demo. Polls a FastAPI catalog at boot and supports update / install / uninstall operations derived from SOVD ISO 17978-3 metadata. Process model: SIGTERM old executable, swap files on disk, fork+exec new executable. No lifecycle commands. Operation kind is classified from updated_components / added_components / removed_components. Components: - OtaUpdatePlugin: list/get/register/delete/prepare/execute/supports_automated - CatalogClient: cpp-httplib GET /catalog and artifact download, with parse_url - OperationDispatcher: SOVD metadata -> Update/Install/Uninstall/Unknown - ProcessRunner: pgrep via /proc, kill_by_executable with SIGTERM->SIGKILL fallback, fork+exec spawn 21 gtests pass (7 dispatcher, 6 parse_url, 8 plugin smoke). 
--- .../ota_update_plugin/CMakeLists.txt | 86 +++++ .../ota_update_plugin/ota_update_plugin.hpp | 83 +++++ .../ota_update_plugin/package.xml | 20 ++ .../ota_update_plugin/src/catalog_client.cpp | 171 ++++++++++ .../ota_update_plugin/src/catalog_client.hpp | 61 ++++ .../src/operation_dispatcher.cpp | 48 +++ .../src/operation_dispatcher.hpp | 33 ++ .../src/ota_update_plugin.cpp | 294 ++++++++++++++++++ .../ota_update_plugin/src/plugin_exports.cpp | 25 ++ .../ota_update_plugin/src/process_runner.cpp | 130 ++++++++ .../ota_update_plugin/src/process_runner.hpp | 48 +++ .../test/test_catalog_client.cpp | 59 ++++ .../test/test_operation_dispatcher.cpp | 60 ++++ .../test/test_plugin_smoke.cpp | 141 +++++++++ 14 files changed, 1259 insertions(+) create mode 100644 demos/ota_nav2_sensor_fix/ota_update_plugin/CMakeLists.txt create mode 100644 demos/ota_nav2_sensor_fix/ota_update_plugin/include/ota_update_plugin/ota_update_plugin.hpp create mode 100644 demos/ota_nav2_sensor_fix/ota_update_plugin/package.xml create mode 100644 demos/ota_nav2_sensor_fix/ota_update_plugin/src/catalog_client.cpp create mode 100644 demos/ota_nav2_sensor_fix/ota_update_plugin/src/catalog_client.hpp create mode 100644 demos/ota_nav2_sensor_fix/ota_update_plugin/src/operation_dispatcher.cpp create mode 100644 demos/ota_nav2_sensor_fix/ota_update_plugin/src/operation_dispatcher.hpp create mode 100644 demos/ota_nav2_sensor_fix/ota_update_plugin/src/ota_update_plugin.cpp create mode 100644 demos/ota_nav2_sensor_fix/ota_update_plugin/src/plugin_exports.cpp create mode 100644 demos/ota_nav2_sensor_fix/ota_update_plugin/src/process_runner.cpp create mode 100644 demos/ota_nav2_sensor_fix/ota_update_plugin/src/process_runner.hpp create mode 100644 demos/ota_nav2_sensor_fix/ota_update_plugin/test/test_catalog_client.cpp create mode 100644 demos/ota_nav2_sensor_fix/ota_update_plugin/test/test_operation_dispatcher.cpp create mode 100644 demos/ota_nav2_sensor_fix/ota_update_plugin/test/test_plugin_smoke.cpp diff 
--git a/demos/ota_nav2_sensor_fix/ota_update_plugin/CMakeLists.txt b/demos/ota_nav2_sensor_fix/ota_update_plugin/CMakeLists.txt new file mode 100644 index 0000000..9798bd7 --- /dev/null +++ b/demos/ota_nav2_sensor_fix/ota_update_plugin/CMakeLists.txt @@ -0,0 +1,86 @@ +# Copyright 2026 bburda +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +cmake_minimum_required(VERSION 3.16) +project(ota_update_plugin CXX) + +if(CMAKE_COMPILER_IS_GNUCXX OR CMAKE_CXX_COMPILER_ID MATCHES "Clang") + add_compile_options(-Wall -Wextra -Wpedantic) +endif() + +set(CMAKE_CXX_STANDARD 17) +set(CMAKE_CXX_STANDARD_REQUIRED ON) +set(CMAKE_POSITION_INDEPENDENT_CODE ON) + +find_package(ament_cmake REQUIRED) +find_package(ros2_medkit_cmake REQUIRED) +include(ROS2MedkitCompat) +find_package(ros2_medkit_gateway REQUIRED) +find_package(nlohmann_json REQUIRED) + +# CatalogClient uses cpp-httplib for HTTP. Use gateway's vendored copy as fallback. +set(_gw_vendored "${ros2_medkit_gateway_DIR}/../vendored/cpp_httplib") +medkit_find_cpp_httplib(VENDORED_DIR "${_gw_vendored}") +unset(_gw_vendored) + +# Static core library: plugin + tests both link against this. 
+add_library(ota_update_plugin_core STATIC + src/ota_update_plugin.cpp + src/catalog_client.cpp + src/operation_dispatcher.cpp + src/process_runner.cpp +) +target_include_directories(ota_update_plugin_core + PUBLIC + $ + $ +) +ament_target_dependencies(ota_update_plugin_core ros2_medkit_gateway) +target_link_libraries(ota_update_plugin_core + PUBLIC + nlohmann_json::nlohmann_json + cpp_httplib_target +) +set_target_properties(ota_update_plugin_core PROPERTIES POSITION_INDEPENDENT_CODE ON) + +# MODULE target: loaded via dlopen at runtime by PluginManager. +# Symbols from gateway_lib are resolved from the host process at runtime. +add_library(ota_update_plugin MODULE src/plugin_exports.cpp) +target_link_libraries(ota_update_plugin PRIVATE ota_update_plugin_core) +set_target_properties(ota_update_plugin PROPERTIES + PREFIX "" + OUTPUT_NAME "ota_update_plugin" +) +# Allow unresolved symbols - they resolve from the host process at runtime +target_link_options(ota_update_plugin PRIVATE + -Wl,--unresolved-symbols=ignore-all +) + +install(TARGETS ota_update_plugin + LIBRARY DESTINATION lib/${PROJECT_NAME} +) +install(DIRECTORY include/ DESTINATION include) + +if(BUILD_TESTING) + find_package(ament_cmake_gtest REQUIRED) + ament_add_gtest(test_ota_update_plugin + test/test_operation_dispatcher.cpp + test/test_catalog_client.cpp + test/test_plugin_smoke.cpp + ) + target_link_libraries(test_ota_update_plugin ota_update_plugin_core) + target_include_directories(test_ota_update_plugin PRIVATE src) +endif() + +ament_package() diff --git a/demos/ota_nav2_sensor_fix/ota_update_plugin/include/ota_update_plugin/ota_update_plugin.hpp b/demos/ota_nav2_sensor_fix/ota_update_plugin/include/ota_update_plugin/ota_update_plugin.hpp new file mode 100644 index 0000000..adb1b49 --- /dev/null +++ b/demos/ota_nav2_sensor_fix/ota_update_plugin/include/ota_update_plugin/ota_update_plugin.hpp @@ -0,0 +1,83 @@ +// Copyright 2026 bburda +// +// Licensed under the Apache License, Version 2.0 (the 
"License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#pragma once + +#include +#include +#include +#include +#include + +#include +#include +#include + +namespace ota_update_plugin { + +class CatalogClient; +class ProcessRunner; + +/// OTA update plugin: implements both GatewayPlugin and UpdateProvider. +/// Polls a FastAPI catalog at boot and supports update / install / uninstall +/// operations derived from SOVD ISO 17978-3 metadata. +class OtaUpdatePlugin : public ros2_medkit_gateway::GatewayPlugin, public ros2_medkit_gateway::UpdateProvider { + public: + OtaUpdatePlugin(); + ~OtaUpdatePlugin() override; + + OtaUpdatePlugin(const OtaUpdatePlugin &) = delete; + OtaUpdatePlugin & operator=(const OtaUpdatePlugin &) = delete; + OtaUpdatePlugin(OtaUpdatePlugin &&) = delete; + OtaUpdatePlugin & operator=(OtaUpdatePlugin &&) = delete; + + // GatewayPlugin + std::string name() const override { + return "ota_update_plugin"; + } + void configure(const nlohmann::json & config) override; + void set_context(ros2_medkit_gateway::PluginContext & context) override; + + // UpdateProvider + tl::expected, ros2_medkit_gateway::UpdateBackendErrorInfo> list_updates( + const ros2_medkit_gateway::UpdateFilter & filter) override; + tl::expected get_update(const std::string & id) override; + tl::expected register_update( + const nlohmann::json & metadata) override; + tl::expected delete_update(const std::string & id) override; + tl::expected prepare( + const std::string & id, ros2_medkit_gateway::UpdateProgressReporter & reporter) override; 
+ tl::expected execute( + const std::string & id, ros2_medkit_gateway::UpdateProgressReporter & reporter) override; + tl::expected supports_automated(const std::string & id) override; + + // Test seams + void set_catalog_client_for_test(std::unique_ptr client); + void set_process_runner_for_test(std::unique_ptr runner); + void poll_and_register_catalog(); + + private: + std::string catalog_url_; + std::string staging_dir_; + std::string install_dir_; + + std::mutex mu_; + std::map registry_; + std::map staged_artifacts_; + + std::unique_ptr catalog_client_; + std::unique_ptr process_runner_; +}; + +} // namespace ota_update_plugin diff --git a/demos/ota_nav2_sensor_fix/ota_update_plugin/package.xml b/demos/ota_nav2_sensor_fix/ota_update_plugin/package.xml new file mode 100644 index 0000000..200c826 --- /dev/null +++ b/demos/ota_nav2_sensor_fix/ota_update_plugin/package.xml @@ -0,0 +1,20 @@ + + + ota_update_plugin + 0.1.0 + Dev-grade OTA plugin for ros2_medkit gateway: update / install / uninstall via simple HTTP catalog. + bburda + Apache-2.0 + + ament_cmake + + ros2_medkit_cmake + ros2_medkit_gateway + nlohmann-json-dev + + ament_cmake_gtest + + + ament_cmake + + diff --git a/demos/ota_nav2_sensor_fix/ota_update_plugin/src/catalog_client.cpp b/demos/ota_nav2_sensor_fix/ota_update_plugin/src/catalog_client.cpp new file mode 100644 index 0000000..1849025 --- /dev/null +++ b/demos/ota_nav2_sensor_fix/ota_update_plugin/src/catalog_client.cpp @@ -0,0 +1,171 @@ +// Copyright 2026 bburda +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+// See the License for the specific language governing permissions and +// limitations under the License. + +#include "catalog_client.hpp" + +#include +#include +#include + +#include + +namespace ota_update_plugin { + +namespace { + +bool starts_with(const std::string & s, const std::string & prefix) { + return s.size() >= prefix.size() && s.compare(0, prefix.size(), prefix) == 0; +} + +} // namespace + +ParsedUrl parse_url(const std::string & url) { + ParsedUrl out{}; + std::string rest; + if (starts_with(url, "https://")) { + out.tls = true; + out.port = 443; + rest = url.substr(8); + } else if (starts_with(url, "http://")) { + out.tls = false; + out.port = 80; + rest = url.substr(7); + } else { + throw std::invalid_argument("unsupported URL scheme: " + url); + } + + // Split host[:port] from path. + const auto slash = rest.find('/'); + std::string authority; + if (slash == std::string::npos) { + authority = rest; + out.path = "/"; + } else { + authority = rest.substr(0, slash); + out.path = rest.substr(slash); + } + + // Split host from port if present. 
+ const auto colon = authority.find(':'); + if (colon == std::string::npos) { + out.host = authority; + } else { + out.host = authority.substr(0, colon); + try { + out.port = std::stoi(authority.substr(colon + 1)); + } catch (const std::exception & e) { + throw std::invalid_argument(std::string("invalid port in URL: ") + url + " (" + e.what() + ")"); + } + } + + if (out.host.empty()) { + throw std::invalid_argument("missing host in URL: " + url); + } + return out; +} + +CatalogClient::CatalogClient(std::string base_url) : base_url_(std::move(base_url)) { +} + +tl::expected CatalogClient::fetch_catalog() { + ParsedUrl parsed; + try { + parsed = parse_url(base_url_); + } catch (const std::exception & e) { + return tl::make_unexpected(std::string("invalid catalog url: ") + e.what()); + } + + if (parsed.tls) { + return tl::make_unexpected("https not supported by demo CatalogClient"); + } + + // Strip trailing slash from base path, then append /catalog. + std::string base_path = parsed.path; + if (!base_path.empty() && base_path.back() == '/') { + base_path.pop_back(); + } + const std::string target = base_path + "/catalog"; + + httplib::Client cli(parsed.host, parsed.port); + cli.set_connection_timeout(5, 0); + cli.set_read_timeout(5, 0); + + auto res = cli.Get(target.c_str()); + if (!res) { + return tl::make_unexpected("catalog GET failed: " + httplib::to_string(res.error())); + } + if (res->status < 200 || res->status >= 300) { + return tl::make_unexpected("catalog GET returned status " + std::to_string(res->status)); + } + try { + return nlohmann::json::parse(res->body); + } catch (const std::exception & e) { + return tl::make_unexpected(std::string("catalog json parse failed: ") + e.what()); + } +} + +tl::expected CatalogClient::download_artifact(const std::string & url_or_path, + const std::string & out_path) { + // If url_or_path is an absolute URL, parse it directly. Otherwise treat as a + // path relative to base_url_. 
+ std::string full_url; + if (starts_with(url_or_path, "http://") || starts_with(url_or_path, "https://")) { + full_url = url_or_path; + } else { + std::string base = base_url_; + // Strip trailing slash on base, leading slash on relative path. + while (!base.empty() && base.back() == '/') { + base.pop_back(); + } + std::string rel = url_or_path; + if (rel.empty() || rel.front() != '/') { + rel = "/" + rel; + } + full_url = base + rel; + } + + ParsedUrl parsed; + try { + parsed = parse_url(full_url); + } catch (const std::exception & e) { + return tl::make_unexpected(std::string("invalid artifact url: ") + e.what()); + } + if (parsed.tls) { + return tl::make_unexpected("https not supported by demo CatalogClient"); + } + + httplib::Client cli(parsed.host, parsed.port); + cli.set_connection_timeout(5, 0); + cli.set_read_timeout(30, 0); + + auto res = cli.Get(parsed.path.c_str()); + if (!res) { + return tl::make_unexpected("artifact GET failed: " + httplib::to_string(res.error())); + } + if (res->status < 200 || res->status >= 300) { + return tl::make_unexpected("artifact GET returned status " + std::to_string(res->status)); + } + + std::ofstream o(out_path, std::ios::binary); + if (!o) { + return tl::make_unexpected("cannot open output file: " + out_path); + } + o.write(res->body.data(), static_cast(res->body.size())); + if (!o) { + return tl::make_unexpected("write to output file failed: " + out_path); + } + return out_path; +} + +} // namespace ota_update_plugin diff --git a/demos/ota_nav2_sensor_fix/ota_update_plugin/src/catalog_client.hpp b/demos/ota_nav2_sensor_fix/ota_update_plugin/src/catalog_client.hpp new file mode 100644 index 0000000..0b8a9b2 --- /dev/null +++ b/demos/ota_nav2_sensor_fix/ota_update_plugin/src/catalog_client.hpp @@ -0,0 +1,61 @@ +// Copyright 2026 bburda +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. 
+// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#pragma once + +#include + +#include +#include + +namespace ota_update_plugin { + +/// Decomposed URL components used by the HTTP client. +struct ParsedUrl { + std::string host; + int port; + bool tls; + std::string path; +}; + +/// Parse an http:// or https:// URL into components. +/// Throws std::invalid_argument for unsupported schemes. +ParsedUrl parse_url(const std::string & url); + +/// HTTP client that fetches the FastAPI catalog and downloads artifacts. +/// Virtual methods so tests can substitute a fake without touching real HTTP. +class CatalogClient { + public: + explicit CatalogClient(std::string base_url); + virtual ~CatalogClient() = default; + + CatalogClient(const CatalogClient &) = delete; + CatalogClient & operator=(const CatalogClient &) = delete; + CatalogClient(CatalogClient &&) = delete; + CatalogClient & operator=(CatalogClient &&) = delete; + + /// GET {base_url}/catalog and parse JSON. Returns the JSON array on success. + virtual tl::expected fetch_catalog(); + + /// Download an artifact. `url_or_path` may be either an absolute URL or a + /// path (interpreted relative to `base_url`). Body is written to `out_path`. + /// Returns the absolute output path on success. 
+ virtual tl::expected download_artifact(const std::string & url_or_path, + const std::string & out_path); + + protected: + std::string base_url_; +}; + +} // namespace ota_update_plugin diff --git a/demos/ota_nav2_sensor_fix/ota_update_plugin/src/operation_dispatcher.cpp b/demos/ota_nav2_sensor_fix/ota_update_plugin/src/operation_dispatcher.cpp new file mode 100644 index 0000000..b28b2af --- /dev/null +++ b/demos/ota_nav2_sensor_fix/ota_update_plugin/src/operation_dispatcher.cpp @@ -0,0 +1,48 @@ +// Copyright 2026 bburda +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
+ +#include "operation_dispatcher.hpp" + +namespace ota_update_plugin { + +namespace { + +bool non_empty_array(const nlohmann::json & j, const char * key) { + if (!j.contains(key)) { + return false; + } + const auto & v = j.at(key); + return v.is_array() && !v.empty(); +} + +} // namespace + +OperationKind OperationDispatcher::classify(const nlohmann::json & metadata) { + const bool has_updated = non_empty_array(metadata, "updated_components"); + const bool has_added = non_empty_array(metadata, "added_components"); + const bool has_removed = non_empty_array(metadata, "removed_components"); + const int populated = static_cast(has_updated) + static_cast(has_added) + static_cast(has_removed); + if (populated != 1) { + return OperationKind::Unknown; + } + if (has_updated) { + return OperationKind::Update; + } + if (has_added) { + return OperationKind::Install; + } + return OperationKind::Uninstall; +} + +} // namespace ota_update_plugin diff --git a/demos/ota_nav2_sensor_fix/ota_update_plugin/src/operation_dispatcher.hpp b/demos/ota_nav2_sensor_fix/ota_update_plugin/src/operation_dispatcher.hpp new file mode 100644 index 0000000..3398c2e --- /dev/null +++ b/demos/ota_nav2_sensor_fix/ota_update_plugin/src/operation_dispatcher.hpp @@ -0,0 +1,33 @@ +// Copyright 2026 bburda +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#pragma once + +#include + +namespace ota_update_plugin { + +/// Operation kind classified from SOVD update metadata. 
+enum class OperationKind { Update, Install, Uninstall, Unknown }; + +/// Maps SOVD update metadata to a concrete operation kind based on which of +/// updated_components / added_components / removed_components is populated. +class OperationDispatcher { + public: + /// Classify an update's operation kind. Returns Unknown if zero or more + /// than one of the three component arrays is non-empty. + static OperationKind classify(const nlohmann::json & metadata); +}; + +} // namespace ota_update_plugin diff --git a/demos/ota_nav2_sensor_fix/ota_update_plugin/src/ota_update_plugin.cpp b/demos/ota_nav2_sensor_fix/ota_update_plugin/src/ota_update_plugin.cpp new file mode 100644 index 0000000..6b170c2 --- /dev/null +++ b/demos/ota_nav2_sensor_fix/ota_update_plugin/src/ota_update_plugin.cpp @@ -0,0 +1,294 @@ +// Copyright 2026 bburda +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include "ota_update_plugin/ota_update_plugin.hpp" + +#include +#include +#include +#include + +#include "catalog_client.hpp" +#include "operation_dispatcher.hpp" +#include "process_runner.hpp" + +namespace ota_update_plugin { + +namespace fs = std::filesystem; +using ros2_medkit_gateway::UpdateBackendError; +using ros2_medkit_gateway::UpdateBackendErrorInfo; + +namespace { + +/// Extract a packed tarball into a staging directory, then atomically replace +/// `${install_dir}/${target_package}` with the freshly extracted contents. 
+/// The artifacts are produced by pack_artifact.py and contain a single +/// top-level directory named after the target package. +tl::expected extract_and_swap(const std::string & staged_tarball, const std::string & install_dir, + const std::string & target_package) { + if (target_package.empty()) { + return tl::make_unexpected("target_package is empty"); + } + const std::string staging_extracted = staged_tarball + ".extracted"; + std::error_code ec; + fs::remove_all(staging_extracted, ec); + fs::create_directories(staging_extracted, ec); + + const std::string cmd = "tar -xzf " + staged_tarball + " -C " + staging_extracted; + if (std::system(cmd.c_str()) != 0) { + return tl::make_unexpected("tar extraction failed: " + cmd); + } + + const std::string source = staging_extracted + "/" + target_package; + if (!fs::exists(source)) { + return tl::make_unexpected("artifact missing top-level directory '" + target_package + "' after extraction"); + } + + fs::create_directories(install_dir, ec); + const std::string target = install_dir + "/" + target_package; + fs::remove_all(target, ec); + fs::copy(source, target, fs::copy_options::recursive | fs::copy_options::overwrite_existing, ec); + if (ec) { + return tl::make_unexpected("copy failed: " + ec.message()); + } + return {}; +} + +} // namespace + +OtaUpdatePlugin::OtaUpdatePlugin() : process_runner_(std::make_unique()) { +} + +OtaUpdatePlugin::~OtaUpdatePlugin() = default; + +void OtaUpdatePlugin::configure(const nlohmann::json & config) { + catalog_url_ = config.value("catalog_url", "http://ota_update_server:9000"); + staging_dir_ = config.value("staging_dir", "/tmp/ota_staging"); + install_dir_ = config.value("install_dir", "/ws/install"); +} + +void OtaUpdatePlugin::set_context(ros2_medkit_gateway::PluginContext & /*context*/) { + poll_and_register_catalog(); +} + +void OtaUpdatePlugin::poll_and_register_catalog() { + if (!catalog_client_) { + catalog_client_ = std::make_unique(catalog_url_); + } + auto fetched = 
catalog_client_->fetch_catalog(); + if (!fetched) { + std::fprintf(stderr, "[ota_update_plugin] catalog fetch failed: %s\n", fetched.error().c_str()); + return; + } + if (!fetched->is_array()) { + std::fprintf(stderr, "[ota_update_plugin] catalog payload is not an array\n"); + return; + } + for (const auto & entry : *fetched) { + auto rc = register_update(entry); + if (!rc) { + const std::string id = entry.value("id", "?"); + std::fprintf(stderr, "[ota_update_plugin] register %s failed: %s\n", id.c_str(), rc.error().message.c_str()); + } + } +} + +void OtaUpdatePlugin::set_catalog_client_for_test(std::unique_ptr client) { + catalog_client_ = std::move(client); +} + +void OtaUpdatePlugin::set_process_runner_for_test(std::unique_ptr runner) { + process_runner_ = std::move(runner); +} + +tl::expected, UpdateBackendErrorInfo> OtaUpdatePlugin::list_updates( + const ros2_medkit_gateway::UpdateFilter & /*filter*/) { + std::lock_guard lk(mu_); + std::vector ids; + ids.reserve(registry_.size()); + for (const auto & kv : registry_) { + ids.push_back(kv.first); + } + return ids; +} + +tl::expected OtaUpdatePlugin::get_update(const std::string & id) { + std::lock_guard lk(mu_); + auto it = registry_.find(id); + if (it == registry_.end()) { + return tl::make_unexpected(UpdateBackendErrorInfo{UpdateBackendError::NotFound, "update not registered"}); + } + return it->second; +} + +tl::expected OtaUpdatePlugin::register_update(const nlohmann::json & metadata) { + if (!metadata.contains("id") || !metadata["id"].is_string()) { + return tl::make_unexpected(UpdateBackendErrorInfo{UpdateBackendError::InvalidInput, "metadata missing id"}); + } + std::lock_guard lk(mu_); + registry_[metadata["id"].get()] = metadata; + return {}; +} + +tl::expected OtaUpdatePlugin::delete_update(const std::string & id) { + std::lock_guard lk(mu_); + registry_.erase(id); + staged_artifacts_.erase(id); + return {}; +} + +tl::expected OtaUpdatePlugin::prepare( + const std::string & id, 
ros2_medkit_gateway::UpdateProgressReporter & reporter) { + nlohmann::json metadata; + { + std::lock_guard lk(mu_); + auto it = registry_.find(id); + if (it == registry_.end()) { + return tl::make_unexpected(UpdateBackendErrorInfo{UpdateBackendError::NotFound, "no such update"}); + } + metadata = it->second; + } + + const auto kind = OperationDispatcher::classify(metadata); + if (kind == OperationKind::Unknown) { + return tl::make_unexpected(UpdateBackendErrorInfo{ + UpdateBackendError::InvalidInput, + "update package must populate exactly one of " + "updated_components / added_components / removed_components"}); + } + + if (kind == OperationKind::Uninstall) { + reporter.set_progress(100); + return {}; + } + + if (!metadata.contains("x_medkit_artifact_url") || !metadata["x_medkit_artifact_url"].is_string()) { + return tl::make_unexpected( + UpdateBackendErrorInfo{UpdateBackendError::InvalidInput, "missing x_medkit_artifact_url"}); + } + + std::error_code ec; + fs::create_directories(staging_dir_, ec); + const std::string url = metadata["x_medkit_artifact_url"].get(); + const std::string staged_path = staging_dir_ + "/" + id + ".tar.gz"; + + reporter.set_progress(10); + if (!catalog_client_) { + catalog_client_ = std::make_unique(catalog_url_); + } + auto dl = catalog_client_->download_artifact(url, staged_path); + if (!dl) { + return tl::make_unexpected(UpdateBackendErrorInfo{UpdateBackendError::Internal, "download failed: " + dl.error()}); + } + reporter.set_progress(80); + + { + std::lock_guard lk(mu_); + staged_artifacts_[id] = *dl; + } + reporter.set_progress(100); + return {}; +} + +tl::expected OtaUpdatePlugin::execute( + const std::string & id, ros2_medkit_gateway::UpdateProgressReporter & reporter) { + nlohmann::json metadata; + std::string staged; + { + std::lock_guard lk(mu_); + auto it = registry_.find(id); + if (it == registry_.end()) { + return tl::make_unexpected(UpdateBackendErrorInfo{UpdateBackendError::NotFound, "no such update"}); + } + metadata = 
it->second; + auto sit = staged_artifacts_.find(id); + staged = (sit != staged_artifacts_.end()) ? sit->second : ""; + } + + const auto kind = OperationDispatcher::classify(metadata); + const std::string target_package = metadata.value("x_medkit_target_package", ""); + const std::string executable = metadata.value("x_medkit_executable", ""); + + if (kind == OperationKind::Update) { + if (staged.empty()) { + return tl::make_unexpected(UpdateBackendErrorInfo{UpdateBackendError::InvalidInput, "call prepare() first"}); + } + if (executable.empty()) { + return tl::make_unexpected( + UpdateBackendErrorInfo{UpdateBackendError::InvalidInput, "missing x_medkit_executable"}); + } + reporter.set_progress(20); + auto kr = process_runner_->kill_by_executable(executable); + if (!kr) { + return tl::make_unexpected(UpdateBackendErrorInfo{UpdateBackendError::Internal, "kill failed: " + kr.error()}); + } + reporter.set_progress(40); + if (auto sw = extract_and_swap(staged, install_dir_, target_package); !sw) { + return tl::make_unexpected(UpdateBackendErrorInfo{UpdateBackendError::Internal, "swap failed: " + sw.error()}); + } + reporter.set_progress(70); + const std::string bin = install_dir_ + "/" + target_package + "/lib/" + target_package + "/" + executable; + auto sp = process_runner_->spawn(bin); + if (!sp) { + return tl::make_unexpected(UpdateBackendErrorInfo{UpdateBackendError::Internal, "spawn failed: " + sp.error()}); + } + reporter.set_progress(100); + return {}; + } + + if (kind == OperationKind::Install) { + if (staged.empty()) { + return tl::make_unexpected(UpdateBackendErrorInfo{UpdateBackendError::InvalidInput, "call prepare() first"}); + } + if (executable.empty()) { + return tl::make_unexpected( + UpdateBackendErrorInfo{UpdateBackendError::InvalidInput, "missing x_medkit_executable"}); + } + reporter.set_progress(30); + if (auto sw = extract_and_swap(staged, install_dir_, target_package); !sw) { + return 
tl::make_unexpected(UpdateBackendErrorInfo{UpdateBackendError::Internal, "swap failed: " + sw.error()}); + } + reporter.set_progress(70); + const std::string bin = install_dir_ + "/" + target_package + "/lib/" + target_package + "/" + executable; + auto sp = process_runner_->spawn(bin); + if (!sp) { + return tl::make_unexpected(UpdateBackendErrorInfo{UpdateBackendError::Internal, "spawn failed: " + sp.error()}); + } + reporter.set_progress(100); + return {}; + } + + if (kind == OperationKind::Uninstall) { + reporter.set_progress(30); + if (!target_package.empty()) { + // Best-effort kill: legacy nodes may use the package name as their executable basename. + // Failures are tolerated since the process may already be gone. + auto kr = process_runner_->kill_by_executable(target_package); + (void)kr; + reporter.set_progress(70); + std::error_code ec; + fs::remove_all(install_dir_ + "/" + target_package, ec); + } + reporter.set_progress(100); + return {}; + } + + return tl::make_unexpected(UpdateBackendErrorInfo{UpdateBackendError::Internal, "unknown operation kind"}); +} + +tl::expected OtaUpdatePlugin::supports_automated(const std::string & /*id*/) { + return false; +} + +} // namespace ota_update_plugin diff --git a/demos/ota_nav2_sensor_fix/ota_update_plugin/src/plugin_exports.cpp b/demos/ota_nav2_sensor_fix/ota_update_plugin/src/plugin_exports.cpp new file mode 100644 index 0000000..a5916e0 --- /dev/null +++ b/demos/ota_nav2_sensor_fix/ota_update_plugin/src/plugin_exports.cpp @@ -0,0 +1,25 @@ +// Copyright 2026 bburda +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. 
/// C entry point returning the PLUGIN_API_VERSION constant from the gateway
/// headers this library was compiled against.
/// NOTE(review): presumably the gateway's plugin loader compares this with
/// its own version before calling create_plugin() — confirm against the loader.
extern "C" GATEWAY_PLUGIN_EXPORT int plugin_api_version() {
  return ros2_medkit_gateway::PLUGIN_API_VERSION;
}

/// C entry point that heap-allocates a new OtaUpdatePlugin and returns it as
/// a GatewayPlugin pointer. Nothing in this file frees it.
/// NOTE(review): ownership presumably passes to the caller — confirm.
extern "C" GATEWAY_PLUGIN_EXPORT ros2_medkit_gateway::GatewayPlugin * create_plugin() {
  return new ota_update_plugin::OtaUpdatePlugin();
}
+ +#include "process_runner.hpp" + +#include +#include +#include +#include +#include + +#include +#include +#include +#include +#include +#include +#include + +namespace ota_update_plugin { + +namespace { + +std::string proc_comm(int pid) { + std::ifstream f("/proc/" + std::to_string(pid) + "/comm"); + if (!f) { + return {}; + } + std::string line; + std::getline(f, line); + return line; +} + +bool is_pid_dir(const char * name) { + for (const char * p = name; *p; ++p) { + if (*p < '0' || *p > '9') { + return false; + } + } + return *name != '\0'; +} + +} // namespace + +std::vector ProcessRunner::pgrep(const std::string & executable_basename) { + std::vector out; + DIR * d = opendir("/proc"); + if (d == nullptr) { + return out; + } + while (auto * ent = readdir(d)) { + if (!is_pid_dir(ent->d_name)) { + continue; + } + const int pid = std::atoi(ent->d_name); + if (pid <= 0) { + continue; + } + if (proc_comm(pid) == executable_basename) { + out.push_back(pid); + } + } + closedir(d); + return out; +} + +tl::expected ProcessRunner::kill_by_executable(const std::string & executable_basename, + int timeout_ms) { + const auto pids = pgrep(executable_basename); + int signalled = 0; + for (int pid : pids) { + if (::kill(pid, SIGTERM) == 0) { + ++signalled; + } + } + if (signalled == 0) { + return 0; + } + + // Poll for exit. + const auto deadline = std::chrono::steady_clock::now() + std::chrono::milliseconds(timeout_ms); + while (std::chrono::steady_clock::now() < deadline) { + bool any_alive = false; + for (int pid : pids) { + if (::kill(pid, 0) == 0) { + any_alive = true; + break; + } + } + if (!any_alive) { + return signalled; + } + std::this_thread::sleep_for(std::chrono::milliseconds(50)); + } + // Force-kill stragglers. 
+ for (int pid : pids) { + if (::kill(pid, 0) == 0) { + ::kill(pid, SIGKILL); + } + } + return signalled; +} + +tl::expected ProcessRunner::spawn(const std::string & executable_path) { + pid_t pid = fork(); + if (pid < 0) { + return tl::make_unexpected(std::string("fork failed: ") + std::strerror(errno)); + } + if (pid == 0) { + // Child: detach from controlling terminal. + setsid(); + execl(executable_path.c_str(), executable_path.c_str(), nullptr); + std::fprintf(stderr, "execl %s failed: %s\n", executable_path.c_str(), std::strerror(errno)); + _exit(127); + } + return static_cast(pid); +} + +} // namespace ota_update_plugin diff --git a/demos/ota_nav2_sensor_fix/ota_update_plugin/src/process_runner.hpp b/demos/ota_nav2_sensor_fix/ota_update_plugin/src/process_runner.hpp new file mode 100644 index 0000000..70e88cb --- /dev/null +++ b/demos/ota_nav2_sensor_fix/ota_update_plugin/src/process_runner.hpp @@ -0,0 +1,48 @@ +// Copyright 2026 bburda +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#pragma once + +#include +#include + +#include + +namespace ota_update_plugin { + +/// Process management helper for OTA operations: locate, terminate, and spawn +/// demo nodes by executable basename. Pure-virtual for test substitution. 
+class ProcessRunner { + public: + ProcessRunner() = default; + virtual ~ProcessRunner() = default; + + ProcessRunner(const ProcessRunner &) = delete; + ProcessRunner & operator=(const ProcessRunner &) = delete; + ProcessRunner(ProcessRunner &&) = delete; + ProcessRunner & operator=(ProcessRunner &&) = delete; + + /// Find PIDs of processes whose /proc//comm matches the given basename. + virtual std::vector pgrep(const std::string & executable_basename); + + /// Send SIGTERM to all matching PIDs, wait up to `timeout_ms` for exit, then + /// SIGKILL any stragglers. Returns the number of processes that were signalled. + virtual tl::expected kill_by_executable(const std::string & executable_basename, + int timeout_ms = 2000); + + /// fork+exec the executable at `executable_path`. Returns child PID or error. + virtual tl::expected spawn(const std::string & executable_path); +}; + +} // namespace ota_update_plugin diff --git a/demos/ota_nav2_sensor_fix/ota_update_plugin/test/test_catalog_client.cpp b/demos/ota_nav2_sensor_fix/ota_update_plugin/test/test_catalog_client.cpp new file mode 100644 index 0000000..2a671e3 --- /dev/null +++ b/demos/ota_nav2_sensor_fix/ota_update_plugin/test/test_catalog_client.cpp @@ -0,0 +1,59 @@ +// Copyright 2026 bburda +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
+ +#include + +#include + +#include "catalog_client.hpp" + +using ota_update_plugin::parse_url; + +TEST(ParseUrl, HostAndPort) { + auto p = parse_url("http://server:9000"); + EXPECT_EQ(p.host, "server"); + EXPECT_EQ(p.port, 9000); + EXPECT_FALSE(p.tls); + EXPECT_EQ(p.path, "/"); +} + +TEST(ParseUrl, PathSplit) { + auto p = parse_url("http://server:9000/catalog"); + EXPECT_EQ(p.host, "server"); + EXPECT_EQ(p.port, 9000); + EXPECT_EQ(p.path, "/catalog"); +} + +TEST(ParseUrl, DefaultsHttpPort) { + auto p = parse_url("http://server/catalog"); + EXPECT_EQ(p.host, "server"); + EXPECT_EQ(p.port, 80); + EXPECT_FALSE(p.tls); + EXPECT_EQ(p.path, "/catalog"); +} + +TEST(ParseUrl, HttpsTls) { + auto p = parse_url("https://server/catalog"); + EXPECT_TRUE(p.tls); + EXPECT_EQ(p.port, 443); + EXPECT_EQ(p.path, "/catalog"); +} + +TEST(ParseUrl, RejectsInvalidScheme) { + EXPECT_THROW(parse_url("ftp://server/foo"), std::invalid_argument); +} + +TEST(ParseUrl, RejectsMissingHost) { + EXPECT_THROW(parse_url("http://:9000/foo"), std::invalid_argument); +} diff --git a/demos/ota_nav2_sensor_fix/ota_update_plugin/test/test_operation_dispatcher.cpp b/demos/ota_nav2_sensor_fix/ota_update_plugin/test/test_operation_dispatcher.cpp new file mode 100644 index 0000000..453346a --- /dev/null +++ b/demos/ota_nav2_sensor_fix/ota_update_plugin/test/test_operation_dispatcher.cpp @@ -0,0 +1,60 @@ +// Copyright 2026 bburda +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
+ +#include +#include + +#include "operation_dispatcher.hpp" + +using ota_update_plugin::OperationDispatcher; +using ota_update_plugin::OperationKind; + +TEST(OperationDispatcher, UpdateFromUpdatedComponents) { + nlohmann::json m = {{"id", "x"}, {"updated_components", {"scan_sensor_node"}}}; + EXPECT_EQ(OperationDispatcher::classify(m), OperationKind::Update); +} + +TEST(OperationDispatcher, InstallFromAddedComponents) { + nlohmann::json m = {{"id", "x"}, {"added_components", {"obstacle_classifier"}}}; + EXPECT_EQ(OperationDispatcher::classify(m), OperationKind::Install); +} + +TEST(OperationDispatcher, UninstallFromRemovedComponents) { + nlohmann::json m = {{"id", "x"}, {"removed_components", {"broken_lidar_legacy"}}}; + EXPECT_EQ(OperationDispatcher::classify(m), OperationKind::Uninstall); +} + +TEST(OperationDispatcher, UnknownWhenAllEmpty) { + nlohmann::json m = {{"id", "x"}}; + EXPECT_EQ(OperationDispatcher::classify(m), OperationKind::Unknown); +} + +TEST(OperationDispatcher, UnknownWhenMixed) { + nlohmann::json m = { + {"id", "x"}, + {"added_components", {"a"}}, + {"removed_components", {"b"}}, + }; + EXPECT_EQ(OperationDispatcher::classify(m), OperationKind::Unknown); +} + +TEST(OperationDispatcher, UnknownWhenComponentsAreEmptyArray) { + nlohmann::json m = {{"id", "x"}, {"updated_components", nlohmann::json::array()}}; + EXPECT_EQ(OperationDispatcher::classify(m), OperationKind::Unknown); +} + +TEST(OperationDispatcher, UnknownWhenComponentsIsNotArray) { + nlohmann::json m = {{"id", "x"}, {"updated_components", "scan_sensor_node"}}; + EXPECT_EQ(OperationDispatcher::classify(m), OperationKind::Unknown); +} diff --git a/demos/ota_nav2_sensor_fix/ota_update_plugin/test/test_plugin_smoke.cpp b/demos/ota_nav2_sensor_fix/ota_update_plugin/test/test_plugin_smoke.cpp new file mode 100644 index 0000000..09e109c --- /dev/null +++ b/demos/ota_nav2_sensor_fix/ota_update_plugin/test/test_plugin_smoke.cpp @@ -0,0 +1,141 @@ +// Copyright 2026 bburda +// +// Licensed 
under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include +#include +#include +#include +#include + +#include + +#include + +#include "catalog_client.hpp" +#include "ota_update_plugin/ota_update_plugin.hpp" + +namespace { + +class FakeCatalogClient : public ota_update_plugin::CatalogClient { + public: + using CatalogClient::CatalogClient; + + nlohmann::json catalog_payload = nlohmann::json::array(); + std::string artifact_to_return = "TARDATA"; + std::string requested_url; + + tl::expected fetch_catalog() override { + return catalog_payload; + } + + tl::expected download_artifact(const std::string & url, const std::string & out) override { + requested_url = url; + std::ofstream o(out, std::ios::binary); + o << artifact_to_return; + return out; + } +}; + +ros2_medkit_gateway::UpdateProgressReporter make_reporter(ros2_medkit_gateway::UpdateStatusInfo & info, + std::mutex & mu) { + return ros2_medkit_gateway::UpdateProgressReporter(info, mu); +} + +} // namespace + +TEST(OtaUpdatePluginSmoke, NameAndConstructible) { + ota_update_plugin::OtaUpdatePlugin plugin; + EXPECT_EQ(plugin.name(), "ota_update_plugin"); +} + +TEST(OtaUpdatePluginSmoke, RegisterListGet) { + ota_update_plugin::OtaUpdatePlugin plugin; + nlohmann::json md = {{"id", "u1"}, {"updated_components", {"x"}}}; + ASSERT_TRUE(plugin.register_update(md)); + auto ids = plugin.list_updates({}); + ASSERT_TRUE(ids); + ASSERT_EQ(ids->size(), 1u); + EXPECT_EQ((*ids)[0], "u1"); + auto got = plugin.get_update("u1"); + 
ASSERT_TRUE(got); + EXPECT_EQ((*got)["id"], "u1"); +} + +TEST(OtaUpdatePluginSmoke, RegisterRequiresId) { + ota_update_plugin::OtaUpdatePlugin plugin; + auto rc = plugin.register_update(nlohmann::json::object()); + EXPECT_FALSE(rc); + EXPECT_EQ(rc.error().code, ros2_medkit_gateway::UpdateBackendError::InvalidInput); +} + +TEST(OtaUpdatePluginSmoke, GetUpdateReturnsNotFoundForUnknownId) { + ota_update_plugin::OtaUpdatePlugin plugin; + auto got = plugin.get_update("does-not-exist"); + ASSERT_FALSE(got); + EXPECT_EQ(got.error().code, ros2_medkit_gateway::UpdateBackendError::NotFound); +} + +TEST(OtaUpdatePluginSmoke, DeleteRemovesEntry) { + ota_update_plugin::OtaUpdatePlugin plugin; + ASSERT_TRUE(plugin.register_update({{"id", "to-delete"}, {"updated_components", {"x"}}})); + ASSERT_TRUE(plugin.delete_update("to-delete")); + auto got = plugin.get_update("to-delete"); + EXPECT_FALSE(got); +} + +TEST(OtaUpdatePluginSmoke, BootPollPopulates) { + ota_update_plugin::OtaUpdatePlugin plugin; + plugin.configure(nlohmann::json::object()); + auto fake = std::make_unique("http://x"); + fake->catalog_payload = nlohmann::json::array({ + {{"id", "a"}, + {"updated_components", {"scan"}}, + {"x_medkit_artifact_url", "/artifacts/a.tgz"}, + {"x_medkit_target_package", "a"}}, + }); + plugin.set_catalog_client_for_test(std::move(fake)); + plugin.poll_and_register_catalog(); + + auto ids = plugin.list_updates({}); + ASSERT_TRUE(ids); + ASSERT_EQ(ids->size(), 1u); + EXPECT_EQ((*ids)[0], "a"); +} + +TEST(OtaUpdatePluginSmoke, PrepareRejectsUnknownOperationKind) { + ota_update_plugin::OtaUpdatePlugin plugin; + ASSERT_TRUE(plugin.register_update({{"id", "bad"}})); + ros2_medkit_gateway::UpdateStatusInfo info; + std::mutex mu; + auto reporter = make_reporter(info, mu); + auto rc = plugin.prepare("bad", reporter); + ASSERT_FALSE(rc); + EXPECT_EQ(rc.error().code, ros2_medkit_gateway::UpdateBackendError::InvalidInput); +} + +TEST(OtaUpdatePluginSmoke, PrepareUninstallSkipsDownload) { + 
ota_update_plugin::OtaUpdatePlugin plugin; + plugin.configure(nlohmann::json::object()); + // No download should happen for uninstall, but provide a fake just in case. + auto fake = std::make_unique("http://x"); + plugin.set_catalog_client_for_test(std::move(fake)); + ASSERT_TRUE(plugin.register_update({{"id", "rm"}, {"removed_components", {"legacy"}}})); + + ros2_medkit_gateway::UpdateStatusInfo info; + std::mutex mu; + auto reporter = make_reporter(info, mu); + auto rc = plugin.prepare("rm", reporter); + EXPECT_TRUE(rc); + EXPECT_EQ(info.progress.value_or(-1), 100); +} From 78816db4d85dec09423627cb150e66b59aecca7e Mon Sep 17 00:00:00 2001 From: Bartosz Burda Date: Sun, 26 Apr 2026 18:37:40 +0200 Subject: [PATCH 22/52] fix(ota_plugin): double-fork to avoid zombies, init catalog client in configure, add -Wshadow -Wconversion --- .../ota_update_plugin/CMakeLists.txt | 2 +- .../src/ota_update_plugin.cpp | 9 +++----- .../ota_update_plugin/src/process_runner.cpp | 22 ++++++++++++++----- 3 files changed, 21 insertions(+), 12 deletions(-) diff --git a/demos/ota_nav2_sensor_fix/ota_update_plugin/CMakeLists.txt b/demos/ota_nav2_sensor_fix/ota_update_plugin/CMakeLists.txt index 9798bd7..050ca09 100644 --- a/demos/ota_nav2_sensor_fix/ota_update_plugin/CMakeLists.txt +++ b/demos/ota_nav2_sensor_fix/ota_update_plugin/CMakeLists.txt @@ -16,7 +16,7 @@ cmake_minimum_required(VERSION 3.16) project(ota_update_plugin CXX) if(CMAKE_COMPILER_IS_GNUCXX OR CMAKE_CXX_COMPILER_ID MATCHES "Clang") - add_compile_options(-Wall -Wextra -Wpedantic) + add_compile_options(-Wall -Wextra -Wpedantic -Wshadow -Wconversion) endif() set(CMAKE_CXX_STANDARD 17) diff --git a/demos/ota_nav2_sensor_fix/ota_update_plugin/src/ota_update_plugin.cpp b/demos/ota_nav2_sensor_fix/ota_update_plugin/src/ota_update_plugin.cpp index 6b170c2..23a15aa 100644 --- a/demos/ota_nav2_sensor_fix/ota_update_plugin/src/ota_update_plugin.cpp +++ b/demos/ota_nav2_sensor_fix/ota_update_plugin/src/ota_update_plugin.cpp @@ -76,6 
+76,9 @@ void OtaUpdatePlugin::configure(const nlohmann::json & config) { catalog_url_ = config.value("catalog_url", "http://ota_update_server:9000"); staging_dir_ = config.value("staging_dir", "/tmp/ota_staging"); install_dir_ = config.value("install_dir", "/ws/install"); + if (!catalog_client_) { + catalog_client_ = std::make_unique(catalog_url_); + } } void OtaUpdatePlugin::set_context(ros2_medkit_gateway::PluginContext & /*context*/) { @@ -83,9 +86,6 @@ void OtaUpdatePlugin::set_context(ros2_medkit_gateway::PluginContext & /*context } void OtaUpdatePlugin::poll_and_register_catalog() { - if (!catalog_client_) { - catalog_client_ = std::make_unique(catalog_url_); - } auto fetched = catalog_client_->fetch_catalog(); if (!fetched) { std::fprintf(stderr, "[ota_update_plugin] catalog fetch failed: %s\n", fetched.error().c_str()); @@ -184,9 +184,6 @@ tl::expected OtaUpdatePlugin::prepare( const std::string staged_path = staging_dir_ + "/" + id + ".tar.gz"; reporter.set_progress(10); - if (!catalog_client_) { - catalog_client_ = std::make_unique(catalog_url_); - } auto dl = catalog_client_->download_artifact(url, staged_path); if (!dl) { return tl::make_unexpected(UpdateBackendErrorInfo{UpdateBackendError::Internal, "download failed: " + dl.error()}); diff --git a/demos/ota_nav2_sensor_fix/ota_update_plugin/src/process_runner.cpp b/demos/ota_nav2_sensor_fix/ota_update_plugin/src/process_runner.cpp index 2c6741f..3568d2c 100644 --- a/demos/ota_nav2_sensor_fix/ota_update_plugin/src/process_runner.cpp +++ b/demos/ota_nav2_sensor_fix/ota_update_plugin/src/process_runner.cpp @@ -113,17 +113,29 @@ tl::expected ProcessRunner::kill_by_executable(const std::stri } tl::expected ProcessRunner::spawn(const std::string & executable_path) { + // Double-fork so the grandchild is reparented to init and never becomes a + // zombie in the gateway process. The intermediate child exits immediately + // and is reaped here. 
pid_t pid = fork(); if (pid < 0) { return tl::make_unexpected(std::string("fork failed: ") + std::strerror(errno)); } if (pid == 0) { - // Child: detach from controlling terminal. - setsid(); - execl(executable_path.c_str(), executable_path.c_str(), nullptr); - std::fprintf(stderr, "execl %s failed: %s\n", executable_path.c_str(), std::strerror(errno)); - _exit(127); + pid_t grandchild = fork(); + if (grandchild < 0) { + _exit(126); + } + if (grandchild == 0) { + setsid(); + execl(executable_path.c_str(), executable_path.c_str(), nullptr); + std::fprintf(stderr, "execl %s failed: %s\n", executable_path.c_str(), + std::strerror(errno)); + _exit(127); + } + _exit(0); } + int status = 0; + ::waitpid(pid, &status, 0); return static_cast(pid); } From 3517bb80b42873cf0ed753b382ba6eca5a8849fe Mon Sep 17 00:00:00 2001 From: Bartosz Burda Date: Sun, 26 Apr 2026 18:52:58 +0200 Subject: [PATCH 23/52] feat(demos/ota): thread x_medkit_replaces_executable for update kind Adds optional --replaces-executable flag to pack_artifact.py and threads it into the catalog entry as x_medkit_replaces_executable when kind=update. This lets the gateway plugin kill the OLD executable (broken_lidar_node) before spawning the NEW one (fixed_lidar_node) when the two live in separate ROS 2 packages. 
--- .../scripts/build_artifacts.sh | 1 + .../ota_nav2_sensor_fix/scripts/pack_artifact.py | 14 ++++++++++++++ .../scripts/test_pack_artifact.py | 16 ++++++++++++++++ 3 files changed, 31 insertions(+) diff --git a/demos/ota_nav2_sensor_fix/scripts/build_artifacts.sh b/demos/ota_nav2_sensor_fix/scripts/build_artifacts.sh index 6588466..c34ce0e 100755 --- a/demos/ota_nav2_sensor_fix/scripts/build_artifacts.sh +++ b/demos/ota_nav2_sensor_fix/scripts/build_artifacts.sh @@ -25,6 +25,7 @@ env -i PATH=/usr/bin:/bin HOME="$HOME" "${PACK[@]}" \ --package fixed_lidar --version 2.1.0 \ --kind update --target-component scan_sensor_node \ --executable fixed_lidar_node \ + --replaces-executable broken_lidar_node \ --notes "Fix /scan noise filter" \ --skip-build --workspace "$WS" \ --out-dir "$ARTIFACTS" --catalog "$ARTIFACTS/catalog.json" diff --git a/demos/ota_nav2_sensor_fix/scripts/pack_artifact.py b/demos/ota_nav2_sensor_fix/scripts/pack_artifact.py index 06f3df1..2f80f41 100644 --- a/demos/ota_nav2_sensor_fix/scripts/pack_artifact.py +++ b/demos/ota_nav2_sensor_fix/scripts/pack_artifact.py @@ -39,6 +39,14 @@ def build_parser() -> argparse.ArgumentParser: default="", help="Executable name inside install//lib (required for install).", ) + parser.add_argument( + "--replaces-executable", + default="", + help=( + "For kind=update: name of the OLD executable to kill before " + "spawning --executable. Defaults to --executable when omitted." 
+ ), + ) parser.add_argument("--notes", default="", help="Free-text notes for the catalog entry.") parser.add_argument( "--duration", @@ -83,6 +91,7 @@ def build_entry( notes: str, duration: int, size_bytes: int, + replaces_executable: str = "", ) -> dict: entry: dict = { "id": slug(package, version) if kind != "uninstall" else f"{package}_remove", @@ -112,6 +121,9 @@ def build_entry( else: entry["x_medkit_target_package"] = package + if kind == "update" and replaces_executable: + entry["x_medkit_replaces_executable"] = replaces_executable + return entry @@ -165,6 +177,7 @@ def run( catalog: str, skip_build: bool, workspace: str, + replaces_executable: str = "", ) -> int: if kind == "install" and not executable: sys.stderr.write("--executable is required for install\n") @@ -202,6 +215,7 @@ def run( notes=notes, duration=duration, size_bytes=size_bytes, + replaces_executable=replaces_executable, ) merge_catalog(catalog_p, entry) print(f"packed {entry['id']}") diff --git a/demos/ota_nav2_sensor_fix/scripts/test_pack_artifact.py b/demos/ota_nav2_sensor_fix/scripts/test_pack_artifact.py index bced302..fd6b0c8 100644 --- a/demos/ota_nav2_sensor_fix/scripts/test_pack_artifact.py +++ b/demos/ota_nav2_sensor_fix/scripts/test_pack_artifact.py @@ -70,6 +70,22 @@ def test_build_entry_update_kind(): assert entry["x_medkit_target_package"] == "fixed_lidar" assert entry["x_medkit_executable"] == "fixed_lidar_node" assert entry["x_medkit_artifact_url"] == "/artifacts/fixed_lidar-2.1.0.tar.gz" + assert "x_medkit_replaces_executable" not in entry + + +def test_build_entry_update_kind_with_replaces(): + entry = pack_artifact.build_entry( + package="fixed_lidar", + version="2.1.0", + kind="update", + target_component="scan_sensor_node", + executable="fixed_lidar_node", + replaces_executable="broken_lidar_node", + notes="", + duration=10, + size_bytes=1024, + ) + assert entry["x_medkit_replaces_executable"] == "broken_lidar_node" def test_build_entry_install_kind(): From 
3bb6b1bd3a4f511b21fb21978b578dd7423d3ede Mon Sep 17 00:00:00 2001 From: Bartosz Burda Date: Sun, 26 Apr 2026 18:54:41 +0200 Subject: [PATCH 24/52] fix(ota_plugin): honor x_medkit_replaces_executable when killing old process When a SOVD update package swaps a node across ROS 2 packages (e.g. broken_lidar -> fixed_lidar), the OLD process binary basename differs from the new one. Read x_medkit_replaces_executable from the entry metadata before issuing the kill, falling back to x_medkit_executable when the field is absent (in-package upgrades). --- .../src/ota_update_plugin.cpp | 7 +- .../test/test_plugin_smoke.cpp | 83 +++++++++++++++++++ 2 files changed, 89 insertions(+), 1 deletion(-) diff --git a/demos/ota_nav2_sensor_fix/ota_update_plugin/src/ota_update_plugin.cpp b/demos/ota_nav2_sensor_fix/ota_update_plugin/src/ota_update_plugin.cpp index 23a15aa..f081a10 100644 --- a/demos/ota_nav2_sensor_fix/ota_update_plugin/src/ota_update_plugin.cpp +++ b/demos/ota_nav2_sensor_fix/ota_update_plugin/src/ota_update_plugin.cpp @@ -225,8 +225,13 @@ tl::expected OtaUpdatePlugin::execute( return tl::make_unexpected( UpdateBackendErrorInfo{UpdateBackendError::InvalidInput, "missing x_medkit_executable"}); } + // For an update across packages (e.g. broken_lidar -> fixed_lidar) the + // OLD process binary lives in a different package than the NEW one we + // are about to spawn, so its basename differs from `executable`. Honor + // x_medkit_replaces_executable when present, fall back to executable. 
+ const std::string kill_target = metadata.value("x_medkit_replaces_executable", executable); reporter.set_progress(20); - auto kr = process_runner_->kill_by_executable(executable); + auto kr = process_runner_->kill_by_executable(kill_target); if (!kr) { return tl::make_unexpected(UpdateBackendErrorInfo{UpdateBackendError::Internal, "kill failed: " + kr.error()}); } diff --git a/demos/ota_nav2_sensor_fix/ota_update_plugin/test/test_plugin_smoke.cpp b/demos/ota_nav2_sensor_fix/ota_update_plugin/test/test_plugin_smoke.cpp index 09e109c..438bba7 100644 --- a/demos/ota_nav2_sensor_fix/ota_update_plugin/test/test_plugin_smoke.cpp +++ b/demos/ota_nav2_sensor_fix/ota_update_plugin/test/test_plugin_smoke.cpp @@ -24,6 +24,7 @@ #include "catalog_client.hpp" #include "ota_update_plugin/ota_update_plugin.hpp" +#include "process_runner.hpp" namespace { @@ -47,6 +48,29 @@ class FakeCatalogClient : public ota_update_plugin::CatalogClient { } }; +/// ProcessRunner stub: records the basename passed to kill_by_executable so a +/// test can verify the plugin honors x_medkit_replaces_executable. Returns 0 +/// signalled processes (no-op kill) and an error from spawn so execute() halts +/// before touching the (nonexistent) install dir. 
+class RecordingProcessRunner : public ota_update_plugin::ProcessRunner { + public: + std::string last_kill_target; + + std::vector pgrep(const std::string & /*executable_basename*/) override { + return {}; + } + + tl::expected kill_by_executable(const std::string & executable_basename, + int /*timeout_ms*/ = 2000) override { + last_kill_target = executable_basename; + return 0; + } + + tl::expected spawn(const std::string & /*executable_path*/) override { + return tl::make_unexpected(std::string("stub: spawn intentionally not implemented")); + } +}; + ros2_medkit_gateway::UpdateProgressReporter make_reporter(ros2_medkit_gateway::UpdateStatusInfo & info, std::mutex & mu) { return ros2_medkit_gateway::UpdateProgressReporter(info, mu); @@ -139,3 +163,62 @@ TEST(OtaUpdatePluginSmoke, PrepareUninstallSkipsDownload) { EXPECT_TRUE(rc); EXPECT_EQ(info.progress.value_or(-1), 100); } + +TEST(OtaUpdatePluginSmoke, ExecuteUpdateUsesReplacesExecutableForKill) { + ota_update_plugin::OtaUpdatePlugin plugin; + plugin.configure({{"staging_dir", ::testing::TempDir() + "/replaces_test"}}); + plugin.set_catalog_client_for_test(std::make_unique("http://x")); + auto runner = std::make_unique(); + RecordingProcessRunner * runner_raw = runner.get(); + plugin.set_process_runner_for_test(std::move(runner)); + + // Update entry with separate old + new executable basenames. 
+ ASSERT_TRUE(plugin.register_update({ + {"id", "u_replaces"}, + {"updated_components", {"scan_sensor_node"}}, + {"x_medkit_artifact_url", "/artifacts/fixed.tgz"}, + {"x_medkit_target_package", "fixed_lidar"}, + {"x_medkit_executable", "fixed_lidar_node"}, + {"x_medkit_replaces_executable", "broken_lidar_node"}, + })); + + ros2_medkit_gateway::UpdateStatusInfo info; + std::mutex mu; + auto reporter = make_reporter(info, mu); + ASSERT_TRUE(plugin.prepare("u_replaces", reporter)); + + // execute() will fail at extract_and_swap (the staged tarball is not a real + // gzipped archive) but the kill step runs first - that is what we are + // checking here. + auto rc = plugin.execute("u_replaces", reporter); + (void)rc; + EXPECT_EQ(runner_raw->last_kill_target, "broken_lidar_node"); +} + +TEST(OtaUpdatePluginSmoke, ExecuteUpdateFallsBackToExecutableWhenReplacesMissing) { + ota_update_plugin::OtaUpdatePlugin plugin; + plugin.configure({{"staging_dir", ::testing::TempDir() + "/replaces_fallback"}}); + plugin.set_catalog_client_for_test(std::make_unique("http://x")); + auto runner = std::make_unique(); + RecordingProcessRunner * runner_raw = runner.get(); + plugin.set_process_runner_for_test(std::move(runner)); + + // Update entry without x_medkit_replaces_executable - kill should target + // the same name as x_medkit_executable. 
+ ASSERT_TRUE(plugin.register_update({ + {"id", "u_no_replaces"}, + {"updated_components", {"scan_sensor_node"}}, + {"x_medkit_artifact_url", "/artifacts/scan.tgz"}, + {"x_medkit_target_package", "scan_pkg"}, + {"x_medkit_executable", "scan_node"}, + })); + + ros2_medkit_gateway::UpdateStatusInfo info; + std::mutex mu; + auto reporter = make_reporter(info, mu); + ASSERT_TRUE(plugin.prepare("u_no_replaces", reporter)); + + auto rc = plugin.execute("u_no_replaces", reporter); + (void)rc; + EXPECT_EQ(runner_raw->last_kill_target, "scan_node"); +} From f088c7eaa1693007a67617704608aafde801e36d Mon Sep 17 00:00:00 2001 From: Bartosz Burda Date: Sun, 26 Apr 2026 18:58:38 +0200 Subject: [PATCH 25/52] feat(demos/ota): docker compose stack + gateway config + entrypoint + README --- demos/ota_nav2_sensor_fix/Dockerfile.gateway | 72 +++++++++++++++ demos/ota_nav2_sensor_fix/README.md | 88 +++++++++++++++++++ demos/ota_nav2_sensor_fix/docker-compose.yml | 37 ++++++++ demos/ota_nav2_sensor_fix/entrypoint.sh | 27 ++++++ demos/ota_nav2_sensor_fix/gateway_config.yaml | 38 ++++++++ 5 files changed, 262 insertions(+) create mode 100644 demos/ota_nav2_sensor_fix/Dockerfile.gateway create mode 100644 demos/ota_nav2_sensor_fix/README.md create mode 100644 demos/ota_nav2_sensor_fix/docker-compose.yml create mode 100755 demos/ota_nav2_sensor_fix/entrypoint.sh create mode 100644 demos/ota_nav2_sensor_fix/gateway_config.yaml diff --git a/demos/ota_nav2_sensor_fix/Dockerfile.gateway b/demos/ota_nav2_sensor_fix/Dockerfile.gateway new file mode 100644 index 0000000..01f4b36 --- /dev/null +++ b/demos/ota_nav2_sensor_fix/Dockerfile.gateway @@ -0,0 +1,72 @@ +# Copyright 2026 bburda +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. 
+# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Builds the ros2_medkit gateway, the ota_update_plugin, and the four demo +# ROS 2 packages into a single ROS 2 Jazzy image. Plugin loads at gateway +# startup via /etc/ros2_medkit/gateway_config.yaml and the entrypoint also +# launches the broken_lidar demo nodes that get swapped/uninstalled at +# runtime by the plugin. + +FROM ros:jazzy AS builder + +ARG GATEWAY_REPO=https://github.com/selfpatch/ros2_medkit.git +ARG GATEWAY_REF=main + +RUN apt-get update && apt-get install -y --no-install-recommends \ + git \ + python3-colcon-common-extensions \ + python3-rosdep \ + build-essential \ + cmake \ + curl \ + ca-certificates \ + && rm -rf /var/lib/apt/lists/* + +RUN rosdep init || true +RUN rosdep update --rosdistro=jazzy + +WORKDIR /ws/src +RUN git clone --depth=1 --branch ${GATEWAY_REF} ${GATEWAY_REPO} ros2_medkit + +# Copy demo packages (broken_lidar, fixed_lidar, broken_lidar_legacy, +# obstacle_classifier_v2) and the OTA plugin from the build context. +COPY ros2_packages /tmp/ros2_packages +RUN cp -r /tmp/ros2_packages/. /ws/src/ && rm -rf /tmp/ros2_packages +COPY ota_update_plugin /ws/src/ota_update_plugin + +WORKDIR /ws +# rosdep needs the apt cache populated to install gateway dependencies +# (nlohmann-json3-dev, libcpp-httplib-dev, etc.). +RUN apt-get update +RUN . 
/opt/ros/jazzy/setup.sh && \ + rosdep install --from-paths src --ignore-src -r -y --rosdistro=jazzy && \ + colcon build --symlink-install \ + --cmake-args -DCMAKE_BUILD_TYPE=Release && \ + rm -rf /var/lib/apt/lists/* + + +FROM ros:jazzy + +RUN apt-get update && apt-get install -y --no-install-recommends \ + ros-jazzy-rclcpp \ + ros-jazzy-sensor-msgs \ + ros-jazzy-visualization-msgs \ + ros-jazzy-launch-ros \ + curl \ + procps \ + && rm -rf /var/lib/apt/lists/* + +COPY --from=builder /ws/install /ws/install +COPY gateway_config.yaml /etc/ros2_medkit/gateway_config.yaml +COPY entrypoint.sh /usr/local/bin/entrypoint.sh +RUN chmod +x /usr/local/bin/entrypoint.sh + +ENV ROS_DOMAIN_ID=42 + +EXPOSE 8080 +ENTRYPOINT ["/usr/local/bin/entrypoint.sh"] diff --git a/demos/ota_nav2_sensor_fix/README.md b/demos/ota_nav2_sensor_fix/README.md new file mode 100644 index 0000000..77e7ecf --- /dev/null +++ b/demos/ota_nav2_sensor_fix/README.md @@ -0,0 +1,88 @@ +# OTA over SOVD - nav2 sensor fix demo + +End-to-end demo: a `ros2_medkit` gateway with a dev-grade OTA plugin that +demonstrates the full Update / Install / Uninstall lifecycle on ROS 2 nodes +without SSH-ing into the robot. + +## What this shows + +Three things you can do to a ROS 2 robot over the air: + +1. **Update** - swap a running sensor node with a fixed version (the + `broken_lidar` -> `fixed_lidar` flip). +2. **Install** - pull and start a new ROS 2 package + (`obstacle_classifier_v2`). +3. **Uninstall** - stop and remove a deprecated package + (`broken_lidar_legacy`). + +All three operations are SOVD ISO 17978-3 compliant - the kind is derived +from `updated_components` / `added_components` / `removed_components` in the +update package metadata. + +## Quickstart + +```bash +# 1. Build artifacts (compiles fixed_lidar + obstacle_classifier_v2, +# generates catalog.json + tarballs). +./scripts/build_artifacts.sh + +# 2. Start the stack (gateway + plugin + demo nodes + update server). 
+docker compose up --build +``` + +In another terminal, drive the demo: + +```bash +# List the registered updates. +curl -s http://localhost:8080/api/v1/updates | jq '.[].id' + +# Run an update: prepare downloads the artifact, execute swaps + restarts. +curl -X PUT http://localhost:8080/api/v1/updates/fixed_lidar_2_1_0/prepare +curl -X PUT http://localhost:8080/api/v1/updates/fixed_lidar_2_1_0/execute + +# Install a new package. +curl -X PUT http://localhost:8080/api/v1/updates/obstacle_classifier_v2_1_0_0/prepare +curl -X PUT http://localhost:8080/api/v1/updates/obstacle_classifier_v2_1_0_0/execute + +# Uninstall a deprecated one. +curl -X PUT http://localhost:8080/api/v1/updates/broken_lidar_legacy_remove/execute +``` + +Tear down: `docker compose down`. + +## Adding a Foxglove visualization + +Install the `ros2_medkit_foxglove_extension` (which now ships an Updates +panel - see https://github.com/selfpatch/ros2_medkit_foxglove_extension) +in your local Foxglove Studio, then point it at +`http://localhost:8080/api/v1`. The Updates panel exposes Prepare and +Execute buttons next to each catalog entry. + +## Adding nav2 / a sim + +This demo intentionally omits a nav2 sim from the compose so the stack stays +small and reliably reproducible. To make the visual story complete: + +- Bring up your favourite turtlebot3 sim (`turtlebot3_gazebo`) and point it + at `ROS_DOMAIN_ID=42` to share the DDS namespace with the gateway. +- The broken_lidar node publishes a phantom return on `/scan` ~1m straight + ahead. nav2's costmap will trace it as an obstacle and the planner will + refuse to drive forward. After the update flow, fixed_lidar publishes a + clean scan and the path planner unblocks. + +## Disclosures + +This is **dev-grade** OTA. 
Deliberately missing for production: + +- No artifact signing or signature verification +- No atomic swap (in-place overwrite) +- No A/B partition rollout +- No fleet-wide staged rollout +- No persistent update state across gateway restarts +- No automated health-gated rollback policy +- No audit log + +Perfect for: prototypes, lab robots, internal demos, dev environments. + +For production-grade OTA (rollout safety, signing, A/B partitions, +fleet-aware staging), reach out. diff --git a/demos/ota_nav2_sensor_fix/docker-compose.yml b/demos/ota_nav2_sensor_fix/docker-compose.yml new file mode 100644 index 0000000..c1bb262 --- /dev/null +++ b/demos/ota_nav2_sensor_fix/docker-compose.yml @@ -0,0 +1,37 @@ +# Copyright 2026 bburda +# Apache 2.0 +# +# Two-service stack: the gateway (with ota_update_plugin baked in plus the +# demo nodes the plugin will manage) and the FastAPI artifact server. nav2 +# and Foxglove are intentionally out of scope here - see README for how to +# bring your own. + +services: + gateway: + image: selfpatch/ota_demo_gateway:dev + build: + context: . + dockerfile: Dockerfile.gateway + container_name: ota_demo_gateway + networks: [otanet] + ports: + - "8080:8080" + environment: + ROS_DOMAIN_ID: 42 + depends_on: + - ota_update_server + + ota_update_server: + image: selfpatch/ota_update_server:dev + build: + context: ./ota_update_server + container_name: ota_demo_update_server + networks: [otanet] + ports: + - "9000:9000" + volumes: + - ./artifacts:/artifacts:ro + +networks: + otanet: + driver: bridge diff --git a/demos/ota_nav2_sensor_fix/entrypoint.sh b/demos/ota_nav2_sensor_fix/entrypoint.sh new file mode 100755 index 0000000..5cf39f1 --- /dev/null +++ b/demos/ota_nav2_sensor_fix/entrypoint.sh @@ -0,0 +1,27 @@ +#!/usr/bin/env bash +# Copyright 2026 bburda +# Apache 2.0 +# +# Container entrypoint: launches the demo nodes that the OTA plugin will +# manage at runtime, then execs the gateway so it replaces this shell as PID 1.
+ +set -e + +# shellcheck disable=SC1091 +source /opt/ros/jazzy/setup.bash +# shellcheck disable=SC1091 +source /ws/install/setup.bash + +# Demo nodes the plugin will swap (broken_lidar -> fixed_lidar) and +# uninstall (broken_lidar_legacy). obstacle_classifier_v2 is installed +# fresh by the demo and not started here. +ros2 run broken_lidar broken_lidar_node & +ros2 run broken_lidar_legacy broken_lidar_legacy & + +# Foreground gateway. Pass the config file directly to the gateway_node +# executable (the gateway.launch.py wrapper does not expose a config_file +# argument, so we invoke the executable directly to thread our YAML in). +exec ros2 run ros2_medkit_gateway gateway_node \ + --ros-args \ + --params-file /etc/ros2_medkit/gateway_config.yaml \ + --log-level info diff --git a/demos/ota_nav2_sensor_fix/gateway_config.yaml b/demos/ota_nav2_sensor_fix/gateway_config.yaml new file mode 100644 index 0000000..3cbda32 --- /dev/null +++ b/demos/ota_nav2_sensor_fix/gateway_config.yaml @@ -0,0 +1,38 @@ +# Copyright 2026 bburda +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# +# Gateway configuration for the OTA nav2 sensor-fix demo. +# Enables /updates endpoints and loads ota_update_plugin which polls the +# update server's /catalog at boot and exposes Update / Install / Uninstall +# operations over the SOVD HTTP API. + +ros2_medkit_gateway: + ros__parameters: + server: + host: "0.0.0.0" + port: 8080 + + refresh_interval_ms: 2000 + + # CORS so an external Foxglove panel or browser can hit the API. + cors: + allowed_origins: ["*"] + allowed_methods: ["GET", "PUT", "POST", "DELETE", "OPTIONS"] + allowed_headers: ["Content-Type", "Accept"] + allow_credentials: false + max_age_seconds: 86400 + + discovery: + mode: "runtime_only" + + # Enable /updates endpoints; provider supplied by ota_update_plugin below. 
+ updates: + enabled: true + + plugins: ["ota_update_plugin"] + plugins.ota_update_plugin.path: "/ws/install/ota_update_plugin/lib/ota_update_plugin/ota_update_plugin.so" + plugins.ota_update_plugin.catalog_url: "http://ota_update_server:9000" + plugins.ota_update_plugin.staging_dir: "/tmp/ota_staging" + plugins.ota_update_plugin.install_dir: "/ws/install" From 2f7d8176c6c8f3ddf5ccca3c1c7506c9dcf9e8dd Mon Sep 17 00:00:00 2001 From: Bartosz Burda Date: Sun, 26 Apr 2026 19:09:09 +0200 Subject: [PATCH 26/52] fix(ota_plugin): __has_include compat for older gateway updates/ header path --- .../include/ota_update_plugin/ota_update_plugin.hpp | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/demos/ota_nav2_sensor_fix/ota_update_plugin/include/ota_update_plugin/ota_update_plugin.hpp b/demos/ota_nav2_sensor_fix/ota_update_plugin/include/ota_update_plugin/ota_update_plugin.hpp index adb1b49..7e67c0b 100644 --- a/demos/ota_nav2_sensor_fix/ota_update_plugin/include/ota_update_plugin/ota_update_plugin.hpp +++ b/demos/ota_nav2_sensor_fix/ota_update_plugin/include/ota_update_plugin/ota_update_plugin.hpp @@ -22,7 +22,12 @@ #include #include +// UpdateProvider lives at providers/ in newer gateway revisions and updates/ in older ones. +#if __has_include() #include +#else +#include +#endif namespace ota_update_plugin { From ec5070faa3e9ef5c357b0726e7445ecf5e103db8 Mon Sep 17 00:00:00 2001 From: Bartosz Burda Date: Sun, 26 Apr 2026 19:58:25 +0200 Subject: [PATCH 27/52] fix(ota_plugin): cmdline-based pgrep + UpdateProvider C export + runtime libs in image - ProcessRunner::pgrep now reads /proc//cmdline argv[0] basename instead of /proc//comm (which kernel truncates to 15 chars - 'broken_lidar_node' would never match). - plugin_exports.cpp exports get_update_provider so the gateway's plugin_loader can resolve the UpdateProvider interface across the dlopen boundary without relying on dynamic_cast. 
- Dockerfile.gateway: drop --symlink-install (broke multi-stage COPY) and add runtime libs (libcpp-httplib, libsystemd, nlohmann-json3, lifecycle, test_msgs). - ota_update_server Dockerfile: bake artifacts/ into image (WSL2 + Docker Desktop bind mounts unreliable). - Compose: gateway port configurable via OTA_GATEWAY_PORT (default 8080). Verified via end-to-end smoke against the live stack: - Plugin loads and reports as UpdateProvider - Boot poll registers all 3 catalog entries - Update flow kills broken_lidar_node and spawns fixed_lidar_node --- demos/ota_nav2_sensor_fix/Dockerfile.gateway | 7 +++++- demos/ota_nav2_sensor_fix/docker-compose.yml | 7 +++--- .../ota_update_plugin/src/plugin_exports.cpp | 8 +++++++ .../ota_update_plugin/src/process_runner.cpp | 23 ++++++++++++++----- .../ota_update_server/Dockerfile | 12 ++++++++-- 5 files changed, 44 insertions(+), 13 deletions(-) diff --git a/demos/ota_nav2_sensor_fix/Dockerfile.gateway b/demos/ota_nav2_sensor_fix/Dockerfile.gateway index 01f4b36..e355e74 100644 --- a/demos/ota_nav2_sensor_fix/Dockerfile.gateway +++ b/demos/ota_nav2_sensor_fix/Dockerfile.gateway @@ -45,7 +45,7 @@ WORKDIR /ws RUN apt-get update RUN . 
/opt/ros/jazzy/setup.sh && \ rosdep install --from-paths src --ignore-src -r -y --rosdistro=jazzy && \ - colcon build --symlink-install \ + colcon build \ --cmake-args -DCMAKE_BUILD_TYPE=Release && \ rm -rf /var/lib/apt/lists/* @@ -54,9 +54,14 @@ FROM ros:jazzy RUN apt-get update && apt-get install -y --no-install-recommends \ ros-jazzy-rclcpp \ + ros-jazzy-rclcpp-lifecycle \ ros-jazzy-sensor-msgs \ ros-jazzy-visualization-msgs \ ros-jazzy-launch-ros \ + ros-jazzy-test-msgs \ + libcpp-httplib-dev \ + libsystemd-dev \ + nlohmann-json3-dev \ curl \ procps \ && rm -rf /var/lib/apt/lists/* diff --git a/demos/ota_nav2_sensor_fix/docker-compose.yml b/demos/ota_nav2_sensor_fix/docker-compose.yml index c1bb262..52c8644 100644 --- a/demos/ota_nav2_sensor_fix/docker-compose.yml +++ b/demos/ota_nav2_sensor_fix/docker-compose.yml @@ -15,7 +15,7 @@ services: container_name: ota_demo_gateway networks: [otanet] ports: - - "8080:8080" + - "${OTA_GATEWAY_PORT:-8080}:8080" environment: ROS_DOMAIN_ID: 42 depends_on: @@ -24,13 +24,12 @@ services: ota_update_server: image: selfpatch/ota_update_server:dev build: - context: ./ota_update_server + context: . 
+ dockerfile: ota_update_server/Dockerfile container_name: ota_demo_update_server networks: [otanet] ports: - "9000:9000" - volumes: - - ./artifacts:/artifacts:ro networks: otanet: diff --git a/demos/ota_nav2_sensor_fix/ota_update_plugin/src/plugin_exports.cpp b/demos/ota_nav2_sensor_fix/ota_update_plugin/src/plugin_exports.cpp index a5916e0..3a8c104 100644 --- a/demos/ota_nav2_sensor_fix/ota_update_plugin/src/plugin_exports.cpp +++ b/demos/ota_nav2_sensor_fix/ota_update_plugin/src/plugin_exports.cpp @@ -23,3 +23,11 @@ extern "C" GATEWAY_PLUGIN_EXPORT int plugin_api_version() { extern "C" GATEWAY_PLUGIN_EXPORT ros2_medkit_gateway::GatewayPlugin * create_plugin() { return new ota_update_plugin::OtaUpdatePlugin(); } + +// Explicit cross-cast so the gateway's plugin_loader can resolve the +// UpdateProvider interface without relying on dynamic_cast across the +// dlopen boundary (which is fragile when typeinfo isn't shared). +extern "C" GATEWAY_PLUGIN_EXPORT ros2_medkit_gateway::UpdateProvider * +get_update_provider(ros2_medkit_gateway::GatewayPlugin * plugin) { + return dynamic_cast(plugin); +} diff --git a/demos/ota_nav2_sensor_fix/ota_update_plugin/src/process_runner.cpp b/demos/ota_nav2_sensor_fix/ota_update_plugin/src/process_runner.cpp index 3568d2c..8ec65e9 100644 --- a/demos/ota_nav2_sensor_fix/ota_update_plugin/src/process_runner.cpp +++ b/demos/ota_nav2_sensor_fix/ota_update_plugin/src/process_runner.cpp @@ -32,14 +32,25 @@ namespace ota_update_plugin { namespace { -std::string proc_comm(int pid) { - std::ifstream f("/proc/" + std::to_string(pid) + "/comm"); +// /proc//comm is truncated to 15 characters by the kernel, which causes +// false negatives for any executable whose basename is longer (e.g. +// "broken_lidar_node" -> "broken_lidar_no"). Read /proc//cmdline +// instead - its first NUL-separated arg holds the full path / argv[0]. 
+std::string proc_cmdline_arg0(int pid) { + std::ifstream f("/proc/" + std::to_string(pid) + "/cmdline", std::ios::binary); if (!f) { return {}; } - std::string line; - std::getline(f, line); - return line; + std::string buf((std::istreambuf_iterator<char>(f)), std::istreambuf_iterator<char>()); + if (buf.empty()) { + return {}; + } + // argv[0] runs to the first NUL. + const auto nul = buf.find('\0'); + std::string arg0 = (nul == std::string::npos) ? buf : buf.substr(0, nul); + // Take the basename so callers pass executable_basename without a path. + const auto slash = arg0.rfind('/'); + return (slash == std::string::npos) ? arg0 : arg0.substr(slash + 1); } bool is_pid_dir(const char * name) { @@ -67,7 +78,7 @@ std::vector ProcessRunner::pgrep(const std::string & executable_basename) { if (pid <= 0) { continue; } - if (proc_comm(pid) == executable_basename) { + if (proc_cmdline_arg0(pid) == executable_basename) { out.push_back(pid); } } diff --git a/demos/ota_nav2_sensor_fix/ota_update_server/Dockerfile b/demos/ota_nav2_sensor_fix/ota_update_server/Dockerfile index e5a59af..7bc0599 100644 --- a/demos/ota_nav2_sensor_fix/ota_update_server/Dockerfile +++ b/demos/ota_nav2_sensor_fix/ota_update_server/Dockerfile @@ -1,10 +1,18 @@ FROM python:3.11-slim +# Build context expected to be the demo root (so we can pull in artifacts/) +# rather than ota_update_server/. Compose wires this up. + WORKDIR /app -COPY pyproject.toml ./ -COPY ota_update_server ./ota_update_server +COPY ota_update_server/pyproject.toml ./ +COPY ota_update_server/ota_update_server ./ota_update_server RUN pip install --no-cache-dir . +# Bake the demo catalog + tarballs into the image so the container is +# self-contained. Bind-mounting artifacts/ at runtime is unreliable on +# WSL2 + Docker Desktop, so we ship them in the image instead.
+COPY artifacts /artifacts + ENV OTA_ARTIFACTS_DIR=/artifacts ENV OTA_HOST=0.0.0.0 ENV OTA_PORT=9000 From 80e4af11ef1a09463fbde7edd459afd1a560206c Mon Sep 17 00:00:00 2001 From: Bartosz Burda Date: Sun, 26 Apr 2026 21:29:46 +0200 Subject: [PATCH 28/52] fix(demos/ota): use SOVD spec field names update_name + x_medkit_version pack_artifact.py was emitting 'name' (not in SOVD ISO 17978-3 - spec uses 'update_name') and 'version' (not a SOVD field at all). Spec-compliant clients (ros2_medkit_web_ui, the Foxglove updates panel) expect update_name; vendor-specific data lives under x_medkit_*. Confirmed against the live demo gateway: the web UI happily renders the updated shape, all 3 catalog entries visible end-to-end. --- demos/ota_nav2_sensor_fix/scripts/pack_artifact.py | 10 ++++++++-- .../ota_nav2_sensor_fix/scripts/test_pack_artifact.py | 6 ++++-- 2 files changed, 12 insertions(+), 4 deletions(-) diff --git a/demos/ota_nav2_sensor_fix/scripts/pack_artifact.py b/demos/ota_nav2_sensor_fix/scripts/pack_artifact.py index 2f80f41..fabe8e4 100644 --- a/demos/ota_nav2_sensor_fix/scripts/pack_artifact.py +++ b/demos/ota_nav2_sensor_fix/scripts/pack_artifact.py @@ -95,14 +95,20 @@ def build_entry( ) -> dict: entry: dict = { "id": slug(package, version) if kind != "uninstall" else f"{package}_remove", - "name": f"{package} {version}".strip(), + # SOVD ISO 17978-3 mandates "update_name". Earlier drafts of this + # script wrote "name" - the gateway passes that through to clients + # but spec-compliant consumers (web UI, Foxglove panel) expect + # update_name. + "update_name": f"{package} {version}".strip(), "automated": False, "origins": ["remote"], "notes": notes, "duration": duration, } if version: - entry["version"] = version + # SOVD spec does not define a top-level version field on update + # detail, so we expose it as a vendor extension. 
+ entry["x_medkit_version"] = version if size_bytes > 0: entry["size"] = max(1, size_bytes // 1024) diff --git a/demos/ota_nav2_sensor_fix/scripts/test_pack_artifact.py b/demos/ota_nav2_sensor_fix/scripts/test_pack_artifact.py index fd6b0c8..36d840c 100644 --- a/demos/ota_nav2_sensor_fix/scripts/test_pack_artifact.py +++ b/demos/ota_nav2_sensor_fix/scripts/test_pack_artifact.py @@ -57,8 +57,10 @@ def test_build_entry_update_kind(): size_bytes=2048, ) assert entry["id"] == "fixed_lidar_2_1_0" - assert entry["name"] == "fixed_lidar 2.1.0" - assert entry["version"] == "2.1.0" + assert entry["update_name"] == "fixed_lidar 2.1.0" + assert "name" not in entry, "use update_name (SOVD spec) not name" + assert entry["x_medkit_version"] == "2.1.0" + assert "version" not in entry, "version is not a SOVD field; use x_medkit_version" assert entry["automated"] is False assert entry["origins"] == ["remote"] assert entry["notes"] == "fix noise" From 8f48af129507f537f30b4952654f928da230ff59 Mon Sep 17 00:00:00 2001 From: Bartosz Burda Date: Sun, 26 Apr 2026 21:33:08 +0200 Subject: [PATCH 29/52] test(demos/ota): committable Playwright e2e smoke driving web UI against gateway Verifies the canonical SOVD client flow that the Foxglove updates panel mirrors: connect form, /api/v1/updates returns {items: []}, per-id /status calls, all 3 catalog entries render in the dashboard. --- .../scripts/e2e_webui_smoke.mjs | 97 +++++++++++++++++++ 1 file changed, 97 insertions(+) create mode 100644 demos/ota_nav2_sensor_fix/scripts/e2e_webui_smoke.mjs diff --git a/demos/ota_nav2_sensor_fix/scripts/e2e_webui_smoke.mjs b/demos/ota_nav2_sensor_fix/scripts/e2e_webui_smoke.mjs new file mode 100644 index 0000000..1937ffc --- /dev/null +++ b/demos/ota_nav2_sensor_fix/scripts/e2e_webui_smoke.mjs @@ -0,0 +1,97 @@ +// E2E smoke driver for the OTA demo. 
Drives ros2_medkit_web_ui (running +// at WEB_UI_URL) against the live demo gateway (GATEWAY_URL) and asserts +// that all 3 catalog entries register and that the SOVD wire format +// matches what we ship from pack_artifact.py. +// +// Why this exists: the Foxglove updates panel mirrors the same SOVD client +// patterns the web UI uses (fetchUpdateIds parses {items: [...]}, +// per-id /status, lazy /detail). Verifying the web UI flow end-to-end +// gives us a canonical reference point for both clients. +// +// Usage: +// docker compose up -d +// cd /path/to/ros2_medkit_web_ui && npm install && npm run dev +// GATEWAY_URL=http://localhost:8080 \ +// WEB_UI_URL=http://localhost:5173 \ +// node /path/to/this/e2e_webui_smoke.mjs +// +// Requires: playwright (`npm install --no-save playwright` in the web UI +// dir), chromium-headless-shell (`npx playwright install +// chromium-headless-shell`), the demo stack from ../docker-compose.yml. + +import { chromium } from "playwright"; + +const WEB_UI_URL = process.env.WEB_UI_URL ?? "http://localhost:5173/"; +const GATEWAY_URL = process.env.GATEWAY_URL ?? 
"http://localhost:8080"; + +const EXPECTED_IDS = [ + "fixed_lidar_2_1_0", + "obstacle_classifier_v2_1_0_0", + "broken_lidar_legacy_remove", +]; + +const EXPECTED_API_PATHS = [ + "/api/v1/updates", + "/api/v1/updates/fixed_lidar_2_1_0/status", + "/api/v1/updates/obstacle_classifier_v2_1_0_0/status", + "/api/v1/updates/broken_lidar_legacy_remove/status", +]; + +(async () => { + const browser = await chromium.launch({ + channel: "chromium-headless-shell", + headless: true, + }); + const ctx = await browser.newContext(); + const page = await ctx.newPage(); + + page.on("pageerror", (err) => console.log(`[pageerror] ${err.message}`)); + + const apiCalls = new Set(); + page.on("request", (req) => { + const u = req.url(); + if (u.includes("/api/v1/")) { + apiCalls.add(`${req.method()} ${u}`); + } + }); + + await page.goto(WEB_UI_URL, { waitUntil: "domcontentloaded" }); + + await page.getByRole("button", { name: /connect to server/i }).click(); + await page.waitForTimeout(300); + await page.locator('input[type="text"], input:not([type])').first().fill(GATEWAY_URL); + await page.getByRole("button", { name: /^connect$/i }).last().click(); + await page.waitForTimeout(2000); + + const updatesButton = page.getByRole("button", { name: /updates/i }).first(); + if (await updatesButton.count()) { + await updatesButton.click(); + await page.waitForTimeout(2000); + } + + const bodyText = await page.locator("body").textContent(); + + let failed = 0; + for (const id of EXPECTED_IDS) { + const visible = bodyText?.includes(id) ?? false; + console.log(` id ${id}: ${visible ? "PASS" : "FAIL"}`); + if (!visible) failed++; + } + + for (const path of EXPECTED_API_PATHS) { + const hit = [...apiCalls].some((c) => c.endsWith(path)); + console.log(` api ${path}: ${hit ? 
"PASS" : "FAIL"}`); + if (!hit) failed++; + } + + await browser.close(); + + if (failed > 0) { + console.error(`\n${failed} assertion(s) failed`); + process.exit(1); + } + console.log("\nDONE: all SOVD flows verified"); +})().catch((err) => { + console.error("FAIL:", err); + process.exit(1); +}); From a1726c401b97236093aa7c24ce73f3ff5f72dca9 Mon Sep 17 00:00:00 2001 From: Bartosz Burda Date: Mon, 27 Apr 2026 09:52:27 +0200 Subject: [PATCH 30/52] feat(demos/ota): run-demo / stop-demo / check-demo / trigger-* scripts Adopt the same script convention as sensor_diagnostics, multi_ecu_aggregation, and turtlebot3_integration: ./run-demo.sh build artifacts + bring up gateway + nodes + update server (daemon mode by default, --attached for fg) ./stop-demo.sh tear down (-v removes volumes, --images removes built images) ./check-demo.sh show registered updates + per-id status + live plugin-managed processes inside the gateway container ./trigger-update.sh broken_lidar -> fixed_lidar (the headline) ./trigger-install.sh install obstacle_classifier_v2 from scratch ./trigger-uninstall.sh remove broken_lidar_legacy OTA_GATEWAY_PORT (or OTA_GATEWAY_URL for full override) lets the user sidestep collisions with another gateway on host port 8080. README quickstart updated to point at run-demo.sh. 
--- demos/ota_nav2_sensor_fix/README.md | 32 ++-- demos/ota_nav2_sensor_fix/check-demo.sh | 56 +++++++ demos/ota_nav2_sensor_fix/run-demo.sh | 147 ++++++++++++++++++ demos/ota_nav2_sensor_fix/stop-demo.sh | 40 +++++ demos/ota_nav2_sensor_fix/trigger-install.sh | 35 +++++ .../ota_nav2_sensor_fix/trigger-uninstall.sh | 37 +++++ demos/ota_nav2_sensor_fix/trigger-update.sh | 36 +++++ 7 files changed, 365 insertions(+), 18 deletions(-) create mode 100755 demos/ota_nav2_sensor_fix/check-demo.sh create mode 100755 demos/ota_nav2_sensor_fix/run-demo.sh create mode 100755 demos/ota_nav2_sensor_fix/stop-demo.sh create mode 100755 demos/ota_nav2_sensor_fix/trigger-install.sh create mode 100755 demos/ota_nav2_sensor_fix/trigger-uninstall.sh create mode 100755 demos/ota_nav2_sensor_fix/trigger-update.sh diff --git a/demos/ota_nav2_sensor_fix/README.md b/demos/ota_nav2_sensor_fix/README.md index 77e7ecf..940a800 100644 --- a/demos/ota_nav2_sensor_fix/README.md +++ b/demos/ota_nav2_sensor_fix/README.md @@ -22,31 +22,27 @@ update package metadata. ## Quickstart ```bash -# 1. Build artifacts (compiles fixed_lidar + obstacle_classifier_v2, -# generates catalog.json + tarballs). -./scripts/build_artifacts.sh - -# 2. Start the stack (gateway + plugin + demo nodes + update server). -docker compose up --build +# Build artifacts + start gateway, plugin, demo nodes, update server. +./run-demo.sh ``` +The first run pulls `ros:jazzy` and builds the gateway from source - takes +~10 minutes. Subsequent runs reuse the layer cache. + In another terminal, drive the demo: ```bash -# List the registered updates. -curl -s http://localhost:8080/api/v1/updates | jq '.[].id' - -# Run an update: prepare downloads the artifact, execute swaps + restarts. 
-curl -X PUT http://localhost:8080/api/v1/updates/fixed_lidar_2_1_0/prepare -curl -X PUT http://localhost:8080/api/v1/updates/fixed_lidar_2_1_0/execute +./check-demo.sh # show registered updates + live process state +./trigger-update.sh # broken_lidar -> fixed_lidar (the headline scene) +./trigger-install.sh # install obstacle_classifier_v2 from scratch +./trigger-uninstall.sh # remove broken_lidar_legacy +./stop-demo.sh # tear down +``` -# Install a new package. -curl -X PUT http://localhost:8080/api/v1/updates/obstacle_classifier_v2_1_0_0/prepare -curl -X PUT http://localhost:8080/api/v1/updates/obstacle_classifier_v2_1_0_0/execute +Each trigger script issues SOVD `PUT /updates/{id}/prepare` then `/execute` +and prints the resulting status plus the live process list. -# Uninstall a deprecated one. -curl -X PUT http://localhost:8080/api/v1/updates/broken_lidar_legacy_remove/execute -``` +If host port 8080 is taken, override with `OTA_GATEWAY_PORT=8081 ./run-demo.sh`. Tear down: `docker compose down`. diff --git a/demos/ota_nav2_sensor_fix/check-demo.sh b/demos/ota_nav2_sensor_fix/check-demo.sh new file mode 100755 index 0000000..40e49fa --- /dev/null +++ b/demos/ota_nav2_sensor_fix/check-demo.sh @@ -0,0 +1,56 @@ +#!/bin/bash +# Show the live state of the OTA demo: registered updates, per-update +# status, and the demo node processes the plugin manages inside the +# gateway container. + +set -eu + +GATEWAY_URL="${OTA_GATEWAY_URL:-http://localhost:${OTA_GATEWAY_PORT:-8080}}" +API="${GATEWAY_URL}/api/v1" + +if ! command -v curl >/dev/null 2>&1; then + echo "curl is required" + exit 1 +fi + +if ! curl -fsS "${API}/health" >/dev/null 2>&1; then + echo "Gateway not reachable at ${GATEWAY_URL}. 
Start it with: ./run-demo.sh" + exit 1 +fi + +JQ_AVAILABLE="false" +if command -v jq >/dev/null 2>&1; then + JQ_AVAILABLE="true" +fi + +echo "Gateway: ${GATEWAY_URL}" +echo "Health: $(curl -fsS "${API}/health" | head -c 200)" +echo "" + +echo "Registered updates (GET /updates):" +if [[ "$JQ_AVAILABLE" == "true" ]]; then + curl -fsS "${API}/updates" | jq -r '.items[]' | sed 's/^/ /' +else + curl -fsS "${API}/updates" +fi +echo "" + +echo "Per-update status (GET /updates/{id}/status):" +if [[ "$JQ_AVAILABLE" == "true" ]]; then + for id in $(curl -fsS "${API}/updates" | jq -r '.items[]'); do + status=$(curl -fsS "${API}/updates/${id}/status" 2>/dev/null || echo '{"status":""}') + echo " ${id}: $(echo "$status" | jq -c '{status, progress}')" + done +else + echo " (install jq for detail)" +fi +echo "" + +echo "Plugin-managed processes inside gateway container:" +if docker ps --format '{{.Names}}' | grep -q '^ota_demo_gateway$'; then + docker exec ota_demo_gateway pgrep -af \ + 'broken_lidar_node|fixed_lidar_node|broken_lidar_legacy|obstacle_classifier' \ + 2>/dev/null | grep -v 'pgrep' | sed 's/^/ /' || echo " (none)" +else + echo " ota_demo_gateway container not running" +fi diff --git a/demos/ota_nav2_sensor_fix/run-demo.sh b/demos/ota_nav2_sensor_fix/run-demo.sh new file mode 100755 index 0000000..1989ab9 --- /dev/null +++ b/demos/ota_nav2_sensor_fix/run-demo.sh @@ -0,0 +1,147 @@ +#!/bin/bash +# OTA over SOVD - nav2 sensor-fix demo runner. +# Brings up the gateway (with the dev-grade ota_update_plugin baked in) and +# the FastAPI artifact server. The gateway entrypoint also launches +# broken_lidar (publishes /scan with a phantom obstacle) and +# broken_lidar_legacy (uninstall target). 
+ +set -eu + +SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)" +cd "$SCRIPT_DIR" + +DETACH_MODE="true" +UPDATE_IMAGES="false" +BUILD_ARGS="" +SKIP_ARTIFACTS="false" + +usage() { + echo "Usage: $0 [OPTIONS]" + echo "" + echo "Options:" + echo " --attached Run in foreground (default: daemon mode)" + echo " --update Pull latest images before running" + echo " --no-cache Build Docker images without cache" + echo " --skip-artifacts Skip rebuilding artifacts/catalog.json" + echo " -h, --help Show this help message" + echo "" + echo "Environment:" + echo " OTA_GATEWAY_PORT Host port for gateway HTTP API (default: 8080)" + echo "" + echo "Examples:" + echo " $0 # Daemon mode (default)" + echo " $0 --attached # Foreground with logs" + echo " OTA_GATEWAY_PORT=8081 $0 # Use a different host port" +} + +while [[ $# -gt 0 ]]; do + case "$1" in + --attached) DETACH_MODE="false" ;; + --update) UPDATE_IMAGES="true" ;; + --no-cache) BUILD_ARGS="--no-cache" ;; + --skip-artifacts) SKIP_ARTIFACTS="true" ;; + -h|--help) usage; exit 0 ;; + *) echo "Unknown option: $1"; usage; exit 1 ;; + esac + shift +done + +GATEWAY_PORT="${OTA_GATEWAY_PORT:-8080}" +GATEWAY_URL="http://localhost:${GATEWAY_PORT}" + +echo "OTA over SOVD - nav2 sensor-fix demo" +echo "====================================" +echo "" + +if ! command -v docker &> /dev/null; then + echo "Error: Docker is not installed" + exit 1 +fi + +if [[ "$SKIP_ARTIFACTS" != "true" ]]; then + if [[ ! -x "$SCRIPT_DIR/scripts/build_artifacts.sh" ]]; then + chmod +x "$SCRIPT_DIR/scripts/build_artifacts.sh" + fi + echo "[1/3] Building OTA artifacts (catalog.json + tarballs)..." + "$SCRIPT_DIR/scripts/build_artifacts.sh" + echo "" +fi + +if docker compose version &> /dev/null; then + COMPOSE_CMD="docker compose" +else + COMPOSE_CMD="docker-compose" +fi + +if [[ "$UPDATE_IMAGES" == "true" ]]; then + echo "Pulling latest images..." + ${COMPOSE_CMD} pull +fi + +echo "[2/3] Building and starting demo..." 
+echo " (First run pulls ros:jazzy and builds the gateway, ~10 minutes)" +echo "" + +DETACH_FLAG="" +if [[ "$DETACH_MODE" == "true" ]]; then + DETACH_FLAG="-d" +fi + +# shellcheck disable=SC2086 +if ! ${COMPOSE_CMD} build ${BUILD_ARGS}; then + echo "Docker build failed. Stopping any partially created containers..." + ${COMPOSE_CMD} down 2>/dev/null || true + exit 1 +fi + +# shellcheck disable=SC2086 +${COMPOSE_CMD} up ${DETACH_FLAG} + +if [[ "$DETACH_MODE" != "true" ]]; then + exit 0 +fi + +echo "" +echo "[3/3] Waiting for gateway to come up..." +for i in 1 2 3 4 5 6 7 8 9 10 11 12; do + if curl -fsS "${GATEWAY_URL}/api/v1/health" >/dev/null 2>&1; then + break + fi + sleep 2 +done + +if ! curl -fsS "${GATEWAY_URL}/api/v1/health" >/dev/null 2>&1; then + echo "Gateway did not respond on ${GATEWAY_URL} - check logs with:" + echo " ${COMPOSE_CMD} logs gateway" + exit 1 +fi + +echo "" +echo "Demo is up." +echo "" +echo " Gateway HTTP API: ${GATEWAY_URL}/api/v1/" +echo " Update server: http://localhost:9000/catalog" +echo "" +echo "Registered updates:" +if command -v jq >/dev/null 2>&1; then + curl -fsS "${GATEWAY_URL}/api/v1/updates" | jq -r '.items[]' | sed 's/^/ /' +else + curl -fsS "${GATEWAY_URL}/api/v1/updates" +fi +echo "" +echo "Drive the demo:" +echo " ./check-demo.sh # show current state" +echo " ./trigger-update.sh # update broken_lidar -> fixed_lidar" +echo " ./trigger-install.sh # install obstacle_classifier_v2" +echo " ./trigger-uninstall.sh # uninstall broken_lidar_legacy" +echo " ./stop-demo.sh # tear down" +echo "" +echo "Connect a UI:" +echo " Web UI (ros2_medkit_web_ui):" +echo " npm install && npm run dev" +echo " open http://localhost:5173 -> Connect -> ${GATEWAY_URL}" +echo "" +echo " Foxglove Studio (ros2_medkit_foxglove_extension):" +echo " cd ros2_medkit_foxglove_extension && npm install && npm run local-install" +echo " Open Foxglove -> add panel 'ros2_medkit Updates'" +echo " Set baseUrl in panel settings to ${GATEWAY_URL}/api/v1" diff --git 
a/demos/ota_nav2_sensor_fix/stop-demo.sh b/demos/ota_nav2_sensor_fix/stop-demo.sh new file mode 100755 index 0000000..1233d63 --- /dev/null +++ b/demos/ota_nav2_sensor_fix/stop-demo.sh @@ -0,0 +1,40 @@ +#!/bin/bash +# Stop the OTA over SOVD - nav2 sensor-fix demo. + +set -eu + +SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)" +cd "$SCRIPT_DIR" + +REMOVE_VOLUMES="" +REMOVE_IMAGES="" + +usage() { + echo "Usage: $0 [OPTIONS]" + echo "" + echo "Options:" + echo " -v, --volumes Remove named volumes" + echo " --images Remove built images" + echo " -h, --help Show this help message" +} + +while [[ $# -gt 0 ]]; do + case "$1" in + -v|--volumes) REMOVE_VOLUMES="-v" ;; + --images) REMOVE_IMAGES="--rmi local" ;; + -h|--help) usage; exit 0 ;; + *) echo "Unknown option: $1"; usage; exit 1 ;; + esac + shift +done + +if docker compose version &> /dev/null; then + COMPOSE_CMD="docker compose" +else + COMPOSE_CMD="docker-compose" +fi + +# shellcheck disable=SC2086 +${COMPOSE_CMD} down ${REMOVE_VOLUMES} ${REMOVE_IMAGES} +echo "" +echo "Demo stopped." diff --git a/demos/ota_nav2_sensor_fix/trigger-install.sh b/demos/ota_nav2_sensor_fix/trigger-install.sh new file mode 100755 index 0000000..d4409b0 --- /dev/null +++ b/demos/ota_nav2_sensor_fix/trigger-install.sh @@ -0,0 +1,35 @@ +#!/bin/bash +# Trigger the SOVD install flow: deploy obstacle_classifier_v2 from scratch. + +set -eu + +GATEWAY_URL="${OTA_GATEWAY_URL:-http://localhost:${OTA_GATEWAY_PORT:-8080}}" +API="${GATEWAY_URL}/api/v1" +ID="obstacle_classifier_v2_1_0_0" + +if ! curl -fsS "${API}/health" >/dev/null 2>&1; then + echo "Gateway not reachable at ${GATEWAY_URL}. 
Start it with: ./run-demo.sh" + exit 1 +fi + +echo "Install: ${ID}" +echo " PUT /updates/${ID}/prepare" +curl -fsS -X PUT -H 'Content-Type: application/json' -d '{}' \ + "${API}/updates/${ID}/prepare" >/dev/null +sleep 3 + +echo " PUT /updates/${ID}/execute" +curl -fsS -X PUT -H 'Content-Type: application/json' -d '{}' \ + "${API}/updates/${ID}/execute" >/dev/null +sleep 5 + +echo "" +echo "Status after execute:" +curl -fsS "${API}/updates/${ID}/status" | (jq . 2>/dev/null || cat) + +if docker ps --format '{{.Names}}' | grep -q '^ota_demo_gateway$'; then + echo "" + echo "Live processes:" + docker exec ota_demo_gateway pgrep -af 'obstacle_classifier' \ + 2>/dev/null | grep -v 'pgrep' | sed 's/^/ /' || echo " (none)" +fi diff --git a/demos/ota_nav2_sensor_fix/trigger-uninstall.sh b/demos/ota_nav2_sensor_fix/trigger-uninstall.sh new file mode 100755 index 0000000..0f7c8fa --- /dev/null +++ b/demos/ota_nav2_sensor_fix/trigger-uninstall.sh @@ -0,0 +1,37 @@ +#!/bin/bash +# Trigger the SOVD uninstall flow: remove broken_lidar_legacy. + +set -eu + +GATEWAY_URL="${OTA_GATEWAY_URL:-http://localhost:${OTA_GATEWAY_PORT:-8080}}" +API="${GATEWAY_URL}/api/v1" +ID="broken_lidar_legacy_remove" + +if ! curl -fsS "${API}/health" >/dev/null 2>&1; then + echo "Gateway not reachable at ${GATEWAY_URL}. Start it with: ./run-demo.sh" + exit 1 +fi + +echo "Uninstall: ${ID}" +# Uninstall has no artifact to fetch but the gateway state machine still +# needs prepare->execute to advance. +echo " PUT /updates/${ID}/prepare" +curl -fsS -X PUT -H 'Content-Type: application/json' -d '{}' \ + "${API}/updates/${ID}/prepare" >/dev/null +sleep 2 + +echo " PUT /updates/${ID}/execute" +curl -fsS -X PUT -H 'Content-Type: application/json' -d '{}' \ + "${API}/updates/${ID}/execute" >/dev/null +sleep 4 + +echo "" +echo "Status after execute:" +curl -fsS "${API}/updates/${ID}/status" | (jq . 
2>/dev/null || cat) + +if docker ps --format '{{.Names}}' | grep -q '^ota_demo_gateway$'; then + echo "" + echo "Live processes (broken_lidar_legacy should be gone):" + docker exec ota_demo_gateway pgrep -af 'broken_lidar_legacy' \ + 2>/dev/null | grep -v 'pgrep' | sed 's/^/ /' || echo " (none - uninstall succeeded)" +fi diff --git a/demos/ota_nav2_sensor_fix/trigger-update.sh b/demos/ota_nav2_sensor_fix/trigger-update.sh new file mode 100755 index 0000000..5f9668a --- /dev/null +++ b/demos/ota_nav2_sensor_fix/trigger-update.sh @@ -0,0 +1,36 @@ +#!/bin/bash +# Trigger the SOVD update flow: replace broken_lidar with fixed_lidar. +# Uses spec endpoints PUT /updates/{id}/prepare then PUT /updates/{id}/execute. + +set -eu + +GATEWAY_URL="${OTA_GATEWAY_URL:-http://localhost:${OTA_GATEWAY_PORT:-8080}}" +API="${GATEWAY_URL}/api/v1" +ID="fixed_lidar_2_1_0" + +if ! curl -fsS "${API}/health" >/dev/null 2>&1; then + echo "Gateway not reachable at ${GATEWAY_URL}. Start it with: ./run-demo.sh" + exit 1 +fi + +echo "Update: ${ID}" +echo " PUT /updates/${ID}/prepare" +curl -fsS -X PUT -H 'Content-Type: application/json' -d '{}' \ + "${API}/updates/${ID}/prepare" >/dev/null +sleep 3 + +echo " PUT /updates/${ID}/execute" +curl -fsS -X PUT -H 'Content-Type: application/json' -d '{}' \ + "${API}/updates/${ID}/execute" >/dev/null +sleep 5 + +echo "" +echo "Status after execute:" +curl -fsS "${API}/updates/${ID}/status" | (jq . 
2>/dev/null || cat) + +if docker ps --format '{{.Names}}' | grep -q '^ota_demo_gateway$'; then + echo "" + echo "Live processes:" + docker exec ota_demo_gateway pgrep -af 'broken_lidar_node|fixed_lidar_node' \ + 2>/dev/null | grep -v 'pgrep' | sed 's/^/ /' || true +fi From bd7a54f8f8dad187ebd1fe07eddc0d60bf1e7d64 Mon Sep 17 00:00:00 2001 From: Bartosz Burda Date: Mon, 27 Apr 2026 10:28:43 +0200 Subject: [PATCH 31/52] test(demos/ota): smoke test + CI job mirroring sensor_diagnostics pattern tests/smoke_test_ota.sh asserts: - gateway /health 200 - gateway log says 'Update backend provided by plugin' (no 'no provider' warn) - GET /updates returns SOVD {items: []} envelope with all 3 catalog ids - GET /updates/{id} detail uses spec field names: update_name (not 'name'), x_medkit_version (not bare 'version'), updated_components for kind, x_medkit_replaces_executable threaded through pack_artifact - update flow: PUT prepare + execute kills broken_lidar_node and spawns fixed_lidar_node inside the gateway container - install flow: spawns obstacle_classifier_node ci.yml gets a build-and-test-ota job following the same shape as the other per-demo jobs: checkout -> install Python + ROS Jazzy on the runner -> build_artifacts.sh -> docker compose up -d --build -> run smoke -> log dumps on failure -> teardown. 
--- .github/workflows/ci.yml | 68 ++++++++++++++ tests/smoke_test_ota.sh | 190 +++++++++++++++++++++++++++++++++++++++ 2 files changed, 258 insertions(+) create mode 100755 tests/smoke_test_ota.sh diff --git a/.github/workflows/ci.yml b/.github/workflows/ci.yml index 588666e..0a009f7 100644 --- a/.github/workflows/ci.yml +++ b/.github/workflows/ci.yml @@ -163,3 +163,71 @@ jobs: if: always() working-directory: demos/multi_ecu_aggregation run: docker compose --profile ci down + + build-and-test-ota: + needs: lint + runs-on: ubuntu-24.04 + steps: + - name: Show triggering source + if: github.event_name == 'repository_dispatch' + run: | + SHA="${{ github.event.client_payload.sha }}" + RUN_URL="${{ github.event.client_payload.run_url }}" + echo "## Triggered by ros2_medkit" >> "$GITHUB_STEP_SUMMARY" + echo "- Commit: \`${SHA:-unknown}\`" >> "$GITHUB_STEP_SUMMARY" + if [ -n "$RUN_URL" ]; then + echo "- Run: [View triggering run]($RUN_URL)" >> "$GITHUB_STEP_SUMMARY" + else + echo "- Run: (URL not provided)" >> "$GITHUB_STEP_SUMMARY" + fi + + - name: Checkout repository + uses: actions/checkout@v4 + + - name: Set up Python 3.11 + uses: actions/setup-python@v5 + with: + python-version: "3.11" + + - name: Install pack_artifact dev deps + working-directory: demos/ota_nav2_sensor_fix/scripts + run: | + python -m venv .venv + .venv/bin/pip install --upgrade pip + .venv/bin/pip install pytest + + - name: Set up ROS 2 Jazzy (host) for build_artifacts.sh + run: | + sudo apt-get update + sudo apt-get install -y software-properties-common curl + sudo add-apt-repository universe -y + sudo curl -sSL https://raw.githubusercontent.com/ros/rosdistro/master/ros.key -o /usr/share/keyrings/ros-archive-keyring.gpg + echo "deb [arch=$(dpkg --print-architecture) signed-by=/usr/share/keyrings/ros-archive-keyring.gpg] http://packages.ros.org/ros2/ubuntu $(. 
/etc/os-release && echo $UBUNTU_CODENAME) main" | sudo tee /etc/apt/sources.list.d/ros2.list >/dev/null + sudo apt-get update + sudo apt-get install -y ros-jazzy-ros-base python3-colcon-common-extensions + + - name: Build artifacts (catalog + tarballs) + working-directory: demos/ota_nav2_sensor_fix + run: ./scripts/build_artifacts.sh + + - name: Build and start OTA demo + working-directory: demos/ota_nav2_sensor_fix + run: docker compose up -d --build + + - name: Run smoke tests + run: ./tests/smoke_test_ota.sh + + - name: Show gateway logs on failure + if: failure() + working-directory: demos/ota_nav2_sensor_fix + run: docker compose logs gateway --tail=200 + + - name: Show update server logs on failure + if: failure() + working-directory: demos/ota_nav2_sensor_fix + run: docker compose logs ota_update_server --tail=200 + + - name: Teardown + if: always() + working-directory: demos/ota_nav2_sensor_fix + run: docker compose down diff --git a/tests/smoke_test_ota.sh b/tests/smoke_test_ota.sh new file mode 100755 index 0000000..1272c94 --- /dev/null +++ b/tests/smoke_test_ota.sh @@ -0,0 +1,190 @@ +#!/bin/bash +# Smoke tests for the ota_nav2_sensor_fix demo. 
+# Runs from the host against the gateway on localhost:8080 and asserts: +# - the gateway loads our ota_update_plugin as the UpdateProvider +# - the SOVD catalog is registered with the 3 expected entries +# - the update detail uses spec field names (update_name, no `name`/`version`) +# - the update flow actually swaps broken_lidar_node for fixed_lidar_node +# inside the gateway container +# +# Usage: ./tests/smoke_test_ota.sh [GATEWAY_URL] +# Default GATEWAY_URL: http://localhost:8080 + +GATEWAY_URL="${1:-http://localhost:8080}" +# shellcheck disable=SC2034 # Used by smoke_lib.sh +API_BASE="${GATEWAY_URL}/api/v1" + +SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)" +# shellcheck source=tests/smoke_lib.sh +source "${SCRIPT_DIR}/smoke_lib.sh" + +trap print_summary EXIT + +GATEWAY_CONTAINER="${OTA_DEMO_GATEWAY_CONTAINER:-ota_demo_gateway}" + +EXPECTED_IDS=( + "fixed_lidar_2_1_0" + "obstacle_classifier_v2_1_0_0" + "broken_lidar_legacy_remove" +) + +# Confirm a process is or is not running inside the gateway container. +# Usage: assert_process_running <pattern> <desc> +# assert_process_gone <pattern> <desc> +assert_process_running() { + local pattern="$1" + local desc="$2" + if docker exec "$GATEWAY_CONTAINER" pgrep -f "$pattern" >/dev/null 2>&1; then + pass "$desc" + else + fail "$desc" "no process matching '$pattern' in $GATEWAY_CONTAINER" + fi +} + +assert_process_gone() { + local pattern="$1" + local desc="$2" + if ! docker exec "$GATEWAY_CONTAINER" pgrep -f "$pattern" >/dev/null 2>&1; then + pass "$desc" + else + fail "$desc" "process matching '$pattern' still alive in $GATEWAY_CONTAINER" + fi +} + +# --- Wait for gateway startup --- + +wait_for_gateway 90 + +# Plugin's boot poll fetches /catalog and registers entries; wait for it. +echo " Waiting for plugin's boot poll to register catalog (max 30s)..." +if poll_until "/updates" '.items[] | select(.
== "fixed_lidar_2_1_0")' 30; then + echo " Catalog registered" +else + echo " Catalog NOT registered within 30s" + exit 1 +fi + +# --- Tests --- + +section "Health" + +if api_get "/health"; then + pass "GET /health returns 200" +else + fail "GET /health returns 200" "unexpected status code" +fi + +section "UpdateProvider plugin loaded" + +if docker logs "$GATEWAY_CONTAINER" 2>&1 | grep -q "Update backend provided by plugin"; then + pass "gateway log says: 'Update backend provided by plugin'" +else + fail "gateway log says: 'Update backend provided by plugin'" "log line missing" +fi + +if docker logs "$GATEWAY_CONTAINER" 2>&1 | grep -q "Updates enabled but no UpdateProvider plugin loaded"; then + fail "no 'no UpdateProvider' warning" "warning was logged" +else + pass "no 'no UpdateProvider' warning" +fi + +section "Catalog (GET /updates returns SOVD {items})" + +if api_get "/updates"; then + pass "GET /updates returns 200" +else + fail "GET /updates returns 200" "unexpected status code" +fi + +if echo "$RESPONSE" | jq -e '.items | type == "array"' >/dev/null 2>&1; then + pass "/updates response has items array" +else + fail "/updates response has items array" "envelope mismatch (SOVD spec violation)" +fi + +for id in "${EXPECTED_IDS[@]}"; do + if echo "$RESPONSE" | jq -e --arg id "$id" '.items[] | select(. 
== $id)' >/dev/null 2>&1; then + pass "/updates contains '$id'" + else + fail "/updates contains '$id'" "id missing" + fi +done + +section "Detail field shape (SOVD ISO 17978-3 compliance)" + +# fixed_lidar update detail: must use spec field names +if api_get "/updates/fixed_lidar_2_1_0"; then + pass "GET /updates/fixed_lidar_2_1_0 returns 200" + + if echo "$RESPONSE" | jq -e '.update_name' >/dev/null 2>&1; then + pass "detail has update_name (SOVD spec)" + else + fail "detail has update_name (SOVD spec)" "field missing - spec violation" + fi + + if echo "$RESPONSE" | jq -e '.name' >/dev/null 2>&1; then + fail "detail does NOT have 'name'" "found 'name' instead of 'update_name'" + else + pass "detail does NOT have 'name'" + fi + + if echo "$RESPONSE" | jq -e '.version' >/dev/null 2>&1; then + fail "detail does NOT have plain 'version'" "should be x_medkit_version (vendor extension)" + else + pass "detail does NOT have plain 'version'" + fi + + if echo "$RESPONSE" | jq -e '.x_medkit_version == "2.1.0"' >/dev/null 2>&1; then + pass "detail has x_medkit_version = 2.1.0" + else + fail "detail has x_medkit_version = 2.1.0" "field missing or wrong value" + fi + + if echo "$RESPONSE" | jq -e '.updated_components | index("scan_sensor_node")' >/dev/null 2>&1; then + pass "detail has updated_components: ['scan_sensor_node']" + else + fail "detail has updated_components: ['scan_sensor_node']" "kind metadata missing" + fi + + if echo "$RESPONSE" | jq -e '.x_medkit_replaces_executable == "broken_lidar_node"' >/dev/null 2>&1; then + pass "detail has x_medkit_replaces_executable = broken_lidar_node" + else + fail "detail has x_medkit_replaces_executable" "field missing" + fi +fi + +section "Initial process state" + +assert_process_running "/lib/broken_lidar/broken_lidar_node" "broken_lidar_node running before update" +assert_process_running "broken_lidar_legacy" "broken_lidar_legacy running before uninstall" + +section "Update flow: PUT /updates/fixed_lidar_2_1_0/prepare + 
/execute" + +curl -fsS -X PUT -H 'Content-Type: application/json' -d '{}' \ + "${API_BASE}/updates/fixed_lidar_2_1_0/prepare" >/dev/null +sleep 4 +curl -fsS -X PUT -H 'Content-Type: application/json' -d '{}' \ + "${API_BASE}/updates/fixed_lidar_2_1_0/execute" >/dev/null +sleep 6 + +if api_get "/updates/fixed_lidar_2_1_0/status"; then + if echo "$RESPONSE" | jq -e '.status == "completed"' >/dev/null 2>&1; then + pass "fixed_lidar_2_1_0 status is completed" + else + fail "fixed_lidar_2_1_0 status is completed" "got $(echo "$RESPONSE" | jq -c .)" + fi +fi + +assert_process_gone "/lib/broken_lidar/broken_lidar_node" "broken_lidar_node killed after update" +assert_process_running "/lib/fixed_lidar/fixed_lidar_node" "fixed_lidar_node spawned after update" + +section "Install flow: PUT /updates/obstacle_classifier_v2_1_0_0/prepare + /execute" + +curl -fsS -X PUT -H 'Content-Type: application/json' -d '{}' \ + "${API_BASE}/updates/obstacle_classifier_v2_1_0_0/prepare" >/dev/null +sleep 4 +curl -fsS -X PUT -H 'Content-Type: application/json' -d '{}' \ + "${API_BASE}/updates/obstacle_classifier_v2_1_0_0/execute" >/dev/null +sleep 5 + +assert_process_running "obstacle_classifier_node" "obstacle_classifier_node spawned after install" From 5baa086e3e1fb64185b666b48363ea4999c3387c Mon Sep 17 00:00:00 2001 From: Bartosz Burda Date: Mon, 27 Apr 2026 10:52:31 +0200 Subject: [PATCH 32/52] fix(demos/ota): rename unused loop var in run-demo to satisfy shellcheck SC2034 --- demos/ota_nav2_sensor_fix/run-demo.sh | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/demos/ota_nav2_sensor_fix/run-demo.sh b/demos/ota_nav2_sensor_fix/run-demo.sh index 1989ab9..143b9df 100755 --- a/demos/ota_nav2_sensor_fix/run-demo.sh +++ b/demos/ota_nav2_sensor_fix/run-demo.sh @@ -103,7 +103,7 @@ fi echo "" echo "[3/3] Waiting for gateway to come up..." 
-for i in 1 2 3 4 5 6 7 8 9 10 11 12; do +for _ in 1 2 3 4 5 6 7 8 9 10 11 12; do if curl -fsS "${GATEWAY_URL}/api/v1/health" >/dev/null 2>&1; then break fi From bf6d5401330e1803753f0ed191374167b5023b28 Mon Sep 17 00:00:00 2001 From: Bartosz Burda Date: Mon, 27 Apr 2026 11:03:00 +0200 Subject: [PATCH 33/52] ci(ota): build artifacts inside ros:jazzy container instead of installing ROS on runner The previous step installed ros-jazzy-ros-base + colcon on the runner, but that did not pull in python3-catkin-pkg, so the colcon build inside build_artifacts.sh tripped 'No module named catkin_pkg'. Easier and more faithful to how end-users build the demo: run build_artifacts.sh inside a ros:jazzy container with the demo dir bind-mounted, install only the build deps that the script actually needs, and chown the resulting catalog/tarballs back to the runner user. --- .github/workflows/ci.yml | 52 +++++++++++++++++++++------------------- 1 file changed, 28 insertions(+), 24 deletions(-) diff --git a/.github/workflows/ci.yml b/.github/workflows/ci.yml index 0a009f7..0432951 100644 --- a/.github/workflows/ci.yml +++ b/.github/workflows/ci.yml @@ -184,31 +184,35 @@ jobs: - name: Checkout repository uses: actions/checkout@v4 - - name: Set up Python 3.11 - uses: actions/setup-python@v5 - with: - python-version: "3.11" - - - name: Install pack_artifact dev deps - working-directory: demos/ota_nav2_sensor_fix/scripts - run: | - python -m venv .venv - .venv/bin/pip install --upgrade pip - .venv/bin/pip install pytest - - - name: Set up ROS 2 Jazzy (host) for build_artifacts.sh - run: | - sudo apt-get update - sudo apt-get install -y software-properties-common curl - sudo add-apt-repository universe -y - sudo curl -sSL https://raw.githubusercontent.com/ros/rosdistro/master/ros.key -o /usr/share/keyrings/ros-archive-keyring.gpg - echo "deb [arch=$(dpkg --print-architecture) signed-by=/usr/share/keyrings/ros-archive-keyring.gpg] http://packages.ros.org/ros2/ubuntu $(. 
/etc/os-release && echo $UBUNTU_CODENAME) main" | sudo tee /etc/apt/sources.list.d/ros2.list >/dev/null - sudo apt-get update - sudo apt-get install -y ros-jazzy-ros-base python3-colcon-common-extensions - - - name: Build artifacts (catalog + tarballs) + - name: Build artifacts (catalog + tarballs) inside ros:jazzy working-directory: demos/ota_nav2_sensor_fix - run: ./scripts/build_artifacts.sh + run: | + docker run --rm \ + -v "$PWD":/work \ + -w /work \ + ros:jazzy \ + bash -c ' + set -eu + apt-get update + apt-get install -y --no-install-recommends \ + python3-colcon-common-extensions \ + python3-catkin-pkg \ + python3-venv \ + python3-pip \ + build-essential \ + cmake \ + ros-jazzy-rclcpp \ + ros-jazzy-sensor-msgs \ + ros-jazzy-visualization-msgs + cd scripts + python3 -m venv .venv + .venv/bin/pip install --upgrade pip + .venv/bin/pip install pytest + cd .. + ./scripts/build_artifacts.sh + ' + # Restore ownership of files the container created as root. + sudo chown -R "$USER:$USER" . 
- name: Build and start OTA demo working-directory: demos/ota_nav2_sensor_fix From e89628e9e3b0f195bc908976213e81667ee1b633 Mon Sep 17 00:00:00 2001 From: Bartosz Burda Date: Mon, 27 Apr 2026 15:57:23 +0200 Subject: [PATCH 34/52] docs: list ota_nav2_sensor_fix in top-level README + smoke test catalog --- README.md | 31 ++++++++++++++++++++++++++++++- 1 file changed, 30 insertions(+), 1 deletion(-) diff --git a/README.md b/README.md index c21c948..164af8b 100644 --- a/README.md +++ b/README.md @@ -49,6 +49,7 @@ All demos support: | [TurtleBot3 Integration](demos/turtlebot3_integration/) | Full ros2_medkit integration with TurtleBot3 and Nav2 | SOVD-compliant API, manifest-based discovery, fault management | ✅ Ready | | [MoveIt Pick-and-Place](demos/moveit_pick_place/) | Panda 7-DOF arm with MoveIt 2 manipulation and ros2_medkit | Planning fault detection, controller monitoring, joint limits | ✅ Ready | | [Multi-ECU Aggregation](demos/multi_ecu_aggregation/) | Multi-ECU peer aggregation with 3 ECUs (perception, planning, actuation), mDNS discovery, cross-ECU functions | Peer aggregation, mDNS discovery, cross-ECU functions | ✅ Ready | +| [OTA over SOVD - nav2 sensor fix](demos/ota_nav2_sensor_fix/) | Dev-grade OTA plugin showing the SOVD `/updates` lifecycle - update a broken lidar node, install a new safety classifier, uninstall a deprecated package | SOVD-spec update / install / uninstall, native binary swap, fork+exec process management, Foxglove panel + curl scripts | ✅ Ready | ### Quick Start @@ -150,6 +151,32 @@ cd demos/multi_ecu_aggregation - Unified SOVD-compliant REST API spanning all ECUs - Web UI for browsing aggregated entity hierarchy +#### OTA over SOVD Demo (Dev-grade Update / Install / Uninstall) + +End-to-end demo of the SOVD `/updates` resource: a broken lidar node is +swapped with a fixed version over HTTP, an extra safety classifier is +installed from scratch, and a deprecated package is uninstalled - all +without SSH, all spec-compliant. 
+ +```bash +cd demos/ota_nav2_sensor_fix +./run-demo.sh # build artifacts + bring up gateway/plugin/update server +./check-demo.sh # show registered updates + per-id status + live process state +./trigger-update.sh # broken_lidar -> fixed_lidar (the headline) +./trigger-install.sh # install obstacle_classifier_v2 +./trigger-uninstall.sh # remove broken_lidar_legacy +./stop-demo.sh +``` + +**Features:** + +- Dev-grade `ota_update_plugin` C++ gateway plugin (UpdateProvider + GatewayPlugin) +- SOVD ISO 17978-3 compliant `/updates` resource: kind derived from + `updated_components` / `added_components` / `removed_components` metadata +- Native binary swap + `fork+exec` process management (no containers, no signing) +- Foxglove Studio panel mirrors the same SOVD client patterns as the web UI +- Pairs with the [`ros2_medkit_foxglove_extension`](https://github.com/selfpatch/ros2_medkit_foxglove_extension) Updates panel + ## Getting Started ### Prerequisites @@ -209,9 +236,11 @@ Each demo has automated smoke tests that verify the gateway starts and the REST ./tests/smoke_test.sh # Sensor diagnostics (full API coverage + fault injection + beacons) ./tests/smoke_test_turtlebot3.sh # TurtleBot3 (discovery, data, operations, scripts, triggers, logs) ./tests/smoke_test_moveit.sh # MoveIt pick-and-place (discovery, data, operations, scripts, triggers, logs) +./tests/smoke_test_multi_ecu.sh # Multi-ECU aggregation (per-ECU discovery + aggregated view) +./tests/smoke_test_ota.sh # OTA over SOVD (catalog, /updates spec shape, prepare/execute, process swap) ``` -CI runs all 4 demos in parallel - each job builds the Docker image, starts the container, and runs the smoke tests against it. See [CI workflow](.github/workflows/ci.yml). +CI runs all demos in parallel - each job builds the Docker image, starts the container, and runs the smoke tests against it. See [CI workflow](.github/workflows/ci.yml). 
## Related Projects From 43a66e32076a7f1bf2d8f23b8a972b393368bfa8 Mon Sep 17 00:00:00 2001 From: Bartosz Burda Date: Mon, 27 Apr 2026 16:37:28 +0200 Subject: [PATCH 35/52] feat(demos/ota): bake foxglove_bridge into gateway image (port 8765) Without foxglove_bridge there is nothing for Foxglove Studio to subscribe to - no /scan, no /tf, no 3D visual story. The Updates panel itself is just the SOVD HTTP client and works without it, but the broader demo narrative (phantom obstacle visible, robot stuck) needs the topic stream. Adds ros-jazzy-foxglove-bridge to the runtime stage of Dockerfile.gateway, launches it from entrypoint.sh on port 8765 (0.0.0.0), and maps the port through compose with OTA_FOXGLOVE_BRIDGE_PORT override. Verified live: 'Server listening on port 8765' and channels for /scan, /rosout, /fault_manager/events advertised at startup. --- demos/ota_nav2_sensor_fix/Dockerfile.gateway | 3 ++- demos/ota_nav2_sensor_fix/README.md | 26 ++++++++++++++------ demos/ota_nav2_sensor_fix/docker-compose.yml | 1 + demos/ota_nav2_sensor_fix/entrypoint.sh | 7 ++++++ demos/ota_nav2_sensor_fix/run-demo.sh | 5 ++-- 5 files changed, 32 insertions(+), 10 deletions(-) diff --git a/demos/ota_nav2_sensor_fix/Dockerfile.gateway b/demos/ota_nav2_sensor_fix/Dockerfile.gateway index e355e74..024ec19 100644 --- a/demos/ota_nav2_sensor_fix/Dockerfile.gateway +++ b/demos/ota_nav2_sensor_fix/Dockerfile.gateway @@ -59,6 +59,7 @@ RUN apt-get update && apt-get install -y --no-install-recommends \ ros-jazzy-visualization-msgs \ ros-jazzy-launch-ros \ ros-jazzy-test-msgs \ + ros-jazzy-foxglove-bridge \ libcpp-httplib-dev \ libsystemd-dev \ nlohmann-json3-dev \ @@ -73,5 +74,5 @@ RUN chmod +x /usr/local/bin/entrypoint.sh ENV ROS_DOMAIN_ID=42 -EXPOSE 8080 +EXPOSE 8080 8765 ENTRYPOINT ["/usr/local/bin/entrypoint.sh"] diff --git a/demos/ota_nav2_sensor_fix/README.md b/demos/ota_nav2_sensor_fix/README.md index 940a800..7024530 100644 --- a/demos/ota_nav2_sensor_fix/README.md +++ 
b/demos/ota_nav2_sensor_fix/README.md @@ -46,13 +46,25 @@ If host port 8080 is taken, override with `OTA_GATEWAY_PORT=8081 ./run-demo.sh`. Tear down: `docker compose down`. -## Adding a Foxglove visualization - -Install the `ros2_medkit_foxglove_extension` (which now ships an Updates -panel - see https://github.com/selfpatch/ros2_medkit_foxglove_extension) -in your local Foxglove Studio, then point it at -`http://localhost:8080/api/v1`. The Updates panel exposes Prepare and -Execute buttons next to each catalog entry. +## Foxglove Studio visualization + +The gateway container also runs `foxglove_bridge` on port `8765` so +Foxglove Studio can subscribe to ROS 2 topics (e.g. `/scan` from +broken_lidar / fixed_lidar). + +1. Open Foxglove Studio -> **Open connection** -> **Foxglove WebSocket** -> + `ws://localhost:8765`. You should see `/scan` and other topics in the + Topics panel. +2. Install the [`ros2_medkit_foxglove_extension`](https://github.com/selfpatch/ros2_medkit_foxglove_extension) + (`npm run local-install` in that repo, or drag-and-drop the `.foxe` + onto Foxglove). It ships three panels: Entity Browser, Faults Dashboard, + and **ros2_medkit Updates**. +3. Add the **ros2_medkit Updates** panel and set its `baseUrl` to + `http://localhost:8080/api/v1` (or the port you picked via + `OTA_GATEWAY_PORT`). +4. Click **Prepare** and **Execute** in the Updates panel - the same SOVD + endpoints `trigger-update.sh` hits, with progress feedback in the panel + and live `/scan` updates in the 3D scene. 
## Adding nav2 / a sim diff --git a/demos/ota_nav2_sensor_fix/docker-compose.yml b/demos/ota_nav2_sensor_fix/docker-compose.yml index 52c8644..e132b38 100644 --- a/demos/ota_nav2_sensor_fix/docker-compose.yml +++ b/demos/ota_nav2_sensor_fix/docker-compose.yml @@ -16,6 +16,7 @@ services: networks: [otanet] ports: - "${OTA_GATEWAY_PORT:-8080}:8080" + - "${OTA_FOXGLOVE_BRIDGE_PORT:-8765}:8765" environment: ROS_DOMAIN_ID: 42 depends_on: diff --git a/demos/ota_nav2_sensor_fix/entrypoint.sh b/demos/ota_nav2_sensor_fix/entrypoint.sh index 5cf39f1..c9e356a 100755 --- a/demos/ota_nav2_sensor_fix/entrypoint.sh +++ b/demos/ota_nav2_sensor_fix/entrypoint.sh @@ -18,6 +18,13 @@ source /ws/install/setup.bash ros2 run broken_lidar broken_lidar_node & ros2 run broken_lidar_legacy broken_lidar_legacy & +# foxglove_bridge: WebSocket server on :8765 so Foxglove Studio can +# subscribe to /scan, /tf, and any topic the demo nodes publish. Required +# for the visual narrative (3D scene + phantom obstacle); the SOVD Updates +# panel itself only needs the gateway HTTP API. +ros2 run foxglove_bridge foxglove_bridge \ + --ros-args -p port:=8765 -p address:=0.0.0.0 & + # Foreground gateway. Pass the config file directly to the gateway_node # executable (the gateway.launch.py wrapper does not expose a config_file # argument, so we invoke the executable directly to thread our YAML in). diff --git a/demos/ota_nav2_sensor_fix/run-demo.sh b/demos/ota_nav2_sensor_fix/run-demo.sh index 143b9df..0b45633 100755 --- a/demos/ota_nav2_sensor_fix/run-demo.sh +++ b/demos/ota_nav2_sensor_fix/run-demo.sh @@ -119,8 +119,9 @@ fi echo "" echo "Demo is up." 
echo "" -echo " Gateway HTTP API: ${GATEWAY_URL}/api/v1/" -echo " Update server: http://localhost:9000/catalog" +echo " Gateway HTTP API: ${GATEWAY_URL}/api/v1/" +echo " Foxglove WebSocket: ws://localhost:${OTA_FOXGLOVE_BRIDGE_PORT:-8765}" +echo " Update server: http://localhost:9000/catalog" echo "" echo "Registered updates:" if command -v jq >/dev/null 2>&1; then From 26e61f43856163aa7289c350ff65a88a8ed1f29a Mon Sep 17 00:00:00 2001 From: Bartosz Burda Date: Mon, 27 Apr 2026 17:21:20 +0200 Subject: [PATCH 36/52] fix(demos/ota): launch fault_manager_node so /faults endpoint responds Without /fault_manager/* services running, the gateway's /faults endpoint hangs waiting for the service call (default 5s timeout) and the Faults Dashboard panel surfaces it as 503. Adds 'ros2 run ros2_medkit_fault_manager fault_manager_node' to entrypoint.sh - the gateway image already builds the package; we just need to run it. Defaults are fine for the demo (SQLite at /var/lib/ros2_medkit/faults.db, snapshot capture enabled). --- demos/ota_nav2_sensor_fix/entrypoint.sh | 6 ++++++ 1 file changed, 6 insertions(+) diff --git a/demos/ota_nav2_sensor_fix/entrypoint.sh b/demos/ota_nav2_sensor_fix/entrypoint.sh index c9e356a..ff04742 100755 --- a/demos/ota_nav2_sensor_fix/entrypoint.sh +++ b/demos/ota_nav2_sensor_fix/entrypoint.sh @@ -15,6 +15,12 @@ source /ws/install/setup.bash # Demo nodes the plugin will swap (broken_lidar -> fixed_lidar) and # uninstall (broken_lidar_legacy). obstacle_classifier_v2 is installed # fresh by the demo and not started here. +# Fault manager: serves /fault_manager/* services that the gateway's +# /faults endpoint calls. Without it /faults hangs because the gateway +# blocks waiting for the service. Default parameters are fine for the +# demo (in-memory store, no persistence). 
+ros2 run ros2_medkit_fault_manager fault_manager_node & + ros2 run broken_lidar broken_lidar_node & ros2 run broken_lidar_legacy broken_lidar_legacy & From add3bb438c47ec1039491c8f94d6922c7f30a416 Mon Sep 17 00:00:00 2001 From: Bartosz Burda Date: Mon, 27 Apr 2026 21:11:00 +0200 Subject: [PATCH 37/52] feat(demos/ota): bake TurtleBot3 + Nav2 + headless Gazebo into demo The OTA story needs a robot Foxglove can render. Previously the demo only spun up broken_lidar / broken_lidar_legacy and pushed nav2 to a "BYO sim" footnote - so a viewer opening Foxglove just saw a /scan topic with no robot in 3D space. Now `docker compose up` produces a self-contained scene: - ros-jazzy-turtlebot3-* + nav2-* + ros-gz-sim baked into the runtime image - ota_nav2_sensor_fix_demo package owns the launch + nav2_params + map config; entrypoint hands off to `ros2 launch` - spawn_turtlebot3 is wrapped in a SetRemap GroupAction that pushes gz-bridge's /scan to /scan_sim, so broken_lidar (and later fixed_lidar) is the sole publisher on /scan that nav2 + Foxglove consume - nav2 launched with use_composition:=False to dodge the apt-shipped nav2_msgs / fastcdr 2.2.5 typesupport ABI mismatch on Jazzy - RMW pinned to cyclonedds (same root cause - fastrtps typesupport pulls a fastcdr symbol the runtime doesn't export) - shm_size:2gb on the gateway service so gz-sim doesn't wedge on /dev/shm exhaustion Image grows by ~3GB; the trade-off is no external sim setup, the Foxglove 3D panel renders the robot in turtlebot3_world out of the box, and the OTA narrative ("phantom obstacle disappears after the swap") becomes literally visible. 
--- demos/ota_nav2_sensor_fix/Dockerfile.gateway | 32 +- demos/ota_nav2_sensor_fix/README.md | 55 +-- demos/ota_nav2_sensor_fix/docker-compose.yml | 14 +- demos/ota_nav2_sensor_fix/entrypoint.sh | 34 +- .../ota_nav2_sensor_fix_demo/CMakeLists.txt | 14 + .../config/nav2_params.yaml | 353 ++++++++++++++++++ .../config/turtlebot3_world.yaml | 7 + .../launch/demo.launch.py | 241 ++++++++++++ .../ota_nav2_sensor_fix_demo/package.xml | 36 ++ 9 files changed, 732 insertions(+), 54 deletions(-) create mode 100644 demos/ota_nav2_sensor_fix/ros2_packages/ota_nav2_sensor_fix_demo/CMakeLists.txt create mode 100644 demos/ota_nav2_sensor_fix/ros2_packages/ota_nav2_sensor_fix_demo/config/nav2_params.yaml create mode 100644 demos/ota_nav2_sensor_fix/ros2_packages/ota_nav2_sensor_fix_demo/config/turtlebot3_world.yaml create mode 100644 demos/ota_nav2_sensor_fix/ros2_packages/ota_nav2_sensor_fix_demo/launch/demo.launch.py create mode 100644 demos/ota_nav2_sensor_fix/ros2_packages/ota_nav2_sensor_fix_demo/package.xml diff --git a/demos/ota_nav2_sensor_fix/Dockerfile.gateway b/demos/ota_nav2_sensor_fix/Dockerfile.gateway index 024ec19..2375db4 100644 --- a/demos/ota_nav2_sensor_fix/Dockerfile.gateway +++ b/demos/ota_nav2_sensor_fix/Dockerfile.gateway @@ -52,6 +52,10 @@ RUN . /opt/ros/jazzy/setup.sh && \ FROM ros:jazzy +# Runtime dependencies. Beyond the gateway/plugin bare minimum we also pull in +# TurtleBot3, Nav2, and gz-sim so the container can self-host the visual demo +# (TB3 + headless Gazebo + Nav2) - no external sim required, the OTA story +# becomes "Foxglove sees a stuck robot, run an update, robot unsticks". 
RUN apt-get update && apt-get install -y --no-install-recommends \ ros-jazzy-rclcpp \ ros-jazzy-rclcpp-lifecycle \ @@ -60,6 +64,22 @@ RUN apt-get update && apt-get install -y --no-install-recommends \ ros-jazzy-launch-ros \ ros-jazzy-test-msgs \ ros-jazzy-foxglove-bridge \ + ros-jazzy-turtlebot3-gazebo \ + ros-jazzy-turtlebot3-msgs \ + ros-jazzy-turtlebot3-description \ + ros-jazzy-turtlebot3-navigation2 \ + ros-jazzy-nav2-bringup \ + ros-jazzy-nav2-bt-navigator \ + ros-jazzy-nav2-controller \ + ros-jazzy-nav2-planner \ + ros-jazzy-nav2-behaviors \ + ros-jazzy-nav2-costmap-2d \ + ros-jazzy-nav2-lifecycle-manager \ + ros-jazzy-nav2-map-server \ + ros-jazzy-nav2-amcl \ + ros-jazzy-ros-gz-sim \ + ros-jazzy-ros-gz-bridge \ + ros-jazzy-rmw-cyclonedds-cpp \ libcpp-httplib-dev \ libsystemd-dev \ nlohmann-json3-dev \ @@ -72,7 +92,17 @@ COPY gateway_config.yaml /etc/ros2_medkit/gateway_config.yaml COPY entrypoint.sh /usr/local/bin/entrypoint.sh RUN chmod +x /usr/local/bin/entrypoint.sh -ENV ROS_DOMAIN_ID=42 +# Default TB3 model + gz-sim resource path so spawn_turtlebot3 + gz can find +# the burger URDF/world models without the launch file having to set them. +# RMW: jazzy's apt-shipped nav2_msgs fastrtps typesupport pulls +# eprosima::fastcdr::Cdr::serialize(uint32_t), which the bundled +# ros-jazzy-fastcdr 2.2.5 does NOT export - amcl/controller_server segfault +# at startup. Switch to cyclonedds, which doesn't use the broken typesupport. +ENV ROS_DOMAIN_ID=42 \ + TURTLEBOT3_MODEL=burger \ + GAZEBO_MODEL_PATH=/opt/ros/jazzy/share/turtlebot3_gazebo/models \ + HEADLESS=true \ + RMW_IMPLEMENTATION=rmw_cyclonedds_cpp EXPOSE 8080 8765 ENTRYPOINT ["/usr/local/bin/entrypoint.sh"] diff --git a/demos/ota_nav2_sensor_fix/README.md b/demos/ota_nav2_sensor_fix/README.md index 7024530..95944ab 100644 --- a/demos/ota_nav2_sensor_fix/README.md +++ b/demos/ota_nav2_sensor_fix/README.md @@ -48,35 +48,44 @@ Tear down: `docker compose down`. 
## Foxglove Studio visualization -The gateway container also runs `foxglove_bridge` on port `8765` so -Foxglove Studio can subscribe to ROS 2 topics (e.g. `/scan` from -broken_lidar / fixed_lidar). +The gateway container bakes in a TurtleBot3 burger + Nav2 stack running on +top of headless Gazebo. `foxglove_bridge` runs on port `8765` and exposes +the full topic set: `/tf`, `/tf_static`, `/scan`, `/odom`, `/map`, +`/cmd_vel`, `/global_costmap/costmap`, `/local_costmap/costmap`, etc. - so a +Foxglove **3D** panel renders the actual robot in the world out of the box. 1. Open Foxglove Studio -> **Open connection** -> **Foxglove WebSocket** -> - `ws://localhost:8765`. You should see `/scan` and other topics in the - Topics panel. -2. Install the [`ros2_medkit_foxglove_extension`](https://github.com/selfpatch/ros2_medkit_foxglove_extension) + `ws://localhost:8765`. The Topics panel should list all of the topics + above. +2. Drop in a **3D** panel. You should see the TB3 burger sitting in the + default `turtlebot3_world.world` map, with the laser scan cone visible. + Before the OTA update, ray index 180 reports a phantom 1 m return - the + "obstacle" the demo's narrative pivots on. +3. Install the [`ros2_medkit_foxglove_extension`](https://github.com/selfpatch/ros2_medkit_foxglove_extension) (`npm run local-install` in that repo, or drag-and-drop the `.foxe` onto Foxglove). It ships three panels: Entity Browser, Faults Dashboard, and **ros2_medkit Updates**. -3. Add the **ros2_medkit Updates** panel and set its `baseUrl` to +4. Add the **ros2_medkit Updates** panel and set its `baseUrl` to `http://localhost:8080/api/v1` (or the port you picked via - `OTA_GATEWAY_PORT`). -4. Click **Prepare** and **Execute** in the Updates panel - the same SOVD - endpoints `trigger-update.sh` hits, with progress feedback in the panel - and live `/scan` updates in the 3D scene. 
- -## Adding nav2 / a sim - -This demo intentionally omits a nav2 sim from the compose so the stack stays -small and reliably reproducible. To make the visual story complete: - -- Bring up your favourite turtlebot3 sim (`turtlebot3_gazebo`) and point it - at `ROS_DOMAIN_ID=42` to share the DDS namespace with the gateway. -- The broken_lidar node publishes a phantom return on `/scan` ~1m straight - ahead. nav2's costmap will trace it as an obstacle and the planner will - refuse to drive forward. After the update flow, fixed_lidar publishes a - clean scan and the path planner unblocks. + `OTA_GATEWAY_PORT`). Click **Prepare** then **Execute** for + `fixed_lidar_2_1_0`. The 3D panel should show the phantom return + disappearing as `broken_lidar` is killed and `fixed_lidar` starts. + +### Driving the robot to make the narrative reproducible + +The demo doesn't auto-publish a navigation goal - that keeps it deterministic +for CI. To trigger the "robot stuck on phantom obstacle" beat manually: + +```bash +# From the host (or any container on the same ROS_DOMAIN_ID=42): +ros2 topic pub --once /goal_pose geometry_msgs/PoseStamped \ + '{header: {frame_id: map}, pose: {position: {x: 1.5, y: 0.0, z: 0.0}, orientation: {w: 1.0}}}' +``` + +Foxglove's **3D** panel also has a built-in "Publish" tool - select pose +mode, click a point ahead of the robot, and Foxglove publishes `/goal_pose` +for you. Before the update, Nav2 refuses to drive through the phantom return; +after `trigger-update.sh`, the robot completes the goal. ## Disclosures diff --git a/demos/ota_nav2_sensor_fix/docker-compose.yml b/demos/ota_nav2_sensor_fix/docker-compose.yml index e132b38..f3d9cf3 100644 --- a/demos/ota_nav2_sensor_fix/docker-compose.yml +++ b/demos/ota_nav2_sensor_fix/docker-compose.yml @@ -2,9 +2,10 @@ # Apache 2.0 # # Two-service stack: the gateway (with ota_update_plugin baked in plus the -# demo nodes the plugin will manage) and the FastAPI artifact server. 
nav2 -# and Foxglove are intentionally out of scope here - see README for how to -# bring your own. +# nav2 + TB3 + headless Gazebo + foxglove_bridge orchestration) and the +# FastAPI artifact server. The gateway image is hefty (~5GB) because it bakes +# the simulator in - the trade-off is `docker compose up` produces a robot +# Foxglove can render, no external sim setup required. services: gateway: @@ -19,6 +20,13 @@ services: - "${OTA_FOXGLOVE_BRIDGE_PORT:-8765}:8765" environment: ROS_DOMAIN_ID: 42 + HEADLESS: "true" + TURTLEBOT3_MODEL: burger + # Gazebo / DDS appreciate generous shared memory; without this + # /dev/shm fills and gz-sim tends to wedge on shutdown. + shm_size: "2gb" + tty: true + stdin_open: true depends_on: - ota_update_server diff --git a/demos/ota_nav2_sensor_fix/entrypoint.sh b/demos/ota_nav2_sensor_fix/entrypoint.sh index ff04742..916c5aa 100755 --- a/demos/ota_nav2_sensor_fix/entrypoint.sh +++ b/demos/ota_nav2_sensor_fix/entrypoint.sh @@ -2,8 +2,9 @@ # Copyright 2026 bburda # Apache 2.0 # -# Container entrypoint: launches the demo nodes that the OTA plugin will -# manage at runtime, then forks the gateway as PID 1's foreground process. +# Container entrypoint: hands off to the ota_nav2_sensor_fix_demo launch file +# which orchestrates everything (TB3 + Nav2 + headless Gazebo + foxglove_bridge +# + fault_manager + broken_lidar/legacy + gateway w/ ota_update_plugin). set -e @@ -12,29 +13,8 @@ source /opt/ros/jazzy/setup.bash # shellcheck disable=SC1091 source /ws/install/setup.bash -# Demo nodes the plugin will swap (broken_lidar -> fixed_lidar) and -# uninstall (broken_lidar_legacy). obstacle_classifier_v2 is installed -# fresh by the demo and not started here. -# Fault manager: serves /fault_manager/* services that the gateway's -# /faults endpoint calls. Without it /faults hangs because the gateway -# blocks waiting for the service. Default parameters are fine for the -# demo (in-memory store, no persistence). 
-ros2 run ros2_medkit_fault_manager fault_manager_node & +# Default to headless; an operator on a workstation can flip via env var. +HEADLESS_ARG="${HEADLESS:-true}" -ros2 run broken_lidar broken_lidar_node & -ros2 run broken_lidar_legacy broken_lidar_legacy & - -# foxglove_bridge: WebSocket server on :8765 so Foxglove Studio can -# subscribe to /scan, /tf, and any topic the demo nodes publish. Required -# for the visual narrative (3D scene + phantom obstacle); the SOVD Updates -# panel itself only needs the gateway HTTP API. -ros2 run foxglove_bridge foxglove_bridge \ - --ros-args -p port:=8765 -p address:=0.0.0.0 & - -# Foreground gateway. Pass the config file directly to the gateway_node -# executable (the gateway.launch.py wrapper does not expose a config_file -# argument, so we invoke the executable directly to thread our YAML in). -exec ros2 run ros2_medkit_gateway gateway_node \ - --ros-args \ - --params-file /etc/ros2_medkit/gateway_config.yaml \ - --log-level info +exec ros2 launch ota_nav2_sensor_fix_demo demo.launch.py \ + "headless:=${HEADLESS_ARG}" diff --git a/demos/ota_nav2_sensor_fix/ros2_packages/ota_nav2_sensor_fix_demo/CMakeLists.txt b/demos/ota_nav2_sensor_fix/ros2_packages/ota_nav2_sensor_fix_demo/CMakeLists.txt new file mode 100644 index 0000000..a431a82 --- /dev/null +++ b/demos/ota_nav2_sensor_fix/ros2_packages/ota_nav2_sensor_fix_demo/CMakeLists.txt @@ -0,0 +1,14 @@ +cmake_minimum_required(VERSION 3.16) +project(ota_nav2_sensor_fix_demo) + +find_package(ament_cmake REQUIRED) + +install(DIRECTORY launch/ + DESTINATION share/${PROJECT_NAME}/launch +) + +install(DIRECTORY config/ + DESTINATION share/${PROJECT_NAME}/config +) + +ament_package() diff --git a/demos/ota_nav2_sensor_fix/ros2_packages/ota_nav2_sensor_fix_demo/config/nav2_params.yaml b/demos/ota_nav2_sensor_fix/ros2_packages/ota_nav2_sensor_fix_demo/config/nav2_params.yaml new file mode 100644 index 0000000..aa8753d --- /dev/null +++ 
b/demos/ota_nav2_sensor_fix/ros2_packages/ota_nav2_sensor_fix_demo/config/nav2_params.yaml @@ -0,0 +1,353 @@ +# Nav2 parameters for TurtleBot3 + ros2_medkit demo +# Based on default nav2_bringup parameters for TurtleBot3 burger + +# Lifecycle manager configuration - exclude docking_server which we don't use +lifecycle_manager_navigation: + ros__parameters: + use_sim_time: True + autostart: True + node_names: + - controller_server + - smoother_server + - planner_server + - behavior_server + - bt_navigator + - waypoint_follower + - velocity_smoother + - collision_monitor + # Note: docking_server and route_server removed - not configured for this demo + +amcl: + ros__parameters: + use_sim_time: True + alpha1: 0.2 + alpha2: 0.2 + alpha3: 0.2 + alpha4: 0.2 + alpha5: 0.2 + base_frame_id: "base_footprint" + beam_skip_distance: 0.5 + beam_skip_error_threshold: 0.9 + beam_skip_threshold: 0.3 + do_beamskip: false + global_frame_id: "map" + lambda_short: 0.1 + laser_likelihood_max_dist: 2.0 + laser_max_range: 100.0 + laser_min_range: -1.0 + laser_model_type: "likelihood_field" + max_beams: 60 + max_particles: 2000 + min_particles: 500 + odom_frame_id: "odom" + pf_err: 0.05 + pf_z: 0.99 + recovery_alpha_fast: 0.0 + recovery_alpha_slow: 0.0 + resample_interval: 1 + robot_model_type: "nav2_amcl::DifferentialMotionModel" + save_pose_rate: 0.5 + sigma_hit: 0.2 + tf_broadcast: true + transform_tolerance: 1.0 + update_min_a: 0.2 + update_min_d: 0.25 + z_hit: 0.5 + z_max: 0.05 + z_rand: 0.5 + z_short: 0.05 + scan_topic: scan + set_initial_pose: true + initial_pose: + x: 0.0 + y: 0.0 + z: 0.0 + yaw: 0.0 + +bt_navigator: + ros__parameters: + use_sim_time: True + global_frame: map + robot_base_frame: base_link + odom_topic: /odom + bt_loop_duration: 10 + default_server_timeout: 20 + wait_for_service_timeout: 1000 + action_server_result_timeout: 900.0 + navigators: ["navigate_to_pose", "navigate_through_poses"] + navigate_to_pose: + plugin: "nav2_bt_navigator::NavigateToPoseNavigator" + 
navigate_through_poses: + plugin: "nav2_bt_navigator::NavigateThroughPosesNavigator" + # Note: plugin_lib_names is no longer needed in Jazzy - plugins are auto-loaded + +controller_server: + ros__parameters: + use_sim_time: True + enable_stamped_cmd_vel: True + controller_frequency: 20.0 + min_x_velocity_threshold: 0.001 + min_y_velocity_threshold: 0.5 + min_theta_velocity_threshold: 0.001 + failure_tolerance: 0.3 + progress_checker_plugins: ["progress_checker"] + goal_checker_plugins: ["general_goal_checker"] + controller_plugins: ["FollowPath"] + odom_topic: "odom" + + progress_checker: + plugin: "nav2_controller::SimpleProgressChecker" + required_movement_radius: 0.5 + movement_time_allowance: 10.0 + + general_goal_checker: + stateful: True + plugin: "nav2_controller::SimpleGoalChecker" + xy_goal_tolerance: 0.25 + yaw_goal_tolerance: 0.25 + + FollowPath: + plugin: "dwb_core::DWBLocalPlanner" + debug_trajectory_details: True + min_vel_x: 0.0 + min_vel_y: 0.0 + max_vel_x: 0.26 + max_vel_y: 0.0 + max_vel_theta: 1.0 + min_speed_xy: 0.0 + max_speed_xy: 0.26 + min_speed_theta: 0.0 + acc_lim_x: 2.5 + acc_lim_y: 0.0 + acc_lim_theta: 3.2 + decel_lim_x: -2.5 + decel_lim_y: 0.0 + decel_lim_theta: -3.2 + vx_samples: 20 + vy_samples: 5 + vtheta_samples: 20 + sim_time: 1.7 + linear_granularity: 0.05 + angular_granularity: 0.025 + transform_tolerance: 0.2 + xy_goal_tolerance: 0.25 + trans_stopped_velocity: 0.25 + short_circuit_trajectory_evaluation: True + stateful: True + critics: ["RotateToGoal", "Oscillation", "BaseObstacle", "GoalAlign", "PathAlign", "PathDist", "GoalDist"] + BaseObstacle.scale: 0.02 + PathAlign.scale: 32.0 + PathAlign.forward_point_distance: 0.1 + GoalAlign.scale: 24.0 + GoalAlign.forward_point_distance: 0.1 + PathDist.scale: 32.0 + GoalDist.scale: 24.0 + RotateToGoal.scale: 32.0 + RotateToGoal.slowing_factor: 5.0 + RotateToGoal.lookahead_time: -1.0 + +local_costmap: + local_costmap: + ros__parameters: + update_frequency: 5.0 + publish_frequency: 2.0 + 
global_frame: odom + robot_base_frame: base_link + use_sim_time: True + rolling_window: true + width: 3 + height: 3 + resolution: 0.05 + robot_radius: 0.22 + plugins: ["voxel_layer", "inflation_layer"] + inflation_layer: + plugin: "nav2_costmap_2d::InflationLayer" + cost_scaling_factor: 3.0 + inflation_radius: 0.55 + voxel_layer: + plugin: "nav2_costmap_2d::VoxelLayer" + enabled: True + publish_voxel_map: True + origin_z: 0.0 + z_resolution: 0.05 + z_voxels: 16 + max_obstacle_height: 2.0 + mark_threshold: 0 + observation_sources: scan + scan: + topic: /scan + max_obstacle_height: 2.0 + clearing: True + marking: True + data_type: "LaserScan" + raytrace_max_range: 3.0 + raytrace_min_range: 0.0 + obstacle_max_range: 2.5 + obstacle_min_range: 0.0 + static_layer: + plugin: "nav2_costmap_2d::StaticLayer" + map_subscribe_transient_local: True + always_send_full_costmap: True + +global_costmap: + global_costmap: + ros__parameters: + update_frequency: 1.0 + publish_frequency: 1.0 + global_frame: map + robot_base_frame: base_link + use_sim_time: True + robot_radius: 0.22 + resolution: 0.05 + track_unknown_space: true + plugins: ["static_layer", "obstacle_layer", "inflation_layer"] + obstacle_layer: + plugin: "nav2_costmap_2d::ObstacleLayer" + enabled: True + observation_sources: scan + scan: + topic: /scan + max_obstacle_height: 2.0 + clearing: True + marking: True + data_type: "LaserScan" + raytrace_max_range: 3.0 + raytrace_min_range: 0.0 + obstacle_max_range: 2.5 + obstacle_min_range: 0.0 + static_layer: + plugin: "nav2_costmap_2d::StaticLayer" + map_subscribe_transient_local: True + inflation_layer: + plugin: "nav2_costmap_2d::InflationLayer" + cost_scaling_factor: 3.0 + inflation_radius: 0.55 + always_send_full_costmap: True + +planner_server: + ros__parameters: + expected_planner_frequency: 20.0 + use_sim_time: True + planner_plugins: ["GridBased"] + GridBased: + plugin: "nav2_navfn_planner::NavfnPlanner" + tolerance: 0.5 + use_astar: false + allow_unknown: true + 
+smoother_server: + ros__parameters: + use_sim_time: True + smoother_plugins: ["simple_smoother"] + simple_smoother: + plugin: "nav2_smoother::SimpleSmoother" + tolerance: 1.0e-10 + max_its: 1000 + do_refinement: True + +behavior_server: + ros__parameters: + enable_stamped_cmd_vel: True + local_costmap_topic: local_costmap/costmap_raw + global_costmap_topic: global_costmap/costmap_raw + local_footprint_topic: local_costmap/published_footprint + global_footprint_topic: global_costmap/published_footprint + cycle_frequency: 10.0 + behavior_plugins: ["spin", "backup", "drive_on_heading", "assisted_teleop", "wait"] + spin: + plugin: "nav2_behaviors::Spin" + backup: + plugin: "nav2_behaviors::BackUp" + drive_on_heading: + plugin: "nav2_behaviors::DriveOnHeading" + wait: + plugin: "nav2_behaviors::Wait" + assisted_teleop: + plugin: "nav2_behaviors::AssistedTeleop" + local_frame: odom + global_frame: map + robot_base_frame: base_link + transform_tolerance: 0.1 + use_sim_time: True + simulate_ahead_time: 2.0 + max_rotational_vel: 1.0 + min_rotational_vel: 0.4 + rotational_acc_lim: 3.2 + +waypoint_follower: + ros__parameters: + use_sim_time: True + loop_rate: 20 + stop_on_failure: false + action_server_result_timeout: 900.0 + waypoint_task_executor_plugin: "wait_at_waypoint" + wait_at_waypoint: + plugin: "nav2_waypoint_follower::WaitAtWaypoint" + enabled: True + waypoint_pause_duration: 200 + +velocity_smoother: + ros__parameters: + use_sim_time: True + enable_stamped_cmd_vel: True + smoothing_frequency: 20.0 + scale_velocities: False + feedback: "OPEN_LOOP" + max_velocity: [0.26, 0.0, 1.0] + min_velocity: [-0.26, 0.0, -1.0] + max_accel: [2.5, 0.0, 3.2] + max_decel: [-2.5, 0.0, -3.2] + odom_topic: "odom" + odom_duration: 0.1 + deadband_velocity: [0.0, 0.0, 0.0] + velocity_timeout: 1.0 + +collision_monitor: + ros__parameters: + use_sim_time: True + enable_stamped_cmd_vel: True + base_frame_id: "base_link" + odom_frame_id: "odom" + cmd_vel_in_topic: "cmd_vel_smoothed" + 
cmd_vel_out_topic: "cmd_vel" + state_topic: "collision_monitor_state" + transform_tolerance: 0.2 + source_timeout: 1.0 + base_shift_correction: True + stop_pub_timeout: 2.0 + polygons: ["FootprintApproach"] + FootprintApproach: + type: "polygon" + action_type: "approach" + footprint_topic: "/local_costmap/published_footprint" + time_before_collision: 1.2 + simulation_time_step: 0.1 + min_points: 6 + visualize: False + enabled: True + observation_sources: ["scan"] + scan: + type: "scan" + topic: "scan" + min_height: 0.15 + max_height: 2.0 + enabled: True + +# Docking server - minimal config to satisfy lifecycle manager +# We don't actually use docking in this demo +docking_server: + ros__parameters: + use_sim_time: True + enable_stamped_cmd_vel: True + dock_plugins: ["simple_charging_dock"] + simple_charging_dock: + plugin: "opennav_docking::SimpleChargingDock" + use_external_detection_pose: false + docking_threshold: 0.02 + staging_x_offset: -0.5 + staging_yaw_offset: 0.0 + +# Route server - minimal config +route_server: + ros__parameters: + use_sim_time: True diff --git a/demos/ota_nav2_sensor_fix/ros2_packages/ota_nav2_sensor_fix_demo/config/turtlebot3_world.yaml b/demos/ota_nav2_sensor_fix/ros2_packages/ota_nav2_sensor_fix_demo/config/turtlebot3_world.yaml new file mode 100644 index 0000000..ad2e683 --- /dev/null +++ b/demos/ota_nav2_sensor_fix/ros2_packages/ota_nav2_sensor_fix_demo/config/turtlebot3_world.yaml @@ -0,0 +1,7 @@ +image: /opt/ros/jazzy/share/turtlebot3_navigation2/map/map.pgm +mode: trinary +resolution: 0.05 +origin: [-1.76, -2.42, 0.0] +negate: 0 +occupied_thresh: 0.65 +free_thresh: 0.196 diff --git a/demos/ota_nav2_sensor_fix/ros2_packages/ota_nav2_sensor_fix_demo/launch/demo.launch.py b/demos/ota_nav2_sensor_fix/ros2_packages/ota_nav2_sensor_fix_demo/launch/demo.launch.py new file mode 100644 index 0000000..4170833 --- /dev/null +++ b/demos/ota_nav2_sensor_fix/ros2_packages/ota_nav2_sensor_fix_demo/launch/demo.launch.py @@ -0,0 +1,241 @@ +# 
Copyright 2026 bburda +# Apache 2.0 +# +# Single launch entry point for the OTA over SOVD nav2 sensor-fix demo. +# +# Brings up, in one container: +# - headless Gazebo (gz-sim) with the TurtleBot3 world +# - TurtleBot3 (burger) spawned in that world + robot_state_publisher +# - the full Nav2 stack (bringup_launch.py) with the TB3 default map +# - foxglove_bridge on :8765 so Foxglove Studio can render /tf, /scan, /map etc. +# - ros2_medkit fault_manager (the gateway's /faults endpoint depends on it) +# - broken_lidar + broken_lidar_legacy (the OTA plugin swaps/uninstalls these) +# - the gateway with our ota_update_plugin loaded via gateway_config.yaml +# +# /scan-override strategy +# ----------------------- +# The TB3 gz-sim simulation publishes its own LaserScan on /scan via the +# ros_gz_bridge. broken_lidar also publishes on /scan with a phantom obstacle +# at ray index 180. Two publishers on /scan would interleave - nav2 would see +# garbage. We solve this by wrapping spawn_turtlebot3 in a GroupAction with a +# SetRemap that pushes the simulator's /scan to /scan_sim. That way the gz +# bridge inside the spawn launch ends up emitting /scan_sim, not /scan, and +# broken_lidar (and later fixed_lidar) is the sole publisher on /scan that +# nav2 + foxglove see. 
+ +import os + +from ament_index_python.packages import ( + get_package_prefix, + get_package_share_directory, + PackageNotFoundError, +) +from launch import LaunchDescription +from launch.actions import ( + AppendEnvironmentVariable, + DeclareLaunchArgument, + GroupAction, + IncludeLaunchDescription, + SetEnvironmentVariable, +) +from launch.conditions import IfCondition, UnlessCondition +from launch.launch_description_sources import PythonLaunchDescriptionSource +from launch.substitutions import LaunchConfiguration +from launch_ros.actions import Node, SetRemap + + +def _resolve_plugin_path(package_name, lib_name): + """Return the plugin .so path under <prefix>/lib/<package>, or '' if not found.""" + try: + prefix = get_package_prefix(package_name) + except PackageNotFoundError: + return '' + candidates = [ + os.path.join(prefix, 'lib', package_name, f'lib{lib_name}.so'), + os.path.join(prefix, 'lib', package_name, f'{lib_name}.so'), + ] + for path in candidates: + if os.path.isfile(path): + return path + return '' + + +def generate_launch_description(): + turtlebot3_gazebo_dir = get_package_share_directory('turtlebot3_gazebo') + ros_gz_sim_dir = get_package_share_directory('ros_gz_sim') + nav2_bringup_dir = get_package_share_directory('nav2_bringup') + demo_pkg_dir = get_package_share_directory('ota_nav2_sensor_fix_demo') + + nav2_params_file = os.path.join(demo_pkg_dir, 'config', 'nav2_params.yaml') + map_file = os.path.join(demo_pkg_dir, 'config', 'turtlebot3_world.yaml') + + # OTA plugin shipped via the gateway image's /etc/ros2_medkit/gateway_config.yaml. + # The plugin itself loads when gateway_node parses that params file - we just + # point the gateway at it via --ros-args --params-file below. 
+ gateway_config_file = os.environ.get( + 'OTA_DEMO_GATEWAY_CONFIG', + '/etc/ros2_medkit/gateway_config.yaml', + ) + + world_file = os.path.join(turtlebot3_gazebo_dir, 'worlds', 'turtlebot3_world.world') + + use_sim_time = LaunchConfiguration('use_sim_time', default='True') + headless = LaunchConfiguration('headless', default='True') + x_pose = LaunchConfiguration('x_pose', default='-2.0') + y_pose = LaunchConfiguration('y_pose', default='-0.5') + + set_gz_model_path = AppendEnvironmentVariable( + 'GZ_SIM_RESOURCE_PATH', + os.path.join(turtlebot3_gazebo_dir, 'models'), + ) + + # Gazebo headless server: -r runs the world, -s is server-only (no GUI), -v2 + # is warning-level logging. on_exit_shutdown ensures gz dying tears the launch down. + gz_headless = IncludeLaunchDescription( + PythonLaunchDescriptionSource( + os.path.join(ros_gz_sim_dir, 'launch', 'gz_sim.launch.py'), + ), + launch_arguments={ + 'gz_args': ['-r -s -v2 ', world_file], # launch joins list items with no separator + 'on_exit_shutdown': 'true', + }.items(), + condition=IfCondition(headless), + ) + + # GUI mode is left in for parity with the sibling TB3 demo (someone running + # this on a workstation can flip headless:=False); in containers we never + # take this branch. + gz_gui = IncludeLaunchDescription( + PythonLaunchDescriptionSource( + os.path.join(turtlebot3_gazebo_dir, 'launch', 'turtlebot3_world.launch.py'), + ), + launch_arguments={'use_sim_time': use_sim_time}.items(), + condition=UnlessCondition(headless), + ) + + # Spawn the robot - SetRemap inside the GroupAction pushes the gazebo + # bridge's /scan onto /scan_sim so broken_lidar owns /scan exclusively. 
+ spawn_robot = GroupAction( + actions=[ + SetRemap(src='/scan', dst='/scan_sim'), + IncludeLaunchDescription( + PythonLaunchDescriptionSource( + os.path.join(turtlebot3_gazebo_dir, 'launch', 'spawn_turtlebot3.launch.py'), + ), + launch_arguments={'x_pose': x_pose, 'y_pose': y_pose}.items(), + ), + ], + condition=IfCondition(headless), + ) + + robot_state_publisher = IncludeLaunchDescription( + PythonLaunchDescriptionSource( + os.path.join(turtlebot3_gazebo_dir, 'launch', 'robot_state_publisher.launch.py'), + ), + launch_arguments={'use_sim_time': use_sim_time}.items(), + condition=IfCondition(headless), + ) + + # use_composition=False forces nav2 to launch each lifecycle node as its + # own process instead of co-loading them into component_container_isolated. + # The Jazzy apt build of nav2_msgs has an ABI mismatch against the + # fastcdr 2.2.5 currently shipping with ros-jazzy-fastcdr (missing + # eprosima::fastcdr::Cdr::serialize(unsigned int)) which immediately kills + # the container at composition time. Per-node mode dodges that crash. 
+ nav2 = IncludeLaunchDescription( + PythonLaunchDescriptionSource( + os.path.join(nav2_bringup_dir, 'launch', 'bringup_launch.py'), + ), + launch_arguments={ + 'map': map_file, + 'params_file': nav2_params_file, + 'use_sim_time': use_sim_time, + 'autostart': 'True', + 'use_composition': 'False', + }.items(), + ) + + fault_manager = Node( + package='ros2_medkit_fault_manager', + executable='fault_manager_node', + name='fault_manager', + namespace='', + output='screen', + parameters=[{'use_sim_time': use_sim_time}], + ) + + foxglove = Node( + package='foxglove_bridge', + executable='foxglove_bridge', + name='foxglove_bridge', + output='screen', + parameters=[ + {'port': 8765}, + {'address': '0.0.0.0'}, + {'use_sim_time': use_sim_time}, + ], + ) + + broken_lidar_node = Node( + package='broken_lidar', + executable='broken_lidar_node', + name='scan_sensor_node', + output='screen', + parameters=[{'use_sim_time': use_sim_time}], + ) + + broken_lidar_legacy = Node( + package='broken_lidar_legacy', + executable='broken_lidar_legacy', + name='broken_lidar_legacy', + output='screen', + parameters=[{'use_sim_time': use_sim_time}], + ) + + # Plugin overrides + node params come from gateway_config.yaml. The .so + # path is pinned absolutely there (/ws/install/...), so we don't need to + # resolve it via _resolve_plugin_path the way the TB3 demo does. 
+ _ = _resolve_plugin_path # kept for parity / future overrides + + gateway = Node( + package='ros2_medkit_gateway', + executable='gateway_node', + name='ros2_medkit_gateway', + output='screen', + parameters=[gateway_config_file], + arguments=['--ros-args', '--log-level', 'info'], + ) + + return LaunchDescription([ + DeclareLaunchArgument( + 'use_sim_time', default_value='True', + description='Use simulation (Gazebo) clock if true', + ), + DeclareLaunchArgument( + 'headless', default_value='True', + description='Run Gazebo without a GUI - default True for Docker/CI', + ), + DeclareLaunchArgument( + 'x_pose', default_value='-2.0', + description='Robot initial X position', + ), + DeclareLaunchArgument( + 'y_pose', default_value='-0.5', + description='Robot initial Y position', + ), + SetEnvironmentVariable( + name='TURTLEBOT3_MODEL', + value=os.environ.get('TURTLEBOT3_MODEL', 'burger'), + ), + set_gz_model_path, + gz_headless, + gz_gui, + spawn_robot, + robot_state_publisher, + nav2, + fault_manager, + foxglove, + broken_lidar_node, + broken_lidar_legacy, + gateway, + ]) diff --git a/demos/ota_nav2_sensor_fix/ros2_packages/ota_nav2_sensor_fix_demo/package.xml b/demos/ota_nav2_sensor_fix/ros2_packages/ota_nav2_sensor_fix_demo/package.xml new file mode 100644 index 0000000..04ed2e7 --- /dev/null +++ b/demos/ota_nav2_sensor_fix/ros2_packages/ota_nav2_sensor_fix_demo/package.xml @@ -0,0 +1,36 @@ + + + + ota_nav2_sensor_fix_demo + 0.1.0 + Launch + config glue for the OTA over SOVD nav2 sensor-fix demo (TurtleBot3 + Nav2 + headless Gazebo + broken_lidar/fixed_lidar swap). 
+ bburda + Apache-2.0 + + ament_cmake + + ros2launch + turtlebot3_gazebo + turtlebot3_description + turtlebot3_navigation2 + ros_gz_sim + ros_gz_bridge + nav2_bringup + nav2_amcl + nav2_bt_navigator + nav2_controller + nav2_planner + nav2_behaviors + nav2_costmap_2d + nav2_lifecycle_manager + nav2_map_server + foxglove_bridge + ros2_medkit_gateway + ros2_medkit_fault_manager + broken_lidar + broken_lidar_legacy + + + ament_cmake + + From 1dbabd4e3f994cc341db36f128f7384389048bd0 Mon Sep 17 00:00:00 2001 From: Bartosz Burda Date: Mon, 27 Apr 2026 21:40:23 +0200 Subject: [PATCH 38/52] docs(demos/ota): refresh README + run-demo for the TB3 / Nav2 / gz-sim bundle Both files predated the TB3+Nav2+headless-Gazebo integration: - run-demo.sh header described only broken_lidar / broken_lidar_legacy; now mentions the full self-contained simulator + foxglove_bridge :8765 - usage section was missing OTA_FOXGLOVE_BRIDGE_PORT - "Connect a UI" block now points at the Foxglove 3D panel narrative (TurtleBot3 + /scan cone) as the recommended path - README quickstart called out a ~10 minute first-run build; bumped to ~15-20 minutes to reflect the ~3 GB TB3+Nav2+gz-sim runtime - README port-override hint now lists both OTA_GATEWAY_PORT and OTA_FOXGLOVE_BRIDGE_PORT (the latter was used in code but undocumented) --- demos/ota_nav2_sensor_fix/README.md | 10 +++++++--- demos/ota_nav2_sensor_fix/run-demo.sh | 20 ++++++++++++-------- 2 files changed, 19 insertions(+), 11 deletions(-) diff --git a/demos/ota_nav2_sensor_fix/README.md b/demos/ota_nav2_sensor_fix/README.md index 95944ab..2686a56 100644 --- a/demos/ota_nav2_sensor_fix/README.md +++ b/demos/ota_nav2_sensor_fix/README.md @@ -26,8 +26,9 @@ update package metadata. ./run-demo.sh ``` -The first run pulls `ros:jazzy` and builds the gateway from source - takes -~10 minutes. Subsequent runs reuse the layer cache. 
+The first run pulls `ros:jazzy`, installs the TurtleBot3 + Nav2 + gz-sim +runtime (~3 GB) and builds the gateway from source - takes ~15-20 minutes +on a fresh cache. Subsequent runs reuse the layer cache. In another terminal, drive the demo: @@ -42,7 +43,10 @@ In another terminal, drive the demo: Each trigger script issues SOVD `PUT /updates/{id}/prepare` then `/execute` and prints the resulting status plus the live process list. -If host port 8080 is taken, override with `OTA_GATEWAY_PORT=8081 ./run-demo.sh`. +Port overrides (set as env vars before `./run-demo.sh`): + +- `OTA_GATEWAY_PORT` - gateway HTTP API (default `8080`) +- `OTA_FOXGLOVE_BRIDGE_PORT` - foxglove_bridge WebSocket (default `8765`) Tear down: `docker compose down`. diff --git a/demos/ota_nav2_sensor_fix/run-demo.sh b/demos/ota_nav2_sensor_fix/run-demo.sh index 0b45633..042d3bd 100755 --- a/demos/ota_nav2_sensor_fix/run-demo.sh +++ b/demos/ota_nav2_sensor_fix/run-demo.sh @@ -1,9 +1,11 @@ #!/bin/bash # OTA over SOVD - nav2 sensor-fix demo runner. # Brings up the gateway (with the dev-grade ota_update_plugin baked in) and -# the FastAPI artifact server. The gateway entrypoint also launches -# broken_lidar (publishes /scan with a phantom obstacle) and -# broken_lidar_legacy (uninstall target). +# the FastAPI artifact server. The gateway image bundles a full TurtleBot3 + +# Nav2 + headless Gazebo stack and runs foxglove_bridge on :8765, so the +# demo is self-contained: broken_lidar publishes /scan with a phantom +# obstacle that nav2 + a Foxglove 3D panel both react to. The OTA flow +# swaps broken_lidar -> fixed_lidar and the phantom disappears. 
set -eu @@ -26,7 +28,8 @@ usage() { echo " -h, --help Show this help message" echo "" echo "Environment:" - echo " OTA_GATEWAY_PORT Host port for gateway HTTP API (default: 8080)" + echo " OTA_GATEWAY_PORT Host port for gateway HTTP API (default: 8080)" + echo " OTA_FOXGLOVE_BRIDGE_PORT Host port for foxglove_bridge WebSocket (default: 8765)" echo "" echo "Examples:" echo " $0 # Daemon mode (default)" @@ -142,7 +145,8 @@ echo " Web UI (ros2_medkit_web_ui):" echo " npm install && npm run dev" echo " open http://localhost:5173 -> Connect -> ${GATEWAY_URL}" echo "" -echo " Foxglove Studio (ros2_medkit_foxglove_extension):" -echo " cd ros2_medkit_foxglove_extension && npm install && npm run local-install" -echo " Open Foxglove -> add panel 'ros2_medkit Updates'" -echo " Set baseUrl in panel settings to ${GATEWAY_URL}/api/v1" +echo " Foxglove Studio (recommended for the 3D narrative):" +echo " Open connection -> Foxglove WebSocket -> ws://localhost:${OTA_FOXGLOVE_BRIDGE_PORT:-8765}" +echo " Add a 3D panel: TurtleBot3 in the world, /scan cone shows the phantom" +echo " Install ros2_medkit_foxglove_extension (npm run local-install) for the" +echo " 'ros2_medkit Updates' panel; set baseUrl to ${GATEWAY_URL}/api/v1" From d82fc9419990bb6a35690663b77824a759315d18 Mon Sep 17 00:00:00 2001 From: Bartosz Burda Date: Mon, 27 Apr 2026 22:03:59 +0200 Subject: [PATCH 39/52] test(demos/ota): add Uninstall + SetRemap smoke checks; fix flaky log grep The smoke test was missing two regression guards and had a SIGPIPE bug that started biting now that the gateway image bakes in the nav2 stack (~500 lines of lifecycle output before the gateway logs anything). New checks: - /scan SetRemap regression: assert /scan has exactly one publisher and it is NOT ros_gz_bridge. 
The launch wraps spawn_turtlebot3 in a SetRemap('/scan' -> '/scan_sim') so broken_lidar (and later fixed_lidar) is the sole publisher on /scan; if that remap regresses both publishers stomp each other and nav2 sees garbage scans. - Uninstall flow: PUT /updates/broken_lidar_legacy_remove/prepare + /execute, assert status is completed and the broken_lidar_legacy process is gone. Closes the Update / Install / Uninstall trio - the demo's whole SOVD ISO 17978-3 compliance story. Bug fix: - `docker logs $C | grep -q PATTERN` is unsafe under `set -o pipefail`. When grep finds the match early it exits, SIGPIPEs `docker logs`, and the pipeline returns 141 - which `if` reads as "no match" even when the line is there. With the small pre-nav2 log this was lucky enough to almost always pass; with the full nav2 lifecycle dump it flips to consistent false-negative. Capture logs into a variable first. --- tests/smoke_test_ota.sh | 53 +++++++++++++++++++++++++++++++++++++++-- 1 file changed, 51 insertions(+), 2 deletions(-) diff --git a/tests/smoke_test_ota.sh b/tests/smoke_test_ota.sh index 1272c94..f93ffde 100755 --- a/tests/smoke_test_ota.sh +++ b/tests/smoke_test_ota.sh @@ -76,13 +76,19 @@ fi section "UpdateProvider plugin loaded" -if docker logs "$GATEWAY_CONTAINER" 2>&1 | grep -q "Update backend provided by plugin"; then +# Capture logs into a variable rather than piping `docker logs | grep -q`. +# With `set -o pipefail` and a large log (nav2 lifecycle prints ~500 lines), +# grep -q exits early on first match, SIGPIPEs docker logs, and the pipeline +# returns 141 - which `if` reads as "no match" even when the line was found. 
+GATEWAY_LOGS=$(docker logs "$GATEWAY_CONTAINER" 2>&1 || true) + +if grep -q "Update backend provided by plugin" <<<"$GATEWAY_LOGS"; then pass "gateway log says: 'Update backend provided by plugin'" else fail "gateway log says: 'Update backend provided by plugin'" "log line missing" fi -if docker logs "$GATEWAY_CONTAINER" 2>&1 | grep -q "Updates enabled but no UpdateProvider plugin loaded"; then +if grep -q "Updates enabled but no UpdateProvider plugin loaded" <<<"$GATEWAY_LOGS"; then fail "no 'no UpdateProvider' warning" "warning was logged" else pass "no 'no UpdateProvider' warning" @@ -158,6 +164,30 @@ section "Initial process state" assert_process_running "/lib/broken_lidar/broken_lidar_node" "broken_lidar_node running before update" assert_process_running "broken_lidar_legacy" "broken_lidar_legacy running before uninstall" +section "/scan SetRemap regression (only broken_lidar publishes, not gz-bridge)" + +# The launch wraps spawn_turtlebot3 in a SetRemap('/scan' -> '/scan_sim') so +# the gz-bridge ends up on /scan_sim, leaving broken_lidar as the sole +# publisher on /scan. If that remap regresses, both publishers stomp each +# other and nav2 sees garbage. Use ros2 topic info -v inside the container +# (host runner has no ROS install) and assert exactly one publisher whose +# node name is NOT ros_gz_bridge. +SCAN_INFO=$(docker exec "$GATEWAY_CONTAINER" bash -lc \ + 'source /opt/ros/jazzy/setup.bash && ros2 topic info /scan -v' 2>/dev/null || true) + +PUB_COUNT=$(printf '%s\n' "$SCAN_INFO" | grep -c "Endpoint type: PUBLISHER" || true) +if [ "$PUB_COUNT" = "1" ]; then + pass "/scan has exactly 1 publisher" +else + fail "/scan has exactly 1 publisher" "got ${PUB_COUNT} publishers - SetRemap regressed?" 
+fi + +if printf '%s\n' "$SCAN_INFO" | grep -q "ros_gz_bridge"; then + fail "/scan publisher is not ros_gz_bridge" "gz-bridge is publishing on /scan (SetRemap broken)" +else + pass "/scan publisher is not ros_gz_bridge" +fi + section "Update flow: PUT /updates/fixed_lidar_2_1_0/prepare + /execute" curl -fsS -X PUT -H 'Content-Type: application/json' -d '{}' \ @@ -188,3 +218,22 @@ curl -fsS -X PUT -H 'Content-Type: application/json' -d '{}' \ sleep 5 assert_process_running "obstacle_classifier_node" "obstacle_classifier_node spawned after install" + +section "Uninstall flow: PUT /updates/broken_lidar_legacy_remove/prepare + /execute" + +curl -fsS -X PUT -H 'Content-Type: application/json' -d '{}' \ + "${API_BASE}/updates/broken_lidar_legacy_remove/prepare" >/dev/null +sleep 4 +curl -fsS -X PUT -H 'Content-Type: application/json' -d '{}' \ + "${API_BASE}/updates/broken_lidar_legacy_remove/execute" >/dev/null +sleep 5 + +if api_get "/updates/broken_lidar_legacy_remove/status"; then + if echo "$RESPONSE" | jq -e '.status == "completed"' >/dev/null 2>&1; then + pass "broken_lidar_legacy_remove status is completed" + else + fail "broken_lidar_legacy_remove status is completed" "got $(echo "$RESPONSE" | jq -c .)" + fi +fi + +assert_process_gone "/lib/broken_lidar_legacy/broken_lidar_legacy" "broken_lidar_legacy killed after uninstall" From 5067ff9ed8ed0dbc478ae360c935bf4104ccdd02 Mon Sep 17 00:00:00 2001 From: Bartosz Burda Date: Tue, 28 Apr 2026 15:54:32 +0200 Subject: [PATCH 40/52] fix(demos/ota): drop runtime function auto-gen + pin gateway to logs-fix branch Two demo polish items surfaced while preparing the recording: 1. discovery.runtime.create_functions_from_namespaces is now false. Our nav2 / TB3 nodes don't share a meaningful namespace - they all live at root with a few /global_costmap, /local_costmap exceptions - so the gateway was synthesizing single-host "global_costmap" / "local_costmap" / "root" Functions that don't represent any logical capability. 
Without a manifest those entries were noise; the tree now hides the Functions section entirely until something real exists to list. 2. Pin the gateway image to fix/component-logs-aggregation while the upstream PR lands. The per-component Logs tab in the Foxglove extension was always empty because the synthetic component prefix-match returned zero items for components without a manifest fqn. The branch makes COMPONENT log queries aggregate from hosted apps' fqns, parity with the existing AREA / FUNCTION handlers. --- demos/ota_nav2_sensor_fix/Dockerfile.gateway | 6 +++++- demos/ota_nav2_sensor_fix/gateway_config.yaml | 9 +++++++++ 2 files changed, 14 insertions(+), 1 deletion(-) diff --git a/demos/ota_nav2_sensor_fix/Dockerfile.gateway b/demos/ota_nav2_sensor_fix/Dockerfile.gateway index 2375db4..5a366cd 100644 --- a/demos/ota_nav2_sensor_fix/Dockerfile.gateway +++ b/demos/ota_nav2_sensor_fix/Dockerfile.gateway @@ -15,7 +15,11 @@ FROM ros:jazzy AS builder ARG GATEWAY_REPO=https://github.com/selfpatch/ros2_medkit.git -ARG GATEWAY_REF=main +# Pin to the component-logs aggregation fix branch until the upstream PR +# lands on main; without it the per-component Logs tab in the Foxglove +# extension shows zero entries because the synthetic component prefix +# match returns empty for fqn-less components. +ARG GATEWAY_REF=fix/component-logs-aggregation RUN apt-get update && apt-get install -y --no-install-recommends \ git \ diff --git a/demos/ota_nav2_sensor_fix/gateway_config.yaml b/demos/ota_nav2_sensor_fix/gateway_config.yaml index 3cbda32..1452173 100644 --- a/demos/ota_nav2_sensor_fix/gateway_config.yaml +++ b/demos/ota_nav2_sensor_fix/gateway_config.yaml @@ -26,6 +26,15 @@ ros2_medkit_gateway: discovery: mode: "runtime_only" + runtime: + # The demo's nav2 nodes don't share a meaningful namespace - they + # all live at root with a few `/global_costmap`, `/local_costmap` + # exceptions. 
Letting the gateway auto-synthesize Functions from + # those namespaces produces single-host "global_costmap" / + # "local_costmap" / "root" Functions that don't represent any + # logical capability. Disable the auto-gen here; without a + # manifest the Functions tree section just stays empty. + create_functions_from_namespaces: false # Enable /updates endpoints; provider supplied by ota_update_plugin below. updates: From 08b0c21e77b0ee33df0840a2d5e5ef66adcdb77d Mon Sep 17 00:00:00 2001 From: Bartosz Burda Date: Tue, 28 Apr 2026 16:21:26 +0200 Subject: [PATCH 41/52] feat(demos/ota): add SOVD manifest with logical functions The previous config disabled function auto-generation but the result was zero Functions - which made the tree's Functions section empty rather than meaningful. Add a manifest defining areas, components, apps, and five logical functions: - Autonomous Navigation (bt_navigator + planner + controller + smoother + behaviors + waypoint + velocity smoother + collision monitor + docking + costmaps + lifecycle manager) - Localization (amcl + map server + lifecycle manager) - Perception (scan_sensor_node + ros_gz_bridge - the OTA target) - Fleet Diagnostics (gateway + fault manager) - Live Telemetry (foxglove bridge + robot state publisher) These are the capabilities the demo narrative pivots on: an operator viewing the tree sees "Perception is broken" or "Autonomous Navigation is degraded" rather than scrolling through 27 individual nodes. Switches discovery to hybrid mode so manifest entities + runtime discovery cooperate. unmanifested_nodes: warn + manifest_strict_validation: false tolerate the OTA-driven runtime changes (broken_lidar_legacy disappears on uninstall, obstacle_classifier_v2 appears on install) without manifest reconciliation churn. create_synthetic_components and create_functions_from_namespaces both stay off - the manifest is the source of truth. 
--- demos/ota_nav2_sensor_fix/Dockerfile.gateway | 1 + demos/ota_nav2_sensor_fix/gateway_config.yaml | 19 +- demos/ota_nav2_sensor_fix/manifest.yaml | 323 ++++++++++++++++++ 3 files changed, 335 insertions(+), 8 deletions(-) create mode 100644 demos/ota_nav2_sensor_fix/manifest.yaml diff --git a/demos/ota_nav2_sensor_fix/Dockerfile.gateway b/demos/ota_nav2_sensor_fix/Dockerfile.gateway index 5a366cd..356fb7f 100644 --- a/demos/ota_nav2_sensor_fix/Dockerfile.gateway +++ b/demos/ota_nav2_sensor_fix/Dockerfile.gateway @@ -93,6 +93,7 @@ RUN apt-get update && apt-get install -y --no-install-recommends \ COPY --from=builder /ws/install /ws/install COPY gateway_config.yaml /etc/ros2_medkit/gateway_config.yaml +COPY manifest.yaml /etc/ros2_medkit/manifest.yaml COPY entrypoint.sh /usr/local/bin/entrypoint.sh RUN chmod +x /usr/local/bin/entrypoint.sh diff --git a/demos/ota_nav2_sensor_fix/gateway_config.yaml b/demos/ota_nav2_sensor_fix/gateway_config.yaml index 1452173..14318d2 100644 --- a/demos/ota_nav2_sensor_fix/gateway_config.yaml +++ b/demos/ota_nav2_sensor_fix/gateway_config.yaml @@ -25,15 +25,18 @@ ros2_medkit_gateway: max_age_seconds: 86400 discovery: - mode: "runtime_only" + # Hybrid: manifest defines areas/components/apps/functions, runtime + # fills in topics/services/params and surfaces OTA-installed nodes + # (e.g. obstacle_classifier_v2 after trigger-install.sh) without + # needing a manifest entry for them. + mode: "hybrid" + manifest_path: "/etc/ros2_medkit/manifest.yaml" + manifest_strict_validation: false runtime: - # The demo's nav2 nodes don't share a meaningful namespace - they - # all live at root with a few `/global_costmap`, `/local_costmap` - # exceptions. Letting the gateway auto-synthesize Functions from - # those namespaces produces single-host "global_costmap" / - # "local_costmap" / "root" Functions that don't represent any - # logical capability. Disable the auto-gen here; without a - # manifest the Functions tree section just stays empty. 
+ # Manifest defines components, no need for synthetic ones. + create_synthetic_components: false + # Manifest defines functions; the auto-gen-from-namespaces path + # produces single-host noise because the nav2 stack lives at /. create_functions_from_namespaces: false # Enable /updates endpoints; provider supplied by ota_update_plugin below. diff --git a/demos/ota_nav2_sensor_fix/manifest.yaml b/demos/ota_nav2_sensor_fix/manifest.yaml new file mode 100644 index 0000000..3fc3692 --- /dev/null +++ b/demos/ota_nav2_sensor_fix/manifest.yaml @@ -0,0 +1,323 @@ +# Copyright 2026 bburda. Apache-2.0. +# +# SOVD manifest for the OTA over SOVD nav2 sensor-fix demo. +# +# The demo runs in hybrid discovery mode: this manifest defines areas / +# components / apps / functions, runtime discovery fills in topic +# bindings, and OTA-installed nodes (e.g. obstacle_classifier_v2 after +# trigger-install.sh) appear automatically without needing a manifest +# entry. +# +# The point of the manifest is to make the Functions tree meaningful: +# without it the gateway either auto-synthesizes one Function per +# namespace (mostly single-host noise on this stack, where everything +# lives at root) or leaves it empty. Here we group nodes by capability +# so an operator viewing the tree sees Navigation / Localization / +# Perception / Diagnostics / Visualization, not opaque hashes. + +manifest_version: "1.0" + +metadata: + name: "ota-nav2-sensor-fix" + description: "OTA over SOVD demo - TurtleBot3 + Nav2 + headless Gazebo + ros2_medkit gateway" + version: "0.1.0" + +config: + # broken_lidar_legacy gets uninstalled by the OTA flow; obstacle_classifier + # gets installed at runtime. Both are unmanifested at boot - tolerate. + unmanifested_nodes: warn + # Pull in topics, services, params from the running graph for entities + # the manifest declares. 
+ inherit_runtime_resources: true + +# ============================================================================= +# AREAS +# ============================================================================= +areas: + - id: robot + name: "Robot" + description: "TurtleBot3 platform, sensors, and TF infrastructure" + namespace: / + + - id: navigation + name: "Navigation" + description: "Nav2 motion / planning / control stack" + namespace: / + + - id: diagnostics + name: "Diagnostics" + description: "ros2_medkit gateway, fault manager, OTA" + namespace: / + + - id: visualization + name: "Visualization" + description: "Foxglove bridge and simulator-ROS bridge" + namespace: / + +# ============================================================================= +# COMPONENTS - logical groupings the apps live on +# ============================================================================= +components: + - id: lidar-sensor + name: "LiDAR Sensor" + type: "sensor" + description: "2D LiDAR (broken_lidar / fixed_lidar after OTA swap)" + area: robot + + - id: robot-base + name: "TurtleBot3 Base" + type: "platform" + description: "Robot platform - URDF publisher and gz bridge" + area: robot + + - id: nav2-motion + name: "Nav2 Motion" + type: "controller" + description: "BT navigator, planner, controller, behaviors, costmaps" + area: navigation + + - id: nav2-localization + name: "Nav2 Localization" + type: "controller" + description: "AMCL + map server + localization lifecycle manager" + area: navigation + + - id: medkit-gateway-unit + name: "ros2_medkit Gateway" + type: "controller" + description: "SOVD HTTP gateway with the OTA update plugin loaded" + area: diagnostics + + - id: fault-manager-unit + name: "Fault Manager" + type: "controller" + description: "Fault aggregation backend behind /faults" + area: diagnostics + + - id: foxglove-unit + name: "Foxglove Bridge" + type: "controller" + description: "WebSocket bridge for Foxglove Studio panels" + area: visualization + +# 
============================================================================= +# APPS - one per ROS 2 node we care about (skip nav2-internal _rclcpp_node +# helpers and transform_listener_impl_* - they're plumbing, not user-facing) +# ============================================================================= +apps: + # ── LiDAR / perception ──────────────────────────────────────────── + - id: scan-sensor + name: "Scan Sensor" + category: "sensor" + is_located_on: lidar-sensor + description: "LaserScan publisher (broken_lidar pre-OTA, fixed_lidar post-OTA)" + ros_binding: { node_name: scan_sensor_node, namespace: / } + + - id: ros-gz-bridge + name: "ROS-Gazebo Bridge" + category: "simulation" + is_located_on: lidar-sensor + description: "Bridges /scan_sim and /clock from gz-sim" + ros_binding: { node_name: ros_gz_bridge, namespace: / } + + # ── Robot platform ──────────────────────────────────────────────── + - id: robot-state-publisher + name: "Robot State Publisher" + category: "platform" + is_located_on: robot-base + description: "Publishes the robot URDF TF tree" + ros_binding: { node_name: robot_state_publisher, namespace: / } + + # ── Nav2 motion ─────────────────────────────────────────────────── + - id: bt-navigator + name: "BT Navigator" + category: "navigation" + is_located_on: nav2-motion + description: "Behavior Tree navigator - hosts navigate_to_pose action" + ros_binding: { node_name: bt_navigator, namespace: / } + + - id: planner-server + name: "Planner Server" + category: "navigation" + is_located_on: nav2-motion + description: "Global path planner" + ros_binding: { node_name: planner_server, namespace: / } + + - id: controller-server + name: "Controller Server" + category: "navigation" + is_located_on: nav2-motion + description: "Local path follower" + ros_binding: { node_name: controller_server, namespace: / } + + - id: smoother-server + name: "Smoother Server" + category: "navigation" + is_located_on: nav2-motion + description: "Path smoothing" 
+ ros_binding: { node_name: smoother_server, namespace: / } + + - id: route-server + name: "Route Server" + category: "navigation" + is_located_on: nav2-motion + description: "Route planning" + ros_binding: { node_name: route_server, namespace: / } + + - id: behavior-server + name: "Behavior Server" + category: "navigation" + is_located_on: nav2-motion + description: "Recovery behaviors" + ros_binding: { node_name: behavior_server, namespace: / } + + - id: waypoint-follower + name: "Waypoint Follower" + category: "navigation" + is_located_on: nav2-motion + description: "Sequenced waypoint navigation" + ros_binding: { node_name: waypoint_follower, namespace: / } + + - id: velocity-smoother + name: "Velocity Smoother" + category: "navigation" + is_located_on: nav2-motion + description: "/cmd_vel smoothing" + ros_binding: { node_name: velocity_smoother, namespace: / } + + - id: collision-monitor + name: "Collision Monitor" + category: "navigation" + is_located_on: nav2-motion + description: "Emergency stop on imminent collision" + ros_binding: { node_name: collision_monitor, namespace: / } + + - id: docking-server + name: "Docking Server" + category: "navigation" + is_located_on: nav2-motion + description: "Approach + dock action" + ros_binding: { node_name: docking_server, namespace: / } + + - id: global-costmap + name: "Global Costmap" + category: "navigation" + is_located_on: nav2-motion + description: "Static + obstacle costmap for planning" + ros_binding: { node_name: global_costmap, namespace: /global_costmap } + + - id: local-costmap + name: "Local Costmap" + category: "navigation" + is_located_on: nav2-motion + description: "Local rolling costmap for control" + ros_binding: { node_name: local_costmap, namespace: /local_costmap } + + - id: lifecycle-manager-navigation + name: "Lifecycle Manager (Navigation)" + category: "navigation" + is_located_on: nav2-motion + description: "Nav2 motion lifecycle orchestration" + ros_binding: { node_name: 
lifecycle_manager_navigation, namespace: / } + + # ── Nav2 localization ───────────────────────────────────────────── + - id: amcl + name: "AMCL" + category: "localization" + is_located_on: nav2-localization + description: "Adaptive Monte Carlo Localization" + ros_binding: { node_name: amcl, namespace: / } + + - id: map-server + name: "Map Server" + category: "localization" + is_located_on: nav2-localization + description: "Static map publisher" + ros_binding: { node_name: map_server, namespace: / } + + - id: lifecycle-manager-localization + name: "Lifecycle Manager (Localization)" + category: "localization" + is_located_on: nav2-localization + description: "Localization lifecycle orchestration" + ros_binding: { node_name: lifecycle_manager_localization, namespace: / } + + # ── Diagnostics ─────────────────────────────────────────────────── + - id: medkit-gateway + name: "ros2_medkit Gateway" + category: "gateway" + is_located_on: medkit-gateway-unit + description: "SOVD REST gateway, hosts the OTA plugin" + ros_binding: { node_name: ros2_medkit_gateway, namespace: / } + + - id: medkit-fault-manager + name: "Fault Manager" + category: "diagnostics" + is_located_on: fault-manager-unit + description: "Fault aggregation + storage" + ros_binding: { node_name: fault_manager, namespace: / } + + # ── Visualization ───────────────────────────────────────────────── + - id: foxglove-bridge + name: "Foxglove Bridge" + category: "visualization" + is_located_on: foxglove-unit + description: "WebSocket bridge on :8765" + ros_binding: { node_name: foxglove_bridge, namespace: / } + +# ============================================================================= +# FUNCTIONS - what the user actually selects in the tree to ask +# "is this capability working?" 
+# ============================================================================= +functions: + - id: autonomous-navigation + name: "Autonomous Navigation" + category: "mobility" + description: "Plan + drive to a goal pose - the headline OTA demo capability" + hosted_by: + - bt-navigator + - planner-server + - controller-server + - smoother-server + - route-server + - behavior-server + - waypoint-follower + - velocity-smoother + - collision-monitor + - docking-server + - global-costmap + - local-costmap + - lifecycle-manager-navigation + + - id: localization + name: "Localization" + category: "mobility" + description: "Where is the robot in the map - AMCL + map server" + hosted_by: + - amcl + - map-server + - lifecycle-manager-localization + + - id: perception + name: "Perception" + category: "sensing" + description: "LaserScan stream feeding nav2 - the OTA target" + hosted_by: + - scan-sensor + - ros-gz-bridge + + - id: fleet-diagnostics + name: "Fleet Diagnostics" + category: "diagnostics" + description: "SOVD REST surface + fault aggregation - this panel" + hosted_by: + - medkit-gateway + - medkit-fault-manager + + - id: live-telemetry + name: "Live Telemetry" + category: "observability" + description: "Foxglove bridge + URDF publisher feeding the 3D panel" + hosted_by: + - foxglove-bridge + - robot-state-publisher From dab79c9a838456315990c9e5bbdd3196b367687e Mon Sep 17 00:00:00 2001 From: Bartosz Burda Date: Wed, 29 Apr 2026 09:24:52 +0200 Subject: [PATCH 42/52] feat(demos/ota): reactive fault narrative + reproducible artefact build Round of work to make the OTA demo's fault story actually visible AND to lock the build/test path against regression. Reactive SCAN_PHANTOM_RETURN fault: - broken_lidar subscribes /cmd_vel (Twist + TwistStamped, since Nav2 Jazzy uses the stamped variant) and only reports the fault while the controller is actively driving. 
Empty dashboard at boot, lights up the moment the operator publishes /goal_pose, stays active through recovery behaviors. The phantom itself is now a 21-ray, 0.5 m wedge centered on index 180 - one ray at 1 m the planner just routes around because the costmap raytraces it away. - fixed_lidar fires EVENT_PASSED at 500 ms (4x the broken_lidar tick) so the FaultManager debounce counter overtakes whatever broken_lidar accumulated, no matter how long the operator was driving before the OTA. Stops being load-bearing once the fault flips to HEALED. - demo.launch.py sets fault_manager to in-memory storage (clean dashboard at every boot - SQLite was persisting last session's CONFIRMED entry) and turns on healing_enabled with threshold 2. Reproducible artefact build: - ota_update_server/Dockerfile is now multi-stage. Stage 1 (ros:jazzy) clones ros2_medkit at the same ref the gateway image pins, builds ros2_medkit_msgs, builds fixed_lidar + obstacle_classifier_v2 from the demo's ros2_packages/, and runs pack_artifact.py to produce tarballs + catalog.json. Stage 2 ships the slim FastAPI server + the tarballs via COPY --from. `docker compose build` is now the reproducible path - no "build artefacts on host first" step. - CI: dropped the separate "Build artifacts inside ros:jazzy" step; `docker compose up -d --build` does it atomically. - scripts/build_artifacts.sh stays as an opt-in dev convenience. It no longer silently bootstraps msgs - if the env doesn't have it on the prefix path it errors out with instructions to either source an overlay or use `docker compose build`. Less hidden state. Demo narrative regression test: - tests/smoke_test_demo_narrative.sh exercises the full beat: publish /goal_pose, assert SCAN_PHANTOM_RETURN appears (reactive proof - if the fault appears, broken_lidar's /cmd_vel subscription saw motion, so we don't need a brittle `ros2 topic echo` snapshot), trigger OTA prepare+execute, assert process flip, assert fault clears, assert /cmd_vel settles. 
CI gets a new ota-demo-narrative job that runs this in isolation from the API-only smoke check. --- .github/workflows/ci.yml | 61 ++++- .../ota_update_server/Dockerfile | 99 ++++++- .../ros2_packages/broken_lidar/CMakeLists.txt | 4 +- .../ros2_packages/broken_lidar/package.xml | 2 + .../broken_lidar/src/broken_lidar_node.cpp | 118 +++++++- .../ros2_packages/fixed_lidar/CMakeLists.txt | 3 +- .../ros2_packages/fixed_lidar/package.xml | 1 + .../fixed_lidar/src/fixed_lidar_node.cpp | 67 +++++ .../launch/demo.launch.py | 22 +- .../scripts/build_artifacts.sh | 28 ++ tests/smoke_test_demo_narrative.sh | 254 ++++++++++++++++++ 11 files changed, 635 insertions(+), 24 deletions(-) create mode 100755 tests/smoke_test_demo_narrative.sh diff --git a/.github/workflows/ci.yml b/.github/workflows/ci.yml index 0432951..d9312b7 100644 --- a/.github/workflows/ci.yml +++ b/.github/workflows/ci.yml @@ -184,6 +184,55 @@ jobs: - name: Checkout repository uses: actions/checkout@v4 + - name: Build and start OTA demo + working-directory: demos/ota_nav2_sensor_fix + run: docker compose up -d --build + + - name: Run smoke tests + run: ./tests/smoke_test_ota.sh + + - name: Show gateway logs on failure + if: failure() + working-directory: demos/ota_nav2_sensor_fix + run: docker compose logs gateway --tail=200 + + - name: Show update server logs on failure + if: failure() + working-directory: demos/ota_nav2_sensor_fix + run: docker compose logs ota_update_server --tail=200 + + - name: Teardown + if: always() + working-directory: demos/ota_nav2_sensor_fix + run: docker compose down + + # Separate job from build-and-test-ota: this one publishes /goal_pose, + # waits for the controller to actually try to drive, asserts the + # reactive SCAN_PHANTOM_RETURN fault appears, and only then runs the + # OTA swap. Catches regressions in the demo narrative (broken_lidar + # subscribing /cmd_vel, fault_manager debounce, fixed_lidar not + # reporting). 
Slower than the API-only smoke job because it has to + # wait for nav2 lifecycle to settle and for /cmd_vel to actually fire, + # so it's split out and can fail in isolation without blocking the + # quick OTA-endpoint check. + ota-demo-narrative: + needs: lint + runs-on: ubuntu-24.04 + steps: + - name: Show triggering source + if: github.event_name == 'repository_dispatch' + run: | + SHA="${{ github.event.client_payload.sha }}" + RUN_URL="${{ github.event.client_payload.run_url }}" + echo "## Triggered by ros2_medkit" >> "$GITHUB_STEP_SUMMARY" + echo "- Commit: \`${SHA:-unknown}\`" >> "$GITHUB_STEP_SUMMARY" + if [ -n "$RUN_URL" ]; then + echo "- Run: [View triggering run]($RUN_URL)" >> "$GITHUB_STEP_SUMMARY" + fi + + - name: Checkout repository + uses: actions/checkout@v4 + - name: Build artifacts (catalog + tarballs) inside ros:jazzy working-directory: demos/ota_nav2_sensor_fix run: | @@ -211,25 +260,19 @@ jobs: cd .. ./scripts/build_artifacts.sh ' - # Restore ownership of files the container created as root. sudo chown -R "$USER:$USER" . 
- name: Build and start OTA demo working-directory: demos/ota_nav2_sensor_fix run: docker compose up -d --build - - name: Run smoke tests - run: ./tests/smoke_test_ota.sh + - name: Run demo narrative smoke + run: ./tests/smoke_test_demo_narrative.sh - name: Show gateway logs on failure if: failure() working-directory: demos/ota_nav2_sensor_fix - run: docker compose logs gateway --tail=200 - - - name: Show update server logs on failure - if: failure() - working-directory: demos/ota_nav2_sensor_fix - run: docker compose logs ota_update_server --tail=200 + run: docker compose logs gateway --tail=300 - name: Teardown if: always() diff --git a/demos/ota_nav2_sensor_fix/ota_update_server/Dockerfile b/demos/ota_nav2_sensor_fix/ota_update_server/Dockerfile index 7bc0599..541c46e 100644 --- a/demos/ota_nav2_sensor_fix/ota_update_server/Dockerfile +++ b/demos/ota_nav2_sensor_fix/ota_update_server/Dockerfile @@ -1,21 +1,100 @@ -FROM python:3.11-slim +# Copyright 2026 bburda +# Apache 2.0 +# +# Multi-stage build: stage 1 produces the SOVD catalog + tarballs from +# the demo's ros2_packages/ + the gateway repo (for ros2_medkit_msgs); +# stage 2 ships a slim FastAPI server that serves those artefacts. This +# keeps `docker compose build` self-contained and reproducible on CI - +# no separate "build artifacts on host" step required. +# +# Build context is the demo root (so we can pull in ros2_packages/ + +# scripts/ + ota_update_server/). docker-compose.yml wires that up. 
+ +# ============================================================================= +# Stage 1: build artefacts + catalog inside ros:jazzy +# ============================================================================= +FROM ros:jazzy AS artefact_builder + +ARG GATEWAY_REPO=https://github.com/selfpatch/ros2_medkit.git +ARG GATEWAY_REF=fix/component-logs-aggregation + +RUN apt-get update && apt-get install -y --no-install-recommends \ + git \ + python3-colcon-common-extensions \ + python3-catkin-pkg \ + python3-venv \ + python3-pip \ + build-essential \ + cmake \ + ros-jazzy-rclcpp \ + ros-jazzy-sensor-msgs \ + ros-jazzy-geometry-msgs \ + ros-jazzy-visualization-msgs \ + && rm -rf /var/lib/apt/lists/* + +# ros2_medkit_msgs is the SOVD ReportFault service def - cloned at the +# same ref the gateway image pins so the message ABI stays in lockstep. +RUN git clone --depth=1 --branch ${GATEWAY_REF} ${GATEWAY_REPO} /tmp/ros2_medkit +RUN . /opt/ros/jazzy/setup.sh && \ + cd /tmp/ros2_medkit && \ + colcon build --packages-select ros2_medkit_msgs -# Build context expected to be the demo root (so we can pull in artifacts/) -# rather than ota_update_server/. Compose wires this up. +# Now bring in the demo's source packages + pack_artifact.py. +WORKDIR /demo +COPY ros2_packages /demo/ros2_packages +COPY scripts /demo/scripts + +# Build fixed_lidar + obstacle_classifier_v2 (the only artefacts that +# need a binary - broken_lidar_legacy is uninstall-only). Tarballs + +# catalog.json land in /demo/artifacts. +RUN mkdir -p /demo/ros2_ws/src && \ + for pkg in fixed_lidar obstacle_classifier_v2; do \ + ln -sfn /demo/ros2_packages/$pkg /demo/ros2_ws/src/$pkg; \ + done && \ + . /opt/ros/jazzy/setup.sh && \ + . /tmp/ros2_medkit/install/setup.sh && \ + cd /demo/ros2_ws && \ + colcon build --packages-select fixed_lidar obstacle_classifier_v2 + +# pack_artifact.py uses pure stdlib (json/tarfile/argparse) so we don't +# need the .venv/pytest dance build_artifacts.sh does locally. 
+RUN mkdir -p /demo/artifacts && \ + rm -f /demo/artifacts/catalog.json && \ + PACK="python3 /demo/scripts/pack_artifact.py" && \ + $PACK --package fixed_lidar --version 2.1.0 \ + --kind update --target-component scan_sensor_node \ + --executable fixed_lidar_node \ + --replaces-executable broken_lidar_node \ + --notes "Fix /scan noise filter" \ + --skip-build --workspace /demo/ros2_ws \ + --out-dir /demo/artifacts --catalog /demo/artifacts/catalog.json && \ + $PACK --package obstacle_classifier_v2 --version 1.0.0 \ + --kind install --target-component obstacle_classifier \ + --executable obstacle_classifier_node \ + --notes "Extra safety layer for nav2" \ + --skip-build --workspace /demo/ros2_ws \ + --out-dir /demo/artifacts --catalog /demo/artifacts/catalog.json && \ + $PACK --package broken_lidar_legacy --version "" \ + --kind uninstall --target-component broken_lidar_legacy \ + --notes "Clean up deprecated package" \ + --skip-build --workspace /demo/ros2_ws \ + --out-dir /demo/artifacts --catalog /demo/artifacts/catalog.json + +# ============================================================================= +# Stage 2: slim runtime image +# ============================================================================= +FROM python:3.11-slim WORKDIR /app COPY ota_update_server/pyproject.toml ./ COPY ota_update_server/ota_update_server ./ota_update_server RUN pip install --no-cache-dir . -# Bake the demo catalog + tarballs into the image so the container is -# self-contained. Bind-mounting artifacts/ at runtime is unreliable on -# WSL2 + Docker Desktop, so we ship them in the image instead. 
-COPY artifacts /artifacts +COPY --from=artefact_builder /demo/artifacts /artifacts -ENV OTA_ARTIFACTS_DIR=/artifacts -ENV OTA_HOST=0.0.0.0 -ENV OTA_PORT=9000 +ENV OTA_ARTIFACTS_DIR=/artifacts \ + OTA_HOST=0.0.0.0 \ + OTA_PORT=9000 EXPOSE 9000 CMD ["ota-update-server"] diff --git a/demos/ota_nav2_sensor_fix/ros2_packages/broken_lidar/CMakeLists.txt b/demos/ota_nav2_sensor_fix/ros2_packages/broken_lidar/CMakeLists.txt index 9ecd54e..611bed7 100644 --- a/demos/ota_nav2_sensor_fix/ros2_packages/broken_lidar/CMakeLists.txt +++ b/demos/ota_nav2_sensor_fix/ros2_packages/broken_lidar/CMakeLists.txt @@ -12,9 +12,11 @@ endif() find_package(ament_cmake REQUIRED) find_package(rclcpp REQUIRED) find_package(sensor_msgs REQUIRED) +find_package(geometry_msgs REQUIRED) +find_package(ros2_medkit_msgs REQUIRED) add_executable(broken_lidar_node src/broken_lidar_node.cpp) -ament_target_dependencies(broken_lidar_node rclcpp sensor_msgs) +ament_target_dependencies(broken_lidar_node rclcpp sensor_msgs geometry_msgs ros2_medkit_msgs) install(TARGETS broken_lidar_node DESTINATION lib/${PROJECT_NAME}) diff --git a/demos/ota_nav2_sensor_fix/ros2_packages/broken_lidar/package.xml b/demos/ota_nav2_sensor_fix/ros2_packages/broken_lidar/package.xml index 1b4ab49..aa5d569 100644 --- a/demos/ota_nav2_sensor_fix/ros2_packages/broken_lidar/package.xml +++ b/demos/ota_nav2_sensor_fix/ros2_packages/broken_lidar/package.xml @@ -10,6 +10,8 @@ rclcpp sensor_msgs + geometry_msgs + ros2_medkit_msgs ament_cmake diff --git a/demos/ota_nav2_sensor_fix/ros2_packages/broken_lidar/src/broken_lidar_node.cpp b/demos/ota_nav2_sensor_fix/ros2_packages/broken_lidar/src/broken_lidar_node.cpp index 856f760..5b0368d 100644 --- a/demos/ota_nav2_sensor_fix/ros2_packages/broken_lidar/src/broken_lidar_node.cpp +++ b/demos/ota_nav2_sensor_fix/ros2_packages/broken_lidar/src/broken_lidar_node.cpp @@ -1,18 +1,66 @@ // Copyright 2026 bburda. Apache-2.0. 
+// +// Pre-OTA scan publisher: emits a 360-ray LaserScan at 10 Hz with a +// phantom 0.5 m wedge (21 rays) centered on index 180 (straight ahead). Nav2's local +// costmap traces it as an obstacle and the planner refuses to drive +// through. +// +// Reactive fault reporting (the demo narrative pivots on this beat): +// the node subscribes to /cmd_vel. While the controller is actively +// commanding non-zero motion - i.e. the operator just sent a goal and +// nav2 is trying to drive - we raise SCAN_PHANTOM_RETURN against +// scan_sensor_node every 2 s. After 10 s of idle /cmd_vel we emit +// report_passed and the fault clears from the dashboard. +// +// Result on screen: the Faults Dashboard is empty when the robot is +// idle, lights up the moment the operator publishes /goal_pose and +// the controller starts spinning, and clears once they stop driving +// or the OTA swap installs fixed_lidar (which only ever sends +// EVENT_PASSED reports). + #include #include #include +#include #include #include +#include +#include +#include using std::chrono_literals::operator""ms; +using std::chrono_literals::operator""s; class BrokenLidarNode : public rclcpp::Node { public: BrokenLidarNode() : Node("scan_sensor_node") { pub_ = create_publisher("scan", 10); timer_ = create_wall_timer(100ms, [this]() { publish_scan(); }); + + // Nav2 Jazzy publishes /cmd_vel as TwistStamped; older stacks (and + // teleop) still use plain Twist. Subscribe to both so the reactive + // fault works regardless of which side is driving. 
+ constexpr double kThresh = 0.01; + auto handle_motion = [this](double linear_x, double angular_z) { + if (std::fabs(linear_x) > kThresh || std::fabs(angular_z) > kThresh) { + last_motion_command_ = now(); + } + }; + cmd_vel_sub_ = create_subscription( + "cmd_vel", 10, + [handle_motion](const geometry_msgs::msg::Twist::SharedPtr msg) { + handle_motion(msg->linear.x, msg->angular.z); + }); + cmd_vel_stamped_sub_ = create_subscription( + "cmd_vel", 10, + [handle_motion](const geometry_msgs::msg::TwistStamped::SharedPtr msg) { + handle_motion(msg->twist.linear.x, msg->twist.angular.z); + }); + + fault_client_ = create_client( + "/fault_manager/report_fault"); + fault_timer_ = create_wall_timer(2s, [this]() { tick_fault(); }); } private: @@ -27,13 +75,79 @@ class BrokenLidarNode : public rclcpp::Node { msg.range_max = 10.0f; constexpr int kRays = 360; msg.ranges.assign(kRays, msg.range_max); - // Inject a 1 m phantom return at angle 0 (straight ahead, ray index 180) - msg.ranges[180] = 1.0f; + // Phantom obstacle: a 21-ray (~20 degree) wedge of 0.5 m returns + // centered straight ahead. A single ray at 1.0 m the local costmap + // happily plans around because nav2 marks one cell, raytracing + // clears it on the next sweep, and the controller drives forward + // anyway. A close wide wedge gets stamped into the local costmap + // as a continuous wall the planner has to swerve to avoid - which + // it can't reliably do because the wedge stays anchored to base_scan + // (it rotates with the robot). End result: the controller stalls, + // BT navigator times out, NavigateToPose returns ABORTED. 
+ constexpr int kPhantomCenter = 180; + constexpr int kPhantomHalfWidth = 10; + constexpr float kPhantomRange = 0.5f; + for (int i = kPhantomCenter - kPhantomHalfWidth; + i <= kPhantomCenter + kPhantomHalfWidth; + ++i) { + msg.ranges[i] = kPhantomRange; + } pub_->publish(msg); } + void tick_fault() { + if (!fault_client_->service_is_ready()) return; + + const auto now_ts = now(); + const bool driving = last_motion_command_.nanoseconds() > 0 + && (now_ts - last_motion_command_).seconds() < kIdleTimeoutSec; + + if (driving) { + send_report(false); // EVENT_FAILED - keep fault active + fault_active_ = true; + } else if (fault_active_) { + send_report(true); // EVENT_PASSED - clear fault + fault_active_ = false; + } + } + + void send_report(bool passed) { + auto req = std::make_shared(); + req->fault_code = "SCAN_PHANTOM_RETURN"; + req->event_type = passed + ? ros2_medkit_msgs::srv::ReportFault::Request::EVENT_PASSED + : ros2_medkit_msgs::srv::ReportFault::Request::EVENT_FAILED; + req->severity = 3; // ERROR + req->description = "LaserScan ray index 180 reports a constant 1.0 m return " + "(straight ahead). Nav2 traces it as a phantom obstacle " + "and the controller cannot make progress."; + req->source_id = "scan_sensor_node"; + + auto cb = [this, passed](rclcpp::Client::SharedFuture fut) { + try { + auto result = fut.get(); + if (!result->accepted) { + RCLCPP_DEBUG(get_logger(), "FaultManager rejected SCAN_PHANTOM_RETURN report (passed=%d)", + passed ? 
1 : 0); + } + } catch (const std::exception & e) { + RCLCPP_DEBUG(get_logger(), "ReportFault call failed: %s", e.what()); + } + }; + auto fut = fault_client_->async_send_request(req, cb); + (void)fut; + } + + static constexpr double kIdleTimeoutSec = 10.0; + rclcpp::Publisher::SharedPtr pub_; rclcpp::TimerBase::SharedPtr timer_; + rclcpp::Subscription::SharedPtr cmd_vel_sub_; + rclcpp::Subscription::SharedPtr cmd_vel_stamped_sub_; + rclcpp::Client::SharedPtr fault_client_; + rclcpp::TimerBase::SharedPtr fault_timer_; + rclcpp::Time last_motion_command_{0, 0, RCL_ROS_TIME}; + bool fault_active_{false}; }; int main(int argc, char ** argv) { diff --git a/demos/ota_nav2_sensor_fix/ros2_packages/fixed_lidar/CMakeLists.txt b/demos/ota_nav2_sensor_fix/ros2_packages/fixed_lidar/CMakeLists.txt index d08580f..4244edb 100644 --- a/demos/ota_nav2_sensor_fix/ros2_packages/fixed_lidar/CMakeLists.txt +++ b/demos/ota_nav2_sensor_fix/ros2_packages/fixed_lidar/CMakeLists.txt @@ -12,9 +12,10 @@ endif() find_package(ament_cmake REQUIRED) find_package(rclcpp REQUIRED) find_package(sensor_msgs REQUIRED) +find_package(ros2_medkit_msgs REQUIRED) add_executable(fixed_lidar_node src/fixed_lidar_node.cpp) -ament_target_dependencies(fixed_lidar_node rclcpp sensor_msgs) +ament_target_dependencies(fixed_lidar_node rclcpp sensor_msgs ros2_medkit_msgs) install(TARGETS fixed_lidar_node DESTINATION lib/${PROJECT_NAME}) diff --git a/demos/ota_nav2_sensor_fix/ros2_packages/fixed_lidar/package.xml b/demos/ota_nav2_sensor_fix/ros2_packages/fixed_lidar/package.xml index d0315d7..0a7cdf2 100644 --- a/demos/ota_nav2_sensor_fix/ros2_packages/fixed_lidar/package.xml +++ b/demos/ota_nav2_sensor_fix/ros2_packages/fixed_lidar/package.xml @@ -10,6 +10,7 @@ rclcpp sensor_msgs + ros2_medkit_msgs ament_cmake diff --git a/demos/ota_nav2_sensor_fix/ros2_packages/fixed_lidar/src/fixed_lidar_node.cpp b/demos/ota_nav2_sensor_fix/ros2_packages/fixed_lidar/src/fixed_lidar_node.cpp index 587b97c..dc48728 100644 --- 
a/demos/ota_nav2_sensor_fix/ros2_packages/fixed_lidar/src/fixed_lidar_node.cpp +++ b/demos/ota_nav2_sensor_fix/ros2_packages/fixed_lidar/src/fixed_lidar_node.cpp @@ -1,18 +1,43 @@ // Copyright 2026 bburda. Apache-2.0. +// +// Post-OTA scan publisher: clean 10 m returns on every ray, no phantom. +// +// From startup we also fire EVENT_PASSED for SCAN_PHANTOM_RETURN to +// the fault_manager every 500 ms. broken_lidar reactively raises that +// fault while the controller is driving, but it cannot clear its own +// fault when the OTA process kills it - the report_passed call would +// never run. fixed_lidar takes over the scan_sensor_node identity, so +// it can credibly report "the fault is gone" on behalf of that source. +// Without this the Faults Dashboard stays red forever after the OTA +// swap, even though the underlying sensor is healthy. + #include #include #include +#include #include #include +#include using std::chrono_literals::operator""ms; +using std::chrono_literals::operator""s; class FixedLidarNode : public rclcpp::Node { public: FixedLidarNode() : Node("scan_sensor_node") { pub_ = create_publisher("scan", 10); timer_ = create_wall_timer(100ms, [this]() { publish_scan(); }); + + fault_client_ = create_client( + "/fault_manager/report_fault"); + // Heartbeat at 500 ms (4x the broken_lidar FAILED tick rate) so the + // healing path overtakes the FAILED counter quickly even if the + // operator pushed multiple goals before the OTA. Stops being + // load-bearing after the fault flips to HEALED - the storage just + // no-ops further EVENT_PASSED reports - so the perpetual timer is + // free. 
+ clear_timer_ = create_wall_timer(500ms, [this]() { try_clear_fault(); }); } private: @@ -30,8 +55,50 @@ class FixedLidarNode : public rclcpp::Node { pub_->publish(msg); } + void try_clear_fault() { + // FaultManager's healing path uses a signed debounce counter: + // EVENT_FAILED decrements it, EVENT_PASSED increments it, and the + // fault transitions to HEALED only when the counter rises to + // `healing_threshold`. broken_lidar fires a FAILED tick every 2 s + // while nav2 is driving, so by the time fixed_lidar takes over the + // counter can be deeply negative (-15 to -30). We need to send + // *more* PASSED events than broken_lidar sent FAILED events for + // the counter to climb back above the heal threshold. The cheapest + // path is to keep firing every 500 ms for the lifetime of the node - + // once the fault flips to HEALED the storage rejects further + // PASSED events as no-ops, so we keep watching but it stops being + // load-bearing. + if (!fault_client_->service_is_ready()) return; + + auto req = std::make_shared(); + req->fault_code = "SCAN_PHANTOM_RETURN"; + req->event_type = ros2_medkit_msgs::srv::ReportFault::Request::EVENT_PASSED; + req->severity = 0; + req->description = "fixed_lidar took over scan_sensor_node - phantom returns no longer published."; + req->source_id = "scan_sensor_node"; + + auto cb = [this](rclcpp::Client::SharedFuture fut) { + try { + auto result = fut.get(); + if (result->accepted) { + clear_call_count_++; + if (clear_call_count_ == 1) { + RCLCPP_INFO(get_logger(), "Sent first EVENT_PASSED for SCAN_PHANTOM_RETURN"); + } + } + } catch (const std::exception & e) { + RCLCPP_DEBUG(get_logger(), "ReportFault EVENT_PASSED call failed: %s", e.what()); + } + }; + auto fut = fault_client_->async_send_request(req, cb); + (void)fut; + } + rclcpp::Publisher::SharedPtr pub_; rclcpp::TimerBase::SharedPtr timer_; + rclcpp::Client::SharedPtr fault_client_; + rclcpp::TimerBase::SharedPtr clear_timer_; + int clear_call_count_{0}; }; int 
main(int argc, char ** argv) { diff --git a/demos/ota_nav2_sensor_fix/ros2_packages/ota_nav2_sensor_fix_demo/launch/demo.launch.py b/demos/ota_nav2_sensor_fix/ros2_packages/ota_nav2_sensor_fix_demo/launch/demo.launch.py index 4170833..dccf575 100644 --- a/demos/ota_nav2_sensor_fix/ros2_packages/ota_nav2_sensor_fix_demo/launch/demo.launch.py +++ b/demos/ota_nav2_sensor_fix/ros2_packages/ota_nav2_sensor_fix_demo/launch/demo.launch.py @@ -155,13 +155,33 @@ def generate_launch_description(): }.items(), ) + # healing_enabled lets EVENT_PASSED reports actually clear an active + # fault instead of stalling at PREFAILED. We need this so fixed_lidar + # (post-OTA) can clear the SCAN_PHANTOM_RETURN that broken_lidar + # raised - otherwise the Faults Dashboard stays red forever after + # the OTA swap. Threshold of 2 keeps things responsive while still + # debouncing single accidental EVENT_PASSED reports. fault_manager = Node( package='ros2_medkit_fault_manager', executable='fault_manager_node', name='fault_manager', namespace='', output='screen', - parameters=[{'use_sim_time': use_sim_time}], + parameters=[{ + 'use_sim_time': use_sim_time, + # `memory` storage so faults don't persist across container + # restarts - the demo is supposed to start with a clean + # Faults Dashboard every time, and SQLite kept the last + # SCAN_PHANTOM_RETURN around forever. + 'storage_type': 'memory', + # healing_enabled lets EVENT_PASSED reports actually heal an + # active fault instead of stalling at PREFAILED (so + # fixed_lidar can clear the fault that broken_lidar raised + # via the OTA swap). healing_threshold=2 gives a small + # debounce against single accidental clears. 
+ 'healing_enabled': True, + 'healing_threshold': 2, + }], ) foxglove = Node( diff --git a/demos/ota_nav2_sensor_fix/scripts/build_artifacts.sh b/demos/ota_nav2_sensor_fix/scripts/build_artifacts.sh index c34ce0e..05f7e28 100755 --- a/demos/ota_nav2_sensor_fix/scripts/build_artifacts.sh +++ b/demos/ota_nav2_sensor_fix/scripts/build_artifacts.sh @@ -1,4 +1,20 @@ #!/usr/bin/env bash +# +# Optional dev-convenience: build artefact tarballs + catalog.json on +# the host so a maintainer can iterate on broken_lidar / fixed_lidar / +# obstacle_classifier_v2 without going through `docker compose build` +# every time. +# +# This script is NOT load-bearing for CI or distribution. The +# reproducible path is `docker compose build ota_update_server`, which +# multi-stage-builds the same artefacts inside ros:jazzy. If you don't +# want to think about ROS env on your host, use compose. +# +# Prerequisites for running locally: +# - /opt/ros/jazzy on the prefix path +# - ros2_medkit_msgs sourced (e.g. via a colcon overlay built from +# a local clone of ros2_medkit; the gateway image embeds this). + set -eo pipefail SCRIPT_DIR="$(cd "$(dirname "$0")" && pwd)" DEMO_DIR="$(dirname "$SCRIPT_DIR")" @@ -7,6 +23,18 @@ ARTIFACTS="$DEMO_DIR/artifacts" # shellcheck disable=SC1091 source /opt/ros/jazzy/setup.bash + +if ! ros2 pkg prefix ros2_medkit_msgs > /dev/null 2>&1; then + echo "ros2_medkit_msgs not found on the prefix path." >&2 + echo "" >&2 + echo "fixed_lidar / broken_lidar depend on the SOVD ReportFault service" >&2 + echo "definition that lives in ros2_medkit. Either:" >&2 + echo " - source an overlay that has it built, or" >&2 + echo " - run 'docker compose build ota_update_server' instead - that" >&2 + echo " path is reproducible and bundles the msgs build internally." 
>&2 + exit 1 +fi + set -u mkdir -p "$WS/src" diff --git a/tests/smoke_test_demo_narrative.sh b/tests/smoke_test_demo_narrative.sh new file mode 100755 index 0000000..fdf8e69 --- /dev/null +++ b/tests/smoke_test_demo_narrative.sh @@ -0,0 +1,254 @@ +#!/bin/bash +# Demo-narrative smoke test for the ota_nav2_sensor_fix demo. +# +# The other smoke test (smoke_test_ota.sh) exercises the SOVD endpoints +# and the OTA process swap, but it never tries to navigate. The headline +# scene of the demo - "operator sends a goal, robot stops, fault appears, +# operator OTAs, robot drives, fault clears" - was unguarded against +# regression. This script reproduces that flow end-to-end so a CI run +# either confirms it works or fails loudly when it doesn't. +# +# What it asserts, in order: +# 1. Gateway healthy, /faults clean (dashboard quiet at boot). +# 2. Nav2 motion lifecycle active (hosted_by under autonomous-navigation +# function returns at least one node with status=active). +# 3. Publish /goal_pose. Within ~20 s the controller spins /cmd_vel +# non-zero - proves the BT navigator accepted the goal. +# 4. Within ~30 s of /cmd_vel firing, SCAN_PHANTOM_RETURN appears in +# /faults with status=active and severity=error. Proves the reactive +# fault path (broken_lidar reports while controller is driving). +# 5. PUT /updates/fixed_lidar_2_1_0/{prepare,execute}. Process flips +# from broken_lidar_node to fixed_lidar_node. +# 6. Re-publish /goal_pose. /cmd_vel goes idle within 60 s (the robot +# either completes or the BT terminates - either way nav2 stops +# driving), then SCAN_PHANTOM_RETURN clears (status=cleared OR drops +# out of the active list) within the broken_lidar idle timeout. 
+# +# Usage: ./tests/smoke_test_demo_narrative.sh [GATEWAY_URL] + +GATEWAY_URL="${1:-http://localhost:8080}" +# shellcheck disable=SC2034 # used by smoke_lib.sh +API_BASE="${GATEWAY_URL}/api/v1" + +SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)" +# shellcheck source=tests/smoke_lib.sh +source "${SCRIPT_DIR}/smoke_lib.sh" + +trap print_summary EXIT + +GATEWAY_CONTAINER="${OTA_DEMO_GATEWAY_CONTAINER:-ota_demo_gateway}" + +# Run a `ros2 ...` command inside the gateway container with the +# environment sourced. Stays one-shot - we always re-source because the +# container's entrypoint isn't an interactive shell. +ros2_in_gw() { + docker exec "$GATEWAY_CONTAINER" bash -lc \ + "source /opt/ros/jazzy/setup.bash && source /ws/install/setup.bash && $*" +} + +publish_goal() { + local x="${1:-1.5}" + local y="${2:-0.0}" + ros2_in_gw "ros2 topic pub --once /goal_pose geometry_msgs/PoseStamped \ + '{header: {frame_id: map}, pose: {position: {x: ${x}, y: ${y}, z: 0.0}, orientation: {w: 1.0}}}'" \ + > /dev/null +} + +# Returns 0 if /cmd_vel has reported a non-zero linear.x or linear.y within +# the last `timeout` seconds. Uses `ros2 topic echo --once` with a wall +# timeout so we don't block forever in the empty-/cmd_vel idle case. +poll_cmd_vel_until_active() { + local timeout="${1:-30}" + local elapsed=0 + while [ $elapsed -lt "$timeout" ]; do + local sample + sample=$(ros2_in_gw "timeout 2 ros2 topic echo /cmd_vel --once --field linear" 2>/dev/null || true) + # Any non-zero coordinate trips the assertion. + if echo "$sample" | grep -qE 'x: -?[0-9]*\.?[0-9]*[1-9]'; then + return 0 + fi + if echo "$sample" | grep -qE 'y: -?[0-9]*\.?[0-9]*[1-9]'; then + return 0 + fi + sleep 2 + elapsed=$((elapsed + 4)) # 2s sample + 2s sleep + done + return 1 +} + +# Returns 0 once /cmd_vel has stayed at zero for `quiet_sec` seconds, or +# fails after `timeout`.
+poll_cmd_vel_until_idle() { + local timeout="${1:-90}" + local quiet_sec="${2:-5}" + local elapsed=0 + local zero_streak=0 + while [ $elapsed -lt "$timeout" ]; do + local sample + sample=$(ros2_in_gw "timeout 2 ros2 topic echo /cmd_vel --once --field linear" 2>/dev/null || true) + if echo "$sample" | grep -qE 'x: -?[0-9]*\.?[0-9]*[1-9]'; then + zero_streak=0 + else + zero_streak=$((zero_streak + 4)) + if [ $zero_streak -ge "$quiet_sec" ]; then + return 0 + fi + fi + sleep 2 + elapsed=$((elapsed + 4)) + done + return 1 +} + +# Returns 0 if a fault with fault_code == $1 is present in /faults with +# any non-cleared status (CONFIRMED / PREFAILED / FAILED). The +# fault_manager goes through several debounced states; for the demo we +# just need "the dashboard would show this fault to the operator". +fault_active() { + local code="$1" + api_get "/faults" || return 1 + echo "$RESPONSE" | jq -e --arg c "$code" \ + '.items[] + | select(.fault_code == $c) + | select((.status // "") | ascii_upcase | test("CONFIRMED|PREFAILED|FAILED"))' \ + > /dev/null 2>&1 +} + +# Returns 0 once $1 fault is no longer present in items OR carries a +# cleared / healed status. Healing in fault_manager goes via PREFAILED +# until healing_threshold EVENT_PASSED reports flush it; this poll is +# the dashboard's "the red went away" check. +poll_fault_cleared() { + local code="$1" + local timeout="${2:-30}" + local elapsed=0 + while [ $elapsed -lt "$timeout" ]; do + if ! fault_active "$code"; then + return 0 + fi + sleep 2 + elapsed=$((elapsed + 2)) + done + return 1 +} + +# --------------------------------------------------------------------- +# Wait for gateway + the nav2 stack to be ready +# --------------------------------------------------------------------- +wait_for_gateway 120 + +section "Pre-flight" + +# Plugin boot poll registers the catalog - wait so the OTA step works. +echo " Waiting for plugin catalog (max 30s)..." +if poll_until "/updates" '.items[] | select(. 
== "fixed_lidar_2_1_0")' 30; then + pass "OTA catalog ready" +else + fail "OTA catalog ready" "fixed_lidar_2_1_0 missing from /updates" + exit 1 +fi + +# Faults Dashboard quiet at boot - the reactive fault should NOT be active +# until the operator publishes a goal. +if ! fault_active "SCAN_PHANTOM_RETURN"; then + pass "no SCAN_PHANTOM_RETURN fault active at boot" +else + fail "no SCAN_PHANTOM_RETURN fault active at boot" "fault present before goal_pose was published" +fi + +# Nav2 stack should be lifecycle-active. lifecycle_manager_navigation +# publishes its state on /lifecycle_manager_navigation/transition_event; +# easier proxy: SOVD says the node is live. +echo " Waiting for bt_navigator to be discoverable (max 60s)..." +if poll_until "/apps" '.items[] | select(.id == "bt-navigator")' 60; then + pass "bt_navigator app present" +else + fail "bt_navigator app present" "missing from /apps after 60s" + exit 1 +fi + +# --------------------------------------------------------------------- +# Phase 1: pre-OTA goal -> robot tries to drive -> reactive fault fires +# --------------------------------------------------------------------- +section "Reactive fault: pre-OTA goal triggers SCAN_PHANTOM_RETURN" + +publish_goal 1.5 0.0 +echo " Published /goal_pose (1.5, 0.0). Waiting for SCAN_PHANTOM_RETURN to appear..." + +# The fault appearing IS the proof the reactive path is wired correctly: +# broken_lidar only reports SCAN_PHANTOM_RETURN when its /cmd_vel +# subscription has seen non-zero motion, so a fault here means the BT +# navigator accepted the goal AND the controller spun /cmd_vel up. The +# previous explicit `ros2 topic echo /cmd_vel` check was brittle - echo +# --once snapshots a single message and missed the transient stream - +# without giving us any extra signal beyond what the fault provides. 
+elapsed=0 +while [ $elapsed -lt 60 ]; do + if fault_active "SCAN_PHANTOM_RETURN"; then + pass "SCAN_PHANTOM_RETURN active after goal_pose (proves reactive cmd_vel path)" + break + fi + sleep 2 + elapsed=$((elapsed + 2)) +done +if ! fault_active "SCAN_PHANTOM_RETURN"; then + fail "SCAN_PHANTOM_RETURN active after goal_pose" \ + "fault never appeared - either nav2 didn't accept the goal or broken_lidar's /cmd_vel subscription is broken" +fi + +# --------------------------------------------------------------------- +# Phase 2: OTA replaces broken_lidar with fixed_lidar +# --------------------------------------------------------------------- +section "OTA swap: broken_lidar -> fixed_lidar" + +curl -fsS -X PUT -H 'Content-Type: application/json' -d '{}' \ + "${API_BASE}/updates/fixed_lidar_2_1_0/prepare" > /dev/null +sleep 4 +curl -fsS -X PUT -H 'Content-Type: application/json' -d '{}' \ + "${API_BASE}/updates/fixed_lidar_2_1_0/execute" > /dev/null +sleep 6 + +if docker exec "$GATEWAY_CONTAINER" pgrep -f "/lib/fixed_lidar/fixed_lidar_node" > /dev/null 2>&1; then + pass "fixed_lidar_node spawned after OTA" +else + fail "fixed_lidar_node spawned after OTA" "process missing in gateway container" +fi +if docker exec "$GATEWAY_CONTAINER" pgrep -f "/lib/broken_lidar/broken_lidar_node" > /dev/null 2>&1; then + fail "broken_lidar_node killed after OTA" "still alive in gateway container" +else + pass "broken_lidar_node killed after OTA" +fi + +# --------------------------------------------------------------------- +# Phase 3: post-OTA goal -> robot drives -> fault clears +# --------------------------------------------------------------------- +section "Post-OTA: SCAN_PHANTOM_RETURN clears, robot completes the goal" + +# Re-issue the goal so the new run is on fixed_lidar. 
+publish_goal 1.5 0.0 +sleep 5 + +# fixed_lidar fires EVENT_PASSED at 500 ms; the fault_manager debounce +# counter has to climb above the FAILED count broken_lidar accumulated +# (one tick every 2 s while nav2 was driving). On a clean boot that's +# usually < 30 FAILED events so 60 s of healing at 500 ms is plenty - +# but we give 90 s of headroom for slow runners and replays where the +# operator pushed a couple of goals before the OTA. +echo " Waiting for SCAN_PHANTOM_RETURN to clear (max 90s)..." +if poll_fault_cleared "SCAN_PHANTOM_RETURN" 90; then + pass "SCAN_PHANTOM_RETURN cleared after OTA" +else + fail "SCAN_PHANTOM_RETURN cleared after OTA" \ + "fault still active - either fixed_lidar misbehaving or fault_manager debounce too long" +fi + +# Generous timeout - nav2 may still be planning; we just assert the +# controller eventually stops driving (succeeded OR aborted both end +# the active /cmd_vel stream). Either outcome unblocks the demo loop. +echo " Waiting for /cmd_vel to settle (max 90s)..." +if poll_cmd_vel_until_idle 90 6; then + pass "/cmd_vel returned to idle (nav2 finished or gave up)" +else + fail "/cmd_vel returned to idle (nav2 finished or gave up)" \ + "controller still commanding after 90s - something is wedged" +fi From 7bd90be75a05f8c8cd06d723e384223e1be6929b Mon Sep 17 00:00:00 2001 From: Bartosz Burda Date: Wed, 29 Apr 2026 10:55:33 +0200 Subject: [PATCH 43/52] feat(demos/ota): drop areas from manifest - components are the boundary Areas were doing zero work in this manifest: every area held one or two components, every component lived in exactly one area, and the operator viewing the tree just saw the same things twice. For a single-robot demo the meaningful boundaries are: - Components = OTA / hardware units (lidar, nav2 stack, gateway, ...). "What can I swap?" - Apps = the ROS 2 nodes that live on a component. - Functions = capability groupings (Autonomous Navigation, Perception) that pull apps from across components. 
SOVD allows Areas, but they only earn their slot when there's a real zone partition - powertrain / body / chassis on a vehicle ECU mesh, or multi-robot tenancy. Adding them here was decorative. The Entity Browser Functions section already covers the cross-cutting view (Perception spans LiDAR Sensor + TurtleBot3 Base components). With areas gone the tree shows the operator three things, each with a job: what to update, which node to look at, and which capability is at risk. --- demos/ota_nav2_sensor_fix/manifest.yaml | 61 ++++++++----------------- 1 file changed, 18 insertions(+), 43 deletions(-) diff --git a/demos/ota_nav2_sensor_fix/manifest.yaml b/demos/ota_nav2_sensor_fix/manifest.yaml index 3fc3692..eb24410 100644 --- a/demos/ota_nav2_sensor_fix/manifest.yaml +++ b/demos/ota_nav2_sensor_fix/manifest.yaml @@ -2,18 +2,24 @@ # # SOVD manifest for the OTA over SOVD nav2 sensor-fix demo. # -# The demo runs in hybrid discovery mode: this manifest defines areas / -# components / apps / functions, runtime discovery fills in topic -# bindings, and OTA-installed nodes (e.g. obstacle_classifier_v2 after -# trigger-install.sh) appear automatically without needing a manifest -# entry. +# Hybrid discovery: the manifest defines components + apps + functions, +# runtime discovery fills in topics/services/params, and OTA-installed +# nodes (obstacle_classifier_v2 after trigger-install.sh) appear +# automatically without needing a manifest entry. # -# The point of the manifest is to make the Functions tree meaningful: -# without it the gateway either auto-synthesizes one Function per -# namespace (mostly single-host noise on this stack, where everything -# lives at root) or leaves it empty. Here we group nodes by capability -# so an operator viewing the tree sees Navigation / Localization / -# Perception / Diagnostics / Visualization, not opaque hashes. 
+# Hierarchy used for this demo: +# - Components are the OTA / hardware boundaries: a LiDAR sensor, the +# Nav2 motion stack, the diagnostics services, etc. This is what +# "you can swap" looks like. +# - Apps are the ROS 2 nodes that live on a component. +# - Functions are capability groupings - "Autonomous Navigation", +# "Perception" - that pull apps from across components when the +# capability is delivered by more than one node. +# +# Areas are intentionally left out. SOVD allows them, but they only add +# value when there's a meaningful spatial / zone partition (powertrain / +# body / chassis on a vehicle ECU mesh). On a single robot they just +# regroup the same components a second time and clutter the tree. manifest_version: "1.0" @@ -31,74 +37,43 @@ config: inherit_runtime_resources: true # ============================================================================= -# AREAS -# ============================================================================= -areas: - - id: robot - name: "Robot" - description: "TurtleBot3 platform, sensors, and TF infrastructure" - namespace: / - - - id: navigation - name: "Navigation" - description: "Nav2 motion / planning / control stack" - namespace: / - - - id: diagnostics - name: "Diagnostics" - description: "ros2_medkit gateway, fault manager, OTA" - namespace: / - - - id: visualization - name: "Visualization" - description: "Foxglove bridge and simulator-ROS bridge" - namespace: / - -# ============================================================================= -# COMPONENTS - logical groupings the apps live on +# COMPONENTS - OTA / hardware boundaries; the apps live on these # ============================================================================= components: - id: lidar-sensor name: "LiDAR Sensor" type: "sensor" description: "2D LiDAR (broken_lidar / fixed_lidar after OTA swap)" - area: robot - id: robot-base name: "TurtleBot3 Base" type: "platform" description: "Robot platform - URDF publisher and gz bridge" 
- area: robot - id: nav2-motion name: "Nav2 Motion" type: "controller" description: "BT navigator, planner, controller, behaviors, costmaps" - area: navigation - id: nav2-localization name: "Nav2 Localization" type: "controller" description: "AMCL + map server + localization lifecycle manager" - area: navigation - id: medkit-gateway-unit name: "ros2_medkit Gateway" type: "controller" description: "SOVD HTTP gateway with the OTA update plugin loaded" - area: diagnostics - id: fault-manager-unit name: "Fault Manager" type: "controller" description: "Fault aggregation backend behind /faults" - area: diagnostics - id: foxglove-unit name: "Foxglove Bridge" type: "controller" description: "WebSocket bridge for Foxglove Studio panels" - area: visualization # ============================================================================= # APPS - one per ROS 2 node we care about (skip nav2-internal _rclcpp_node From 6b9e08419a2d5842fcdafee4ea6d9394e041806f Mon Sep 17 00:00:00 2001 From: Bartosz Burda Date: Wed, 29 Apr 2026 11:04:50 +0200 Subject: [PATCH 44/52] refactor(demos/ota): collapse to one component, keep apps + functions The previous manifest split apps across seven components - lidar-sensor, robot-base, nav2-motion, nav2-localization, medkit-gateway-unit, fault-manager-unit, foxglove-unit - but none of those subdivisions correspond to anything you can actually OTA independently in this demo. They all run on a single host, the only swap target is the LiDAR app itself, and the seven-component tree just made the operator scroll past artificial boundaries. Replace with a single `turtlebot3` component that owns every app. The hierarchy the operator now sees is: - 1 Component: TurtleBot3 Robot (the OTA boundary). - 22 Apps: the ROS 2 nodes inside it. - 5 Functions: capability groupings (Autonomous Navigation / Localization / Perception / Fleet Diagnostics / Live Telemetry). Functions remain the cross-cutting view - "is Perception working?" 
pulls scan-sensor + ros-gz-bridge from the single component, no component partition needed to answer that question. Multi-component manifests still make sense in multi-host / multi-ECU scenarios; for this single-robot demo we don't fake them. --- demos/ota_nav2_sensor_fix/manifest.yaml | 105 +++++++++--------------- 1 file changed, 40 insertions(+), 65 deletions(-) diff --git a/demos/ota_nav2_sensor_fix/manifest.yaml b/demos/ota_nav2_sensor_fix/manifest.yaml index eb24410..6f3d42c 100644 --- a/demos/ota_nav2_sensor_fix/manifest.yaml +++ b/demos/ota_nav2_sensor_fix/manifest.yaml @@ -8,18 +8,22 @@ # automatically without needing a manifest entry. # # Hierarchy used for this demo: -# - Components are the OTA / hardware boundaries: a LiDAR sensor, the -# Nav2 motion stack, the diagnostics services, etc. This is what -# "you can swap" looks like. -# - Apps are the ROS 2 nodes that live on a component. +# - One Component: the robot itself. SOVD lets a manifest declare +# multiple components (e.g. one per ECU on a vehicle), but here +# everything runs on a single host so a flat single-component model +# is honest and avoids fake "lidar-sensor / nav2-motion / ..." +# subdivisions that don't correspond to anything you can actually +# swap independently in this demo. +# - Apps are the ROS 2 nodes - one entry per node we want the operator +# to see. They all live on the single robot component. # - Functions are capability groupings - "Autonomous Navigation", -# "Perception" - that pull apps from across components when the -# capability is delivered by more than one node. +# "Perception" etc. - that pull apps together by what they deliver, +# orthogonally to where they run. # # Areas are intentionally left out. SOVD allows them, but they only add -# value when there's a meaningful spatial / zone partition (powertrain / -# body / chassis on a vehicle ECU mesh). On a single robot they just -# regroup the same components a second time and clutter the tree. 
+# value when there's a meaningful zone partition (powertrain / body / +# chassis on a vehicle ECU mesh, or multi-robot tenancy). For a single +# robot they just regroup the same component a second time. manifest_version: "1.0" @@ -37,43 +41,14 @@ config: inherit_runtime_resources: true # ============================================================================= -# COMPONENTS - OTA / hardware boundaries; the apps live on these +# COMPONENTS - one per OTA boundary. For this demo there's a single host +# so a single component owns every app. # ============================================================================= components: - - id: lidar-sensor - name: "LiDAR Sensor" - type: "sensor" - description: "2D LiDAR (broken_lidar / fixed_lidar after OTA swap)" - - - id: robot-base - name: "TurtleBot3 Base" + - id: turtlebot3 + name: "TurtleBot3 Robot" type: "platform" - description: "Robot platform - URDF publisher and gz bridge" - - - id: nav2-motion - name: "Nav2 Motion" - type: "controller" - description: "BT navigator, planner, controller, behaviors, costmaps" - - - id: nav2-localization - name: "Nav2 Localization" - type: "controller" - description: "AMCL + map server + localization lifecycle manager" - - - id: medkit-gateway-unit - name: "ros2_medkit Gateway" - type: "controller" - description: "SOVD HTTP gateway with the OTA update plugin loaded" - - - id: fault-manager-unit - name: "Fault Manager" - type: "controller" - description: "Fault aggregation backend behind /faults" - - - id: foxglove-unit - name: "Foxglove Bridge" - type: "controller" - description: "WebSocket bridge for Foxglove Studio panels" + description: "TurtleBot3 burger running headless Gazebo, Nav2, the ros2_medkit gateway, and the OTA-managed sensor stack." 
# ============================================================================= # APPS - one per ROS 2 node we care about (skip nav2-internal _rclcpp_node @@ -84,14 +59,14 @@ apps: - id: scan-sensor name: "Scan Sensor" category: "sensor" - is_located_on: lidar-sensor + is_located_on: turtlebot3 description: "LaserScan publisher (broken_lidar pre-OTA, fixed_lidar post-OTA)" ros_binding: { node_name: scan_sensor_node, namespace: / } - id: ros-gz-bridge name: "ROS-Gazebo Bridge" category: "simulation" - is_located_on: lidar-sensor + is_located_on: turtlebot3 description: "Bridges /scan_sim and /clock from gz-sim" ros_binding: { node_name: ros_gz_bridge, namespace: / } @@ -99,7 +74,7 @@ apps: - id: robot-state-publisher name: "Robot State Publisher" category: "platform" - is_located_on: robot-base + is_located_on: turtlebot3 description: "Publishes the robot URDF TF tree" ros_binding: { node_name: robot_state_publisher, namespace: / } @@ -107,91 +82,91 @@ apps: - id: bt-navigator name: "BT Navigator" category: "navigation" - is_located_on: nav2-motion + is_located_on: turtlebot3 description: "Behavior Tree navigator - hosts navigate_to_pose action" ros_binding: { node_name: bt_navigator, namespace: / } - id: planner-server name: "Planner Server" category: "navigation" - is_located_on: nav2-motion + is_located_on: turtlebot3 description: "Global path planner" ros_binding: { node_name: planner_server, namespace: / } - id: controller-server name: "Controller Server" category: "navigation" - is_located_on: nav2-motion + is_located_on: turtlebot3 description: "Local path follower" ros_binding: { node_name: controller_server, namespace: / } - id: smoother-server name: "Smoother Server" category: "navigation" - is_located_on: nav2-motion + is_located_on: turtlebot3 description: "Path smoothing" ros_binding: { node_name: smoother_server, namespace: / } - id: route-server name: "Route Server" category: "navigation" - is_located_on: nav2-motion + is_located_on: turtlebot3 
description: "Route planning" ros_binding: { node_name: route_server, namespace: / } - id: behavior-server name: "Behavior Server" category: "navigation" - is_located_on: nav2-motion + is_located_on: turtlebot3 description: "Recovery behaviors" ros_binding: { node_name: behavior_server, namespace: / } - id: waypoint-follower name: "Waypoint Follower" category: "navigation" - is_located_on: nav2-motion + is_located_on: turtlebot3 description: "Sequenced waypoint navigation" ros_binding: { node_name: waypoint_follower, namespace: / } - id: velocity-smoother name: "Velocity Smoother" category: "navigation" - is_located_on: nav2-motion + is_located_on: turtlebot3 description: "/cmd_vel smoothing" ros_binding: { node_name: velocity_smoother, namespace: / } - id: collision-monitor name: "Collision Monitor" category: "navigation" - is_located_on: nav2-motion + is_located_on: turtlebot3 description: "Emergency stop on imminent collision" ros_binding: { node_name: collision_monitor, namespace: / } - id: docking-server name: "Docking Server" category: "navigation" - is_located_on: nav2-motion + is_located_on: turtlebot3 description: "Approach + dock action" ros_binding: { node_name: docking_server, namespace: / } - id: global-costmap name: "Global Costmap" category: "navigation" - is_located_on: nav2-motion + is_located_on: turtlebot3 description: "Static + obstacle costmap for planning" ros_binding: { node_name: global_costmap, namespace: /global_costmap } - id: local-costmap name: "Local Costmap" category: "navigation" - is_located_on: nav2-motion + is_located_on: turtlebot3 description: "Local rolling costmap for control" ros_binding: { node_name: local_costmap, namespace: /local_costmap } - id: lifecycle-manager-navigation name: "Lifecycle Manager (Navigation)" category: "navigation" - is_located_on: nav2-motion + is_located_on: turtlebot3 description: "Nav2 motion lifecycle orchestration" ros_binding: { node_name: lifecycle_manager_navigation, namespace: / } @@ -199,21 
+174,21 @@ apps: - id: amcl name: "AMCL" category: "localization" - is_located_on: nav2-localization + is_located_on: turtlebot3 description: "Adaptive Monte Carlo Localization" ros_binding: { node_name: amcl, namespace: / } - id: map-server name: "Map Server" category: "localization" - is_located_on: nav2-localization + is_located_on: turtlebot3 description: "Static map publisher" ros_binding: { node_name: map_server, namespace: / } - id: lifecycle-manager-localization name: "Lifecycle Manager (Localization)" category: "localization" - is_located_on: nav2-localization + is_located_on: turtlebot3 description: "Localization lifecycle orchestration" ros_binding: { node_name: lifecycle_manager_localization, namespace: / } @@ -221,14 +196,14 @@ apps: - id: medkit-gateway name: "ros2_medkit Gateway" category: "gateway" - is_located_on: medkit-gateway-unit + is_located_on: turtlebot3 description: "SOVD REST gateway, hosts the OTA plugin" ros_binding: { node_name: ros2_medkit_gateway, namespace: / } - id: medkit-fault-manager name: "Fault Manager" category: "diagnostics" - is_located_on: fault-manager-unit + is_located_on: turtlebot3 description: "Fault aggregation + storage" ros_binding: { node_name: fault_manager, namespace: / } @@ -236,7 +211,7 @@ apps: - id: foxglove-bridge name: "Foxglove Bridge" category: "visualization" - is_located_on: foxglove-unit + is_located_on: turtlebot3 description: "WebSocket bridge on :8765" ros_binding: { node_name: foxglove_bridge, namespace: / } From ca0232d1e553feaa7175a1887f522a5b20f6170e Mon Sep 17 00:00:00 2001 From: Bartosz Burda Date: Wed, 29 Apr 2026 11:37:43 +0200 Subject: [PATCH 45/52] fix(demos/ota): subscribe only TwistStamped on /cmd_vel broken_lidar's reactive-fault path was subscribing to both Twist and TwistStamped on /cmd_vel "to be safe". 
At runtime that worked, but Foxglove inspects every subscriber when listing topic schemas and emitted: "Multiple channels advertise the same topic /cmd_vel but the schema, schema name or encodings do not match". The mixed-types entry also confused ros2 topic info, which then reported the topic with two type strings. Nav2 Jazzy publishes TwistStamped (docking_server / collision_monitor / velocity_smoother all use the stamped variant). The demo doesn't ship any teleop or legacy publisher; there's no Twist source on this stack. Drop the Twist subscription, keep TwistStamped. --- .../broken_lidar/src/broken_lidar_node.cpp | 32 ++++++++----------- 1 file changed, 13 insertions(+), 19 deletions(-) diff --git a/demos/ota_nav2_sensor_fix/ros2_packages/broken_lidar/src/broken_lidar_node.cpp b/demos/ota_nav2_sensor_fix/ros2_packages/broken_lidar/src/broken_lidar_node.cpp index 5b0368d..67ffecd 100644 --- a/demos/ota_nav2_sensor_fix/ros2_packages/broken_lidar/src/broken_lidar_node.cpp +++ b/demos/ota_nav2_sensor_fix/ros2_packages/broken_lidar/src/broken_lidar_node.cpp @@ -25,7 +25,6 @@ #include #include -#include #include #include @@ -38,24 +37,20 @@ class BrokenLidarNode : public rclcpp::Node { pub_ = create_publisher("scan", 10); timer_ = create_wall_timer(100ms, [this]() { publish_scan(); }); - // Nav2 Jazzy publishes /cmd_vel as TwistStamped; older stacks (and - // teleop) still use plain Twist. Subscribe to both so the reactive - // fault works regardless of which side is driving. + // Nav2 Jazzy publishes /cmd_vel as TwistStamped. Subscribing to + // both Twist and TwistStamped works at runtime, but Foxglove + // (which inspects subscribers when listing schemas) complains: + // "Multiple channels advertise the same topic /cmd_vel but the + // schema, schema name or encodings do not match". Stick to the + // Nav2 Jazzy default - TwistStamped only. 
constexpr double kThresh = 0.01; - auto handle_motion = [this](double linear_x, double angular_z) { - if (std::fabs(linear_x) > kThresh || std::fabs(angular_z) > kThresh) { - last_motion_command_ = now(); - } - }; - cmd_vel_sub_ = create_subscription( + cmd_vel_sub_ = create_subscription( "cmd_vel", 10, - [handle_motion](const geometry_msgs::msg::Twist::SharedPtr msg) { - handle_motion(msg->linear.x, msg->angular.z); - }); - cmd_vel_stamped_sub_ = create_subscription( - "cmd_vel", 10, - [handle_motion](const geometry_msgs::msg::TwistStamped::SharedPtr msg) { - handle_motion(msg->twist.linear.x, msg->twist.angular.z); + [this](const geometry_msgs::msg::TwistStamped::SharedPtr msg) { + if (std::fabs(msg->twist.linear.x) > kThresh || + std::fabs(msg->twist.angular.z) > kThresh) { + last_motion_command_ = now(); + } }); fault_client_ = create_client( @@ -142,8 +137,7 @@ class BrokenLidarNode : public rclcpp::Node { rclcpp::Publisher::SharedPtr pub_; rclcpp::TimerBase::SharedPtr timer_; - rclcpp::Subscription::SharedPtr cmd_vel_sub_; - rclcpp::Subscription::SharedPtr cmd_vel_stamped_sub_; + rclcpp::Subscription::SharedPtr cmd_vel_sub_; rclcpp::Client::SharedPtr fault_client_; rclcpp::TimerBase::SharedPtr fault_timer_; rclcpp::Time last_motion_command_{0, 0, RCL_ROS_TIME}; From 602ad905e22b02a088e5b8e5d592fb474f8e2006 Mon Sep 17 00:00:00 2001 From: Bartosz Burda Date: Wed, 29 Apr 2026 13:52:23 +0200 Subject: [PATCH 46/52] fix(demos/ota): move robot spawn into the map + revert phantom to single ray Two related issues that together broke the navigate_to_pose flow: 1. Spawn pose (-2.0, -0.5) was outside the global_costmap bounds. turtlebot3_world.yaml's origin is (-1.76, -2.42), so the robot sat to the west of where the costmap began. Every navigate_to_pose came back with error_code 203 (NO_VIABLE_PATH) and a planner log spam: "Sensor origin out of map bounds". Move spawn AND the AMCL initial_pose to (-1.5, -0.5), well inside the map. 2. 
The wide-wedge phantom (21 rays at 0.5 m, then 7 rays at 0.6 m) was too aggressive: the global planner rejected the goal up front, so /cmd_vel never spun and broken_lidar's reactive fault never fired even with a viable spawn pose. Revert to the single 1.0 m return on ray 180 - the controller engages, drives /cmd_vel, broken_lidar sees the motion and raises SCAN_PHANTOM_RETURN. The robot still completes the goal because the planner routes around a single ray; the demo's beat is "fault is visible while the robot drives" rather than "robot hangs forever". --- .../broken_lidar/src/broken_lidar_node.cpp | 28 ++++++++----------- .../config/nav2_params.yaml | 9 ++++-- .../launch/demo.launch.py | 6 +++- 3 files changed, 23 insertions(+), 20 deletions(-) diff --git a/demos/ota_nav2_sensor_fix/ros2_packages/broken_lidar/src/broken_lidar_node.cpp b/demos/ota_nav2_sensor_fix/ros2_packages/broken_lidar/src/broken_lidar_node.cpp index 67ffecd..bc07fcd 100644 --- a/demos/ota_nav2_sensor_fix/ros2_packages/broken_lidar/src/broken_lidar_node.cpp +++ b/demos/ota_nav2_sensor_fix/ros2_packages/broken_lidar/src/broken_lidar_node.cpp @@ -70,23 +70,17 @@ class BrokenLidarNode : public rclcpp::Node { msg.range_max = 10.0f; constexpr int kRays = 360; msg.ranges.assign(kRays, msg.range_max); - // Phantom obstacle: a 21-ray (~20 degree) wedge of 0.5 m returns - // centered straight ahead. A single ray at 1.0 m the local costmap - // happily plans around because nav2 marks one cell, raytracing - // clears it on the next sweep, and the controller drives forward - // anyway. A close wide wedge gets stamped into the local costmap - // as a continuous wall the planner has to swerve to avoid - which - // it can't reliably do because the wedge stays anchored to base_scan - // (it rotates with the robot). End result: the controller stalls, - // BT navigator times out, NavigateToPose returns ABORTED. 
- constexpr int kPhantomCenter = 180; - constexpr int kPhantomHalfWidth = 10; - constexpr float kPhantomRange = 0.5f; - for (int i = kPhantomCenter - kPhantomHalfWidth; - i <= kPhantomCenter + kPhantomHalfWidth; - ++i) { - msg.ranges[i] = kPhantomRange; - } + // Phantom obstacle: a single 1.0 m return on ray index 180 (angle + // 0, straight ahead in base_scan). The single ray is just enough to + // jitter the local costmap inflation layer near the path - the + // global planner still finds a route, the controller engages and + // drives /cmd_vel, but the planner has to keep replanning as the + // phantom rotates with the robot. Wider wedges trigger + // NO_VIABLE_PATH at the planner level and the action ABORTS before + // the controller ever spins, which kills the demo's reactive-fault + // beat (broken_lidar only raises the fault while /cmd_vel is + // commanded). + msg.ranges[180] = 1.0f; pub_->publish(msg); } diff --git a/demos/ota_nav2_sensor_fix/ros2_packages/ota_nav2_sensor_fix_demo/config/nav2_params.yaml b/demos/ota_nav2_sensor_fix/ros2_packages/ota_nav2_sensor_fix_demo/config/nav2_params.yaml index aa8753d..5597675 100644 --- a/demos/ota_nav2_sensor_fix/ros2_packages/ota_nav2_sensor_fix_demo/config/nav2_params.yaml +++ b/demos/ota_nav2_sensor_fix/ros2_packages/ota_nav2_sensor_fix_demo/config/nav2_params.yaml @@ -57,10 +57,15 @@ amcl: z_rand: 0.5 z_short: 0.05 scan_topic: scan + # Must match the spawn pose set by demo.launch.py (x_pose=-1.5, y_pose=-0.5). + # Otherwise AMCL's particle filter searches around origin, never sees the + # robot, and never publishes map -> odom - which leaves Foxglove unable + # to set Display Frame to "map" and floods rosout with "Invalid frame + # ID 'map'" errors.
set_initial_pose: true initial_pose: - x: 0.0 - y: 0.0 + x: -1.5 + y: -0.5 z: 0.0 yaw: 0.0 diff --git a/demos/ota_nav2_sensor_fix/ros2_packages/ota_nav2_sensor_fix_demo/launch/demo.launch.py b/demos/ota_nav2_sensor_fix/ros2_packages/ota_nav2_sensor_fix_demo/launch/demo.launch.py index dccf575..317968f 100644 --- a/demos/ota_nav2_sensor_fix/ros2_packages/ota_nav2_sensor_fix_demo/launch/demo.launch.py +++ b/demos/ota_nav2_sensor_fix/ros2_packages/ota_nav2_sensor_fix_demo/launch/demo.launch.py @@ -81,7 +81,11 @@ def generate_launch_description(): use_sim_time = LaunchConfiguration('use_sim_time', default='True') headless = LaunchConfiguration('headless', default='True') - x_pose = LaunchConfiguration('x_pose', default='-2.0') + # Robot spawn must lie inside the map's bounds. turtlebot3_world.yaml + # has origin (-1.76, -2.42) - the previous (-2.0, -0.5) default put + # the robot outside, so global_costmap reported "Sensor origin out of + # map bounds" and every navigate_to_pose returned NO_VIABLE_PATH 203. + x_pose = LaunchConfiguration('x_pose', default='-1.5') y_pose = LaunchConfiguration('y_pose', default='-0.5') set_gz_model_path = AppendEnvironmentVariable( From f48cbba5636b11927a22c44e2b77f3daed5d9111 Mon Sep 17 00:00:00 2001 From: Bartosz Burda Date: Wed, 29 Apr 2026 14:21:30 +0200 Subject: [PATCH 47/52] fix(demos/ota): broadcast map -> odom continuously when robot is idle AMCL only broadcast its map->odom TF after the robot drove update_min_d (0.25 m) or rotated update_min_a (0.2 rad). The demo's flow has the robot sitting still while the operator inspects the Foxglove dashboard - so the map frame never showed up in the Display Frame dropdown until they published a goal and the robot actually started moving. Foxglove just sat in 'Missing transform from frame ' for the whole pre-OTA narrative. Set both update_min_d and update_min_a to 0 so AMCL broadcasts on every laser scan. 
Also drops the duplicate keys that were sitting below the original block - they were silently overriding the values I'd just added. --- .../ota_nav2_sensor_fix_demo/config/nav2_params.yaml | 10 ++++++++-- 1 file changed, 8 insertions(+), 2 deletions(-) diff --git a/demos/ota_nav2_sensor_fix/ros2_packages/ota_nav2_sensor_fix_demo/config/nav2_params.yaml b/demos/ota_nav2_sensor_fix/ros2_packages/ota_nav2_sensor_fix_demo/config/nav2_params.yaml index 5597675..52c799b 100644 --- a/demos/ota_nav2_sensor_fix/ros2_packages/ota_nav2_sensor_fix_demo/config/nav2_params.yaml +++ b/demos/ota_nav2_sensor_fix/ros2_packages/ota_nav2_sensor_fix_demo/config/nav2_params.yaml @@ -49,9 +49,15 @@ amcl: save_pose_rate: 0.5 sigma_hit: 0.2 tf_broadcast: true + # Broadcast map -> odom on every laser update, not only after the + # robot has driven update_min_d / update_min_a. Demo flow has the + # robot sitting idle while the operator inspects the dashboard; + # without continuous broadcasting Foxglove can't even pick `map` + # in the Display Frame dropdown until the operator publishes a + # goal and the robot moves. + update_min_d: 0.0 + update_min_a: 0.0 transform_tolerance: 1.0 - update_min_a: 0.2 - update_min_d: 0.25 z_hit: 0.5 z_max: 0.05 z_rand: 0.5 From 17325d5bece635d149cc13d3127ffa29e6e02d65 Mon Sep 17 00:00:00 2001 From: Bartosz Burda Date: Wed, 29 Apr 2026 15:06:20 +0200 Subject: [PATCH 48/52] fix(demos/ota): bypass turtlebot3_gazebo's hardcoded frame_prefix slash turtlebot3_gazebo's robot_state_publisher.launch.py builds the frame_prefix parameter with PythonExpression(["'", frame_prefix, "/'"]), which appends a literal "/" even when the launch arg is empty. The result was a tf tree where AMCL / odometry broadcast frames as "base_link" (no slash) and robot_state_publisher published joint transforms as "/base_link" (with slash). 
Two disjoint subgraphs - so Foxglove rendered the URDF as a pile of disconnected meshes scattered around the origin instead of a robot model attached to base_footprint. Spawn robot_state_publisher directly from this launch with frame_prefix='' (empty, no PythonExpression mutilation), reading the turtlebot3 URDF off disk the same way the upstream launch does. Result: tf_static now reports "base_link", "base_footprint" without leading slashes, the URDF renders as one connected robot, and "map -> base_link" resolves end-to-end. --- .../launch/demo.launch.py | 31 ++++++++++++++++--- 1 file changed, 26 insertions(+), 5 deletions(-) diff --git a/demos/ota_nav2_sensor_fix/ros2_packages/ota_nav2_sensor_fix_demo/launch/demo.launch.py b/demos/ota_nav2_sensor_fix/ros2_packages/ota_nav2_sensor_fix_demo/launch/demo.launch.py index 317968f..5f38f6d 100644 --- a/demos/ota_nav2_sensor_fix/ros2_packages/ota_nav2_sensor_fix_demo/launch/demo.launch.py +++ b/demos/ota_nav2_sensor_fix/ros2_packages/ota_nav2_sensor_fix_demo/launch/demo.launch.py @@ -132,11 +132,32 @@ def generate_launch_description(): condition=IfCondition(headless), ) - robot_state_publisher = IncludeLaunchDescription( - PythonLaunchDescriptionSource( - os.path.join(turtlebot3_gazebo_dir, 'launch', 'robot_state_publisher.launch.py'), - ), - launch_arguments={'use_sim_time': use_sim_time}.items(), + # We deliberately do NOT include turtlebot3_gazebo's + # robot_state_publisher.launch.py - it hardcodes + # `frame_prefix=PythonExpression(["'", frame_prefix, "/'"])`, which + # prepends a "/" to every link name even when the launch arg + # is empty. The result was a tf tree where odom / amcl frames are + # `base_link` (no slash) and URDF link frames are `/base_link` + # (with slash) - two disjoint subgraphs, so Foxglove rendered the + # robot as a pile of disconnected meshes. Spawn robot_state_publisher + # directly with an empty frame_prefix.
+ turtlebot3_model = os.environ.get('TURTLEBOT3_MODEL', 'burger') + urdf_path = os.path.join( + turtlebot3_gazebo_dir, 'urdf', f'turtlebot3_{turtlebot3_model}.urdf' + ) + with open(urdf_path, 'r') as f: + robot_desc = f.read() + + robot_state_publisher = Node( + package='robot_state_publisher', + executable='robot_state_publisher', + name='robot_state_publisher', + output='screen', + parameters=[{ + 'use_sim_time': use_sim_time, + 'robot_description': robot_desc, + 'frame_prefix': '', + }], condition=IfCondition(headless), ) From af571b4985da1c19a6868d8a9cf38bbebb0e3333 Mon Sep 17 00:00:00 2001 From: Bartosz Burda Date: Wed, 29 Apr 2026 15:32:22 +0200 Subject: [PATCH 49/52] fix(ota_plugin): forward use_sim_time to OTA-spawned processes ProcessRunner::spawn was running the new binary with execl(path, path, nullptr) - no --ros-args, no parameters. After an OTA execute the post-update node (fixed_lidar after the update flow, obstacle_classifier_v2 after install) was therefore born with use_sim_time=false while the rest of the demo runs on /clock from gz-sim. Result: every /scan / /diagnostics message stamped by the new node landed outside the gateway's TF cache, nav2's costmaps logged "Message Filter dropping message ... earlier than all the data in the transform cache" on every tick, the obstacle layer stayed empty, and NavigateToPose came back ABORTED with no progress - "robot stops responding" from the operator's seat. Pass --ros-args -p use_sim_time:=true to execl so the spawned node joins the same clock domain as the rest of the stack. Robot resumes following goals after the OTA swap. This is the minimum viable param plumbing for the demo. A full production plugin should plumb arbitrary parameters from the catalog entry through to execve - but for now use_sim_time is the only param the OTA-managed nodes care about. 
--- .../ota_update_plugin/src/process_runner.cpp | 18 +++++++++++++++++- 1 file changed, 17 insertions(+), 1 deletion(-) diff --git a/demos/ota_nav2_sensor_fix/ota_update_plugin/src/process_runner.cpp b/demos/ota_nav2_sensor_fix/ota_update_plugin/src/process_runner.cpp index 8ec65e9..32651fc 100644 --- a/demos/ota_nav2_sensor_fix/ota_update_plugin/src/process_runner.cpp +++ b/demos/ota_nav2_sensor_fix/ota_update_plugin/src/process_runner.cpp @@ -138,7 +138,23 @@ tl::expected ProcessRunner::spawn(const std::string & executab } if (grandchild == 0) { setsid(); - execl(executable_path.c_str(), executable_path.c_str(), nullptr); + // Forward use_sim_time so the spawned node aligns with the + // gateway's clock domain. Without this, nodes started by OTA + // (post-update fixed_lidar, post-install obstacle_classifier) + // run on wall time while the rest of the stack runs on + // /clock from gz-sim - their /scan / /diagnostics timestamps + // fall outside nav2's TF buffer and the costmap drops every + // message: "the timestamp on the message is earlier than all + // the data in the transform cache". Robot stops responding. + // + // Note: this is the minimum viable param plumbing. A full + // production plugin should plumb arbitrary parameters from + // the catalog entry through to execve. 
+ execl(executable_path.c_str(), + executable_path.c_str(), + "--ros-args", + "-p", "use_sim_time:=true", + static_cast(nullptr)); std::fprintf(stderr, "execl %s failed: %s\n", executable_path.c_str(), std::strerror(errno)); _exit(127); From e7cb757161201c886b0ece99fcfbca1a9fca9d47 Mon Sep 17 00:00:00 2001 From: Bartosz Burda Date: Wed, 29 Apr 2026 16:07:46 +0200 Subject: [PATCH 50/52] feat(ota_plugin): write manifest fragments + notify gateway on install/uninstall Without this, OTA-installed apps (obstacle_classifier_v2 after the Install flow) showed up in the gateway runtime graph but never got attached to a manifest entity, so the Foxglove tree treated them as orphans and they never appeared under the turtlebot3 component or in any function's host list. The catalog said "this update adds the obstacle_classifier app", the runtime saw the new node, but the manifest tree stayed unchanged. Wire the gateway's plugin-side manifest fragment contract: 1. Plugin reads `fragments_dir` from its config (matches the gateway's discovery.manifest.fragments_dir param). Empty = legacy "no fragments" behavior preserved. 2. set_context() now stores the PluginContext so post-execute we can call notify_entities_changed and have the gateway re-merge the base manifest with the fragments dir. 3. On Install: render a minimal manifest YAML for the new app (id from added_components[0], node_name from x_medkit_executable, located on the single turtlebot3 component), atomic-publish via tmp-rename per the ManifestManager fragment contract, then notify. Gateway picks the entity up; new app shows under /components/turtlebot3/hosts and any function that lists it. 4. On Uninstall: drop the fragment file (no-op if the entity was in the base manifest, like broken_lidar_legacy). Notify either way so the cache stops returning a now-dead app. 5. On Update: no fragment change (same app id, just different binary), but still notify so the entity cache rebuilds with the new pid. 
Adds the dirs to Dockerfile and threads fragments_dir through both gateway_config.yaml (discovery.manifest.fragments_dir) and the plugin config block (plugins.ota_update_plugin.fragments_dir) so they stay in lockstep. --- demos/ota_nav2_sensor_fix/Dockerfile.gateway | 6 + demos/ota_nav2_sensor_fix/gateway_config.yaml | 15 ++- .../ota_update_plugin/ota_update_plugin.hpp | 13 ++ .../src/ota_update_plugin.cpp | 127 +++++++++++++++++- 4 files changed, 157 insertions(+), 4 deletions(-) diff --git a/demos/ota_nav2_sensor_fix/Dockerfile.gateway b/demos/ota_nav2_sensor_fix/Dockerfile.gateway index 356fb7f..f1fc826 100644 --- a/demos/ota_nav2_sensor_fix/Dockerfile.gateway +++ b/demos/ota_nav2_sensor_fix/Dockerfile.gateway @@ -94,6 +94,12 @@ RUN apt-get update && apt-get install -y --no-install-recommends \ COPY --from=builder /ws/install /ws/install COPY gateway_config.yaml /etc/ros2_medkit/gateway_config.yaml COPY manifest.yaml /etc/ros2_medkit/manifest.yaml + +# Pre-create the fragments directory so the gateway's manifest manager +# scans an existing (empty) dir at boot rather than logging "missing +# fragments_dir" warnings. Plugin writes / removes yaml files here at +# OTA install / uninstall time. +RUN mkdir -p /etc/ros2_medkit/manifest_fragments COPY entrypoint.sh /usr/local/bin/entrypoint.sh RUN chmod +x /usr/local/bin/entrypoint.sh diff --git a/demos/ota_nav2_sensor_fix/gateway_config.yaml b/demos/ota_nav2_sensor_fix/gateway_config.yaml index 14318d2..cb20925 100644 --- a/demos/ota_nav2_sensor_fix/gateway_config.yaml +++ b/demos/ota_nav2_sensor_fix/gateway_config.yaml @@ -26,12 +26,18 @@ ros2_medkit_gateway: discovery: # Hybrid: manifest defines areas/components/apps/functions, runtime - # fills in topics/services/params and surfaces OTA-installed nodes - # (e.g. obstacle_classifier_v2 after trigger-install.sh) without - # needing a manifest entry for them. 
+ # fills in topics/services/params, and OTA-deployed apps land via + # manifest fragments dropped in fragments_dir below. mode: "hybrid" manifest_path: "/etc/ros2_medkit/manifest.yaml" manifest_strict_validation: false + manifest: + # ota_update_plugin writes one fragment per Install operation + # here and calls notify_entities_changed; the gateway re-merges + # the base manifest + every yaml in this dir on each reload. + # Path is shared with the plugin via plugins.ota_update_plugin + # .fragments_dir below - keep them in lockstep. + fragments_dir: "/etc/ros2_medkit/manifest_fragments" runtime: # Manifest defines components, no need for synthetic ones. create_synthetic_components: false @@ -48,3 +54,6 @@ ros2_medkit_gateway: plugins.ota_update_plugin.catalog_url: "http://ota_update_server:9000" plugins.ota_update_plugin.staging_dir: "/tmp/ota_staging" plugins.ota_update_plugin.install_dir: "/ws/install" + # Same path the gateway has under discovery.manifest.fragments_dir. + # Plugin drops one yaml per Install and removes it on Uninstall. + plugins.ota_update_plugin.fragments_dir: "/etc/ros2_medkit/manifest_fragments" diff --git a/demos/ota_nav2_sensor_fix/ota_update_plugin/include/ota_update_plugin/ota_update_plugin.hpp b/demos/ota_nav2_sensor_fix/ota_update_plugin/include/ota_update_plugin/ota_update_plugin.hpp index 7e67c0b..c8e628f 100644 --- a/demos/ota_nav2_sensor_fix/ota_update_plugin/include/ota_update_plugin/ota_update_plugin.hpp +++ b/demos/ota_nav2_sensor_fix/ota_update_plugin/include/ota_update_plugin/ota_update_plugin.hpp @@ -73,9 +73,22 @@ class OtaUpdatePlugin : public ros2_medkit_gateway::GatewayPlugin, public ros2_m void poll_and_register_catalog(); private: + // Manifest-fragment helpers. Plugins that deploy new nodes at runtime + // are expected to drop a fragment yaml in `fragments_dir_` and then + // notify the gateway so its ManifestManager re-merges. 
Without this + // the new app shows up as an "Orphan node (not in manifest)" warn + // log and never attaches to the manifest entity tree. + tl::expected write_install_fragment(const std::string & update_id, + const nlohmann::json & metadata); + tl::expected remove_install_fragment(const std::string & update_id); + void notify_manifest_changed(); + std::string catalog_url_; std::string staging_dir_; std::string install_dir_; + std::string fragments_dir_; + + ros2_medkit_gateway::PluginContext * context_{nullptr}; std::mutex mu_; std::map registry_; diff --git a/demos/ota_nav2_sensor_fix/ota_update_plugin/src/ota_update_plugin.cpp b/demos/ota_nav2_sensor_fix/ota_update_plugin/src/ota_update_plugin.cpp index f081a10..47faca4 100644 --- a/demos/ota_nav2_sensor_fix/ota_update_plugin/src/ota_update_plugin.cpp +++ b/demos/ota_nav2_sensor_fix/ota_update_plugin/src/ota_update_plugin.cpp @@ -16,9 +16,14 @@ #include #include +#include #include +#include #include +#include +#include + #include "catalog_client.hpp" #include "operation_dispatcher.hpp" #include "process_runner.hpp" @@ -76,12 +81,22 @@ void OtaUpdatePlugin::configure(const nlohmann::json & config) { catalog_url_ = config.value("catalog_url", "http://ota_update_server:9000"); staging_dir_ = config.value("staging_dir", "/tmp/ota_staging"); install_dir_ = config.value("install_dir", "/ws/install"); + // Where this plugin drops manifest fragments for OTA-installed apps. + // Must equal the path the gateway has configured under + // discovery.manifest.fragments_dir, otherwise the gateway won't pick + // them up on reload. Empty disables fragment writes (legacy behavior: + // installed nodes appear as orphans in the entity tree). 
+ fragments_dir_ = config.value("fragments_dir", ""); if (!catalog_client_) { catalog_client_ = std::make_unique(catalog_url_); } } -void OtaUpdatePlugin::set_context(ros2_medkit_gateway::PluginContext & /*context*/) { +void OtaUpdatePlugin::set_context(ros2_medkit_gateway::PluginContext & context) { + // Hold on to the context so post-execute we can ask the gateway to + // re-merge manifest fragments and rerun discovery via + // notify_entities_changed. + context_ = &context; poll_and_register_catalog(); } @@ -245,6 +260,12 @@ tl::expected OtaUpdatePlugin::execute( if (!sp) { return tl::make_unexpected(UpdateBackendErrorInfo{UpdateBackendError::Internal, "spawn failed: " + sp.error()}); } + // Update flow: same app id (the binary swapped in is bound to the + // same scan_sensor_node entity as the binary it replaced) - no + // manifest fragment to write, but the gateway still needs to + // rerun discovery so the new pid / process metadata replaces the + // stale entries in the entity cache. + notify_manifest_changed(); reporter.set_progress(100); return {}; } @@ -267,6 +288,17 @@ tl::expected OtaUpdatePlugin::execute( if (!sp) { return tl::make_unexpected(UpdateBackendErrorInfo{UpdateBackendError::Internal, "spawn failed: " + sp.error()}); } + // Install flow: NEW app entity. Write a manifest fragment so the + // gateway picks the new app up under the manifest tree (otherwise + // it stays as an "Orphan node (not in manifest)" warn log and + // never appears under the turtlebot3 component / Functions + // listing). Notify even when fragment write fails - the spawn + // already happened and discovery should still see the new node. 
+ if (auto fr = write_install_fragment(id, metadata); !fr) { + std::fprintf(stderr, "[ota_update_plugin] fragment write failed for %s: %s\n", id.c_str(), + fr.error().c_str()); + } + notify_manifest_changed(); reporter.set_progress(100); return {}; } @@ -282,6 +314,16 @@ tl::expected OtaUpdatePlugin::execute( std::error_code ec; fs::remove_all(install_dir_ + "/" + target_package, ec); } + // Uninstall: drop any fragment we wrote at install time and rerun + // discovery so the entity tree no longer lists the now-dead app. + // Entities defined in the base manifest stay - fragments only + // ADD, they can't remove base-manifest declarations - those + // entries just go offline. + if (auto fr = remove_install_fragment(id); !fr) { + std::fprintf(stderr, "[ota_update_plugin] fragment remove failed for %s: %s\n", id.c_str(), + fr.error().c_str()); + } + notify_manifest_changed(); reporter.set_progress(100); return {}; } @@ -293,4 +335,87 @@ tl::expected OtaUpdatePlugin::supports_automated(c return false; } +namespace { + +// Build the YAML body for a single OTA-installed app. We hand-emit the +// minimal subset the gateway's manifest parser accepts (no quoting +// edge cases in our generated values, so a yaml-cpp roundtrip would be +// overkill). The base manifest defines the `turtlebot3` component; +// fragments only ever add apps onto it. 
+std::string render_install_fragment(const std::string & app_id, const std::string & node_name, + const std::string & description) { + std::string out; + out += "manifest_version: \"1.0\"\n"; + out += "apps:\n"; + out += " - id: " + app_id + "\n"; + out += " name: \"" + app_id + "\"\n"; + out += " category: \"ota-installed\"\n"; + out += " is_located_on: turtlebot3\n"; + out += " description: \"" + description + "\"\n"; + out += " ros_binding: { node_name: " + node_name + ", namespace: / }\n"; + return out; +} + +} // namespace + +tl::expected OtaUpdatePlugin::write_install_fragment(const std::string & update_id, + const nlohmann::json & metadata) { + if (fragments_dir_.empty()) return {}; + + const std::string node_name = metadata.value("x_medkit_executable", ""); + // SOVD ISO 17978-3 reports the target entity via `added_components` + // (it's an array; for an OTA install we always have exactly one). + std::string app_id; + if (metadata.contains("added_components") && metadata["added_components"].is_array() + && !metadata["added_components"].empty()) { + app_id = metadata["added_components"][0].get(); + } + if (node_name.empty() || app_id.empty()) { + return tl::make_unexpected( + "metadata missing x_medkit_executable / added_components for fragment"); + } + const std::string description = "OTA-installed via " + update_id; + + std::error_code ec; + fs::create_directories(fragments_dir_, ec); + if (ec) { + return tl::make_unexpected("create fragments_dir failed: " + ec.message()); + } + + const std::string final_path = fragments_dir_ + "/" + update_id + ".yaml"; + const std::string tmp_path = fragments_dir_ + "/.tmp-" + update_id + ".yaml"; + // Atomic publish per ManifestManager's fragment contract: write to + // tmp, fsync, rename. NOTE(review): the code below only flush()es and closes the stream before rename - no fsync is issued. The gateway's fragment scanner runs on the + // notify_entities_changed thread - a half-written file would fail + // the manifest reload and roll back the entire merge.
+ { + std::ofstream f(tmp_path, std::ios::binary | std::ios::trunc); + if (!f) return tl::make_unexpected("open tmp fragment failed: " + tmp_path); + f << render_install_fragment(app_id, node_name, description); + f.flush(); + if (!f) return tl::make_unexpected("write tmp fragment failed: " + tmp_path); + } + if (std::rename(tmp_path.c_str(), final_path.c_str()) != 0) { + return tl::make_unexpected("rename fragment failed: " + std::string(std::strerror(errno))); + } + return {}; +} + +tl::expected OtaUpdatePlugin::remove_install_fragment(const std::string & update_id) { + if (fragments_dir_.empty()) return {}; + std::error_code ec; + fs::remove(fragments_dir_ + "/" + update_id + ".yaml", ec); + // Missing-file is fine (uninstall of an entity that lived in the + // base manifest, never had a fragment); other errors are reported. + if (ec && ec != std::errc::no_such_file_or_directory) { + return tl::make_unexpected("remove fragment failed: " + ec.message()); + } + return {}; +} + +void OtaUpdatePlugin::notify_manifest_changed() { + if (!context_) return; + context_->notify_entities_changed(ros2_medkit_gateway::EntityChangeScope::full_refresh()); +} + } // namespace ota_update_plugin From d411a7bab8c8ec9646bbaf44cea10349816d9e3e Mon Sep 17 00:00:00 2001 From: Bartosz Burda Date: Wed, 29 Apr 2026 16:51:27 +0200 Subject: [PATCH 51/52] ci(ota): drop "Build artifacts on host" step from narrative job The build-and-test-ota job already moved off this recipe in favor of docker compose's multi-stage build, but the newer ota-demo-narrative job still inherited the old script. With build_artifacts.sh now hard- failing when ros2_medkit_msgs isn't on the prefix path (the script is documented as opt-in dev convenience, not the reproducible path), CI hit "ros2_medkit_msgs not found on the prefix path" and exited 1. Drop the step. 
ota_update_server's Dockerfile multi-stage build clones ros2_medkit_msgs at the pinned ref, builds it, runs pack_artifact.py, and ships the resulting catalog + tarballs to stage 2 - all triggered by docker compose up --build. No host-side ROS prereqs required. --- .github/workflows/ci.yml | 33 ++++----------------------------- 1 file changed, 4 insertions(+), 29 deletions(-) diff --git a/.github/workflows/ci.yml b/.github/workflows/ci.yml index d9312b7..bf7fbc7 100644 --- a/.github/workflows/ci.yml +++ b/.github/workflows/ci.yml @@ -233,37 +233,12 @@ jobs: - name: Checkout repository uses: actions/checkout@v4 - - name: Build artifacts (catalog + tarballs) inside ros:jazzy - working-directory: demos/ota_nav2_sensor_fix - run: | - docker run --rm \ - -v "$PWD":/work \ - -w /work \ - ros:jazzy \ - bash -c ' - set -eu - apt-get update - apt-get install -y --no-install-recommends \ - python3-colcon-common-extensions \ - python3-catkin-pkg \ - python3-venv \ - python3-pip \ - build-essential \ - cmake \ - ros-jazzy-rclcpp \ - ros-jazzy-sensor-msgs \ - ros-jazzy-visualization-msgs - cd scripts - python3 -m venv .venv - .venv/bin/pip install --upgrade pip - .venv/bin/pip install pytest - cd .. - ./scripts/build_artifacts.sh - ' - sudo chown -R "$USER:$USER" . - - name: Build and start OTA demo working-directory: demos/ota_nav2_sensor_fix + # docker compose up --build runs the multi-stage build for + # ota_update_server which produces the catalog + tarballs + # internally - no separate "build artifacts on host" step + # needed (and the host wouldn't have ros2_medkit_msgs anyway). 
run: docker compose up -d --build - name: Run demo narrative smoke From 041a7a647cd2cb86615bcd443be694b0b15cf139 Mon Sep 17 00:00:00 2001 From: Bartosz Burda Date: Wed, 29 Apr 2026 19:25:15 +0200 Subject: [PATCH 52/52] fix(ota_demo): report fault source_id with leading slash so per-entity aggregation matches Gateway's filter_faults_by_sources prefix-matches reporting_sources against app.effective_fqn() ("/scan_sensor_node"), so a bare "scan_sensor_node" was visible only via server-level /faults and missing from /components/turtlebot3 /faults and /apps/scan-sensor/faults - the endpoints the Foxglove panels use. The official ros2_medkit_fault_reporter README documents get_fully_qualified_name() as the convention, so align broken_lidar / fixed_lidar with it. --- .../ros2_packages/broken_lidar/src/broken_lidar_node.cpp | 8 +++++++- .../ros2_packages/fixed_lidar/src/fixed_lidar_node.cpp | 5 ++++- 2 files changed, 11 insertions(+), 2 deletions(-) diff --git a/demos/ota_nav2_sensor_fix/ros2_packages/broken_lidar/src/broken_lidar_node.cpp b/demos/ota_nav2_sensor_fix/ros2_packages/broken_lidar/src/broken_lidar_node.cpp index bc07fcd..e60ceb6 100644 --- a/demos/ota_nav2_sensor_fix/ros2_packages/broken_lidar/src/broken_lidar_node.cpp +++ b/demos/ota_nav2_sensor_fix/ros2_packages/broken_lidar/src/broken_lidar_node.cpp @@ -110,7 +110,13 @@ class BrokenLidarNode : public rclcpp::Node { req->description = "LaserScan ray index 180 reports a constant 1.0 m return " "(straight ahead). Nav2 traces it as a phantom obstacle " "and the controller cannot make progress."; - req->source_id = "scan_sensor_node"; + // Fully-qualified node name. 
Gateway's per-component / per-app fault + // aggregation in fault_handlers.cpp:filter_faults_by_sources does + // prefix-match against app.effective_fqn() ("/scan_sensor_node") and + // a bare "scan_sensor_node" never matches, so the fault appears in + // server-level /faults but is invisible from /components/turtlebot3 + // /faults and /apps/scan-sensor/faults (the panels Foxglove uses). + req->source_id = "/scan_sensor_node"; auto cb = [this, passed](rclcpp::Client::SharedFuture fut) { try { diff --git a/demos/ota_nav2_sensor_fix/ros2_packages/fixed_lidar/src/fixed_lidar_node.cpp b/demos/ota_nav2_sensor_fix/ros2_packages/fixed_lidar/src/fixed_lidar_node.cpp index dc48728..5d48d4d 100644 --- a/demos/ota_nav2_sensor_fix/ros2_packages/fixed_lidar/src/fixed_lidar_node.cpp +++ b/demos/ota_nav2_sensor_fix/ros2_packages/fixed_lidar/src/fixed_lidar_node.cpp @@ -75,7 +75,10 @@ class FixedLidarNode : public rclcpp::Node { req->event_type = ros2_medkit_msgs::srv::ReportFault::Request::EVENT_PASSED; req->severity = 0; req->description = "fixed_lidar took over scan_sensor_node - phantom returns no longer published."; - req->source_id = "scan_sensor_node"; + // Same FQN as broken_lidar so the EVENT_PASSED clears the fault on + // the same reporting source - see broken_lidar_node.cpp for why a + // bare name doesn't match the per-app/per-component aggregation. + req->source_id = "/scan_sensor_node"; auto cb = [this](rclcpp::Client::SharedFuture fut) { try {