From 69e88141e0dfa63f42941e08767a323da6136f27 Mon Sep 17 00:00:00 2001 From: ppraneth Date: Tue, 16 Jun 2026 08:09:28 +0530 Subject: [PATCH 1/3] add cosmos --- CHANGELOG.md | 4 ++ docs/source/docs/scalarization/cosmos.rst | 7 ++ docs/source/docs/scalarization/index.rst | 1 + src/torchjd/scalarization/__init__.py | 2 + src/torchjd/scalarization/_cosmos.py | 74 +++++++++++++++++++++ tests/unit/scalarization/test_cosmos.py | 81 +++++++++++++++++++++++ 6 files changed, 169 insertions(+) create mode 100644 docs/source/docs/scalarization/cosmos.rst create mode 100644 src/torchjd/scalarization/_cosmos.py create mode 100644 tests/unit/scalarization/test_cosmos.py diff --git a/CHANGELOG.md b/CHANGELOG.md index 829e29ca..af56e41d 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -19,6 +19,10 @@ changelog does not include internal changes that do not affect the user. inner loop on a cross-batch matrix `A = J_1 @ J_2.T` (computed from two independent mini-batches using `autojac.jac`), with a direction-oriented regularizer pulling the descent direction toward a preference direction. +- Added `COSMOS` from [Scalable Pareto Front Approximation for Deep Multi-Objective + Learning](https://arxiv.org/pdf/2103.13392) (ICDM 2021), a `Scalarizer` that combines a linear + scalarization with a cosine-similarity penalty pulling the vector of values toward a preference + direction. - Added `DWA` (Dynamic Weight Average) from [End-to-End Multi-Task Learning with Attention](https://openaccess.thecvf.com/content_CVPR_2019/papers/Liu_End-To-End_Multi-Task_Learning_With_Attention_CVPR_2019_paper.pdf) (CVPR 2019), a stateful `Scalarizer` that weights each value by the relative rate at which its diff --git a/docs/source/docs/scalarization/cosmos.rst b/docs/source/docs/scalarization/cosmos.rst new file mode 100644 index 00000000..9b3d9c1c --- /dev/null +++ b/docs/source/docs/scalarization/cosmos.rst @@ -0,0 +1,7 @@ +:hide-toc: + +COSMOS +====== + +.. 
autoclass:: torchjd.scalarization.COSMOS + :members: __call__ diff --git a/docs/source/docs/scalarization/index.rst b/docs/source/docs/scalarization/index.rst index d38708c0..76b98cd6 100644 --- a/docs/source/docs/scalarization/index.rst +++ b/docs/source/docs/scalarization/index.rst @@ -15,6 +15,7 @@ Abstract base class :maxdepth: 1 constant.rst + cosmos.rst dwa.rst famo.rst geometric_mean.rst diff --git a/src/torchjd/scalarization/__init__.py b/src/torchjd/scalarization/__init__.py index f1d22029..a7a0c3fc 100644 --- a/src/torchjd/scalarization/__init__.py +++ b/src/torchjd/scalarization/__init__.py @@ -20,6 +20,7 @@ """ from ._constant import Constant +from ._cosmos import COSMOS from ._dwa import DWA from ._famo import FAMO from ._geometric_mean import GeometricMean @@ -33,6 +34,7 @@ __all__ = [ "Constant", + "COSMOS", "DWA", "FAMO", "GeometricMean", diff --git a/src/torchjd/scalarization/_cosmos.py b/src/torchjd/scalarization/_cosmos.py new file mode 100644 index 00000000..4371e08c --- /dev/null +++ b/src/torchjd/scalarization/_cosmos.py @@ -0,0 +1,74 @@ +import torch +from torch import Tensor + +from ._scalarizer_base import Scalarizer + + +class COSMOS(Scalarizer): + r""" + :class:`~torchjd.scalarization.Scalarizer` that combines the input tensor of values using the + COSMOS scalarization, proposed in `Scalable Pareto Front Approximation for Deep Multi-Objective + Learning <https://arxiv.org/pdf/2103.13392>`_. + + It returns a linear scalarization penalized by the cosine similarity between the values and the + preference vector: + + ..
math:: + \sum_i r_i L_i - \lambda \frac{\sum_i r_i L_i}{\lVert r \rVert \, \lVert L \rVert}, + + where: + + - :math:`L_i` is the :math:`i`-th input value (the :math:`i`-th objective); + - :math:`r_i` is its preference weight (the ``weights`` parameter); + - :math:`\lambda` is the cosine-similarity penalty coefficient (the ``lambda_`` parameter); + - the subtracted term is :math:`\lambda \cos(r, L)`, which rewards aligning the vector of values + with the preference direction and is what spreads the approximated Pareto front. + + :param lambda_: The cosine-similarity penalty coefficient :math:`\lambda`. Must be non-negative. + A value of ``0`` reduces COSMOS to a plain linear scalarization. The paper uses values + ranging from ``0.01`` to ``8`` depending on the dataset, with no single best value. + :param weights: The preference vector :math:`r` applied to the values (in the paper, sampled on + the probability simplex). If ``None``, a uniform preference summing to one is used. If + provided, it must have the same shape as the values passed at call time. + + .. note:: + COSMOS divides by :math:`\lVert L \rVert`, so an all-zero vector of values produces ``nan``. + This is not enforced. + + .. note:: + The full COSMOS method also conditions the model on the preference vector by concatenating it + to the input; that is a modeling choice left to the user. This scalarizer only implements the + objective. The `libmoon `_ reference normalizes the + linear term by :math:`\lVert r \rVert`; here the linear term is the raw weighted sum, as in + the paper and the official implementation. + """ + + def __init__(self, lambda_: float, weights: Tensor | None = None) -> None: + if lambda_ < 0.0: + raise ValueError( + f"Parameter `lambda_` should be non-negative. Found `lambda_ = {lambda_}`." 
+ ) + + super().__init__() + self.lambda_ = lambda_ + self.weights = weights + + def forward(self, values: Tensor, /) -> Tensor: + if self.weights is not None and self.weights.shape != values.shape: + raise ValueError( + f"Parameter `weights` should have the same shape as `values`. Found " + f"`weights.shape = {tuple(self.weights.shape)}` and `values.shape = " + f"{tuple(values.shape)}`." + ) + + if self.weights is None: + weights = torch.full_like(values, 1.0 / values.numel()) + else: + weights = self.weights + + weighted_sum = (weights * values).sum() + cosine_similarity = weighted_sum / (weights.norm() * values.norm()) + return weighted_sum - self.lambda_ * cosine_similarity + + def __repr__(self) -> str: + return f"{self.__class__.__name__}(lambda_={self.lambda_}, weights={self.weights!r})" diff --git a/tests/unit/scalarization/test_cosmos.py b/tests/unit/scalarization/test_cosmos.py new file mode 100644 index 00000000..1e675b14 --- /dev/null +++ b/tests/unit/scalarization/test_cosmos.py @@ -0,0 +1,81 @@ +import torch +from pytest import mark, raises +from torch import Tensor +from utils.tensors import tensor_ + +from torchjd.scalarization import COSMOS + +from ._asserts import ( + assert_grad_flow, + assert_permutation_invariant, + assert_returns_scalar, +) +from ._inputs import all_inputs + + +def test_value_default() -> None: + # Uniform weights on equal values are perfectly aligned, so cos(r, L) = 1. The result is the + # weighted sum (1) minus lambda (1): 0. + out = COSMOS(lambda_=1.0)(tensor_([1.0, 1.0])) + torch.testing.assert_close(out, tensor_(0.0)) + + +def test_value_lambda_zero_is_linear_scalarization() -> None: + # With lambda = 0 there is no cosine penalty, so COSMOS is the (uniform) weighted sum. + out = COSMOS(lambda_=0.0)(tensor_([1.0, 2.0, 4.0])) + torch.testing.assert_close(out, tensor_(7.0 / 3.0)) + + +def test_value_with_weights() -> None: + # With lambda = 0, only the linear term remains: 2*3 + 1*4 = 10. 
+ out = COSMOS(lambda_=0.0, weights=tensor_([2.0, 1.0]))(tensor_([3.0, 4.0])) + torch.testing.assert_close(out, tensor_(10.0)) + + +def test_full_formula() -> None: + values = tensor_([1.0, 2.0, 4.0]) + weights = tensor_([0.5, 0.3, 0.2]) + lambda_ = 2.0 + weighted_sum = (weights * values).sum() + expected = weighted_sum - lambda_ * weighted_sum / (weights.norm() * values.norm()) + torch.testing.assert_close(COSMOS(lambda_, weights=weights)(values), expected) + + +@mark.parametrize("values", all_inputs) +def test_expected_structure(values: Tensor) -> None: + assert_returns_scalar(COSMOS(lambda_=1.0), values) + + +@mark.parametrize("values", all_inputs) +def test_grad_flow(values: Tensor) -> None: + assert_grad_flow(COSMOS(lambda_=1.0), values) + + +@mark.parametrize("values", all_inputs) +def test_permutation_invariant(values: Tensor) -> None: + # With uniform weights, both the weighted sum and the cosine term are symmetric in the inputs. + assert_permutation_invariant(COSMOS(lambda_=1.0), values) + + +def test_nan_for_all_zero_values() -> None: + # The cosine term divides by ||L||, so an all-zero vector of values produces nan. 
+ out = COSMOS(lambda_=1.0)(tensor_([0.0, 0.0])) + assert out.isnan() + + +@mark.parametrize("lambda_", [-1.0, -0.5]) +def test_raises_on_negative_lambda(lambda_: float) -> None: + with raises(ValueError): + COSMOS(lambda_=lambda_) + + +def test_raises_on_weights_shape_mismatch() -> None: + scalarizer = COSMOS(lambda_=1.0, weights=tensor_([1.0, 1.0, 1.0])) + with raises(ValueError): + scalarizer(tensor_([1.0, 1.0])) + + +def test_representations() -> None: + s = COSMOS(lambda_=0.5) + assert repr(s) == "COSMOS(lambda_=0.5, weights=None)" + assert str(s) == "COSMOS" From b6f088a41bd40619aec3857f6bf615616d4bf79d Mon Sep 17 00:00:00 2001 From: ppraneth Date: Wed, 17 Jun 2026 19:59:14 +0530 Subject: [PATCH 2/3] minor fixes --- src/torchjd/scalarization/_cosmos.py | 35 +++++++++------------- tests/unit/scalarization/test_cosmos.py | 39 +++++++++++++++---------- 2 files changed, 37 insertions(+), 37 deletions(-) diff --git a/src/torchjd/scalarization/_cosmos.py b/src/torchjd/scalarization/_cosmos.py index 4371e08c..be7bc3dc 100644 --- a/src/torchjd/scalarization/_cosmos.py +++ b/src/torchjd/scalarization/_cosmos.py @@ -1,5 +1,5 @@ -import torch from torch import Tensor +from torch.nn.functional import cosine_similarity from ._scalarizer_base import Scalarizer @@ -27,23 +27,21 @@ class COSMOS(Scalarizer): :param lambda_: The cosine-similarity penalty coefficient :math:`\lambda`. Must be non-negative. A value of ``0`` reduces COSMOS to a plain linear scalarization. The paper uses values ranging from ``0.01`` to ``8`` depending on the dataset, with no single best value. - :param weights: The preference vector :math:`r` applied to the values (in the paper, sampled on - the probability simplex). If ``None``, a uniform preference summing to one is used. If - provided, it must have the same shape as the values passed at call time. - - .. note:: - COSMOS divides by :math:`\lVert L \rVert`, so an all-zero vector of values produces ``nan``. - This is not enforced. 
+ :param weights: The preference vector :math:`r` applied to the values. It must have the same + shape as the values passed at call time. To approximate the whole Pareto front rather than a + single trade-off, it should be re-sampled from a Dirichlet distribution and reassigned before + every call, as in the paper, e.g. for ``m`` objectives + ``cosmos.weights = torch.distributions.Dirichlet(torch.ones(m)).sample()`` (a uniform + distribution over the probability simplex; a concentration smaller than one spreads the + samples toward the corners of the simplex). .. note:: The full COSMOS method also conditions the model on the preference vector by concatenating it to the input; that is a modeling choice left to the user. This scalarizer only implements the - objective. The `libmoon `_ reference normalizes the - linear term by :math:`\lVert r \rVert`; here the linear term is the raw weighted sum, as in - the paper and the official implementation. + objective. """ - def __init__(self, lambda_: float, weights: Tensor | None = None) -> None: + def __init__(self, lambda_: float, weights: Tensor) -> None: if lambda_ < 0.0: raise ValueError( f"Parameter `lambda_` should be non-negative. Found `lambda_ = {lambda_}`." @@ -54,21 +52,16 @@ def __init__(self, lambda_: float, weights: Tensor | None = None) -> None: self.weights = weights def forward(self, values: Tensor, /) -> Tensor: - if self.weights is not None and self.weights.shape != values.shape: + if self.weights.shape != values.shape: raise ValueError( f"Parameter `weights` should have the same shape as `values`. Found " f"`weights.shape = {tuple(self.weights.shape)}` and `values.shape = " f"{tuple(values.shape)}`." 
) - if self.weights is None: - weights = torch.full_like(values, 1.0 / values.numel()) - else: - weights = self.weights - - weighted_sum = (weights * values).sum() - cosine_similarity = weighted_sum / (weights.norm() * values.norm()) - return weighted_sum - self.lambda_ * cosine_similarity + weighted_sum = (self.weights * values).sum() + cosine = cosine_similarity(self.weights.flatten(), values.flatten(), dim=0) + return weighted_sum - self.lambda_ * cosine def __repr__(self) -> str: return f"{self.__class__.__name__}(lambda_={self.lambda_}, weights={self.weights!r})" diff --git a/tests/unit/scalarization/test_cosmos.py b/tests/unit/scalarization/test_cosmos.py index 1e675b14..d98369d1 100644 --- a/tests/unit/scalarization/test_cosmos.py +++ b/tests/unit/scalarization/test_cosmos.py @@ -1,6 +1,7 @@ import torch from pytest import mark, raises from torch import Tensor +from torch.nn.functional import cosine_similarity from utils.tensors import tensor_ from torchjd.scalarization import COSMOS @@ -13,16 +14,22 @@ from ._inputs import all_inputs -def test_value_default() -> None: +def _uniform(values: Tensor) -> Tensor: + """Uniform preference vector matching the shape of `values`.""" + return torch.full_like(values, 1.0 / values.numel()) + + +def test_value_aligned_gives_zero() -> None: # Uniform weights on equal values are perfectly aligned, so cos(r, L) = 1. The result is the # weighted sum (1) minus lambda (1): 0. - out = COSMOS(lambda_=1.0)(tensor_([1.0, 1.0])) + out = COSMOS(lambda_=1.0, weights=tensor_([0.5, 0.5]))(tensor_([1.0, 1.0])) torch.testing.assert_close(out, tensor_(0.0)) def test_value_lambda_zero_is_linear_scalarization() -> None: - # With lambda = 0 there is no cosine penalty, so COSMOS is the (uniform) weighted sum. - out = COSMOS(lambda_=0.0)(tensor_([1.0, 2.0, 4.0])) + # With lambda = 0 there is no cosine penalty, so COSMOS is just the weighted sum. 
+ weights = tensor_([1.0 / 3.0, 1.0 / 3.0, 1.0 / 3.0]) + out = COSMOS(lambda_=0.0, weights=weights)(tensor_([1.0, 2.0, 4.0])) torch.testing.assert_close(out, tensor_(7.0 / 3.0)) @@ -36,37 +43,37 @@ def test_full_formula() -> None: values = tensor_([1.0, 2.0, 4.0]) weights = tensor_([0.5, 0.3, 0.2]) lambda_ = 2.0 - weighted_sum = (weights * values).sum() - expected = weighted_sum - lambda_ * weighted_sum / (weights.norm() * values.norm()) + expected = (weights * values).sum() - lambda_ * cosine_similarity(weights, values, dim=0) torch.testing.assert_close(COSMOS(lambda_, weights=weights)(values), expected) @mark.parametrize("values", all_inputs) def test_expected_structure(values: Tensor) -> None: - assert_returns_scalar(COSMOS(lambda_=1.0), values) + assert_returns_scalar(COSMOS(lambda_=1.0, weights=_uniform(values)), values) @mark.parametrize("values", all_inputs) def test_grad_flow(values: Tensor) -> None: - assert_grad_flow(COSMOS(lambda_=1.0), values) + assert_grad_flow(COSMOS(lambda_=1.0, weights=_uniform(values)), values) @mark.parametrize("values", all_inputs) def test_permutation_invariant(values: Tensor) -> None: # With uniform weights, both the weighted sum and the cosine term are symmetric in the inputs. - assert_permutation_invariant(COSMOS(lambda_=1.0), values) + assert_permutation_invariant(COSMOS(lambda_=1.0, weights=_uniform(values)), values) -def test_nan_for_all_zero_values() -> None: - # The cosine term divides by ||L||, so an all-zero vector of values produces nan. - out = COSMOS(lambda_=1.0)(tensor_([0.0, 0.0])) - assert out.isnan() +def test_zero_values_returns_zero() -> None: + # `cosine_similarity` is numerically stable for the zero vector, so all-zero values give 0 (no + # nan), regardless of lambda. 
+ out = COSMOS(lambda_=1.0, weights=tensor_([0.5, 0.5]))(tensor_([0.0, 0.0])) + torch.testing.assert_close(out, tensor_(0.0)) @mark.parametrize("lambda_", [-1.0, -0.5]) def test_raises_on_negative_lambda(lambda_: float) -> None: with raises(ValueError): - COSMOS(lambda_=lambda_) + COSMOS(lambda_=lambda_, weights=tensor_([0.5, 0.5])) def test_raises_on_weights_shape_mismatch() -> None: @@ -76,6 +83,6 @@ def test_raises_on_weights_shape_mismatch() -> None: def test_representations() -> None: - s = COSMOS(lambda_=0.5) - assert repr(s) == "COSMOS(lambda_=0.5, weights=None)" + s = COSMOS(lambda_=0.5, weights=torch.tensor([0.5, 0.5])) + assert repr(s) == "COSMOS(lambda_=0.5, weights=tensor([0.5000, 0.5000]))" assert str(s) == "COSMOS" From 835f934bc2432337e4467610aa30be2b83753ba6 Mon Sep 17 00:00:00 2001 From: ppraneth Date: Wed, 17 Jun 2026 21:12:27 +0530 Subject: [PATCH 3/3] minor fixes changelog --- CHANGELOG.md | 11 +++++++---- 1 file changed, 7 insertions(+), 4 deletions(-) diff --git a/CHANGELOG.md b/CHANGELOG.md index af56e41d..870f7949 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -8,6 +8,13 @@ changelog does not include internal changes that do not affect the user. ## [Unreleased] +### Added + +- Added `COSMOS` from [Scalable Pareto Front Approximation for Deep Multi-Objective + Learning](https://arxiv.org/pdf/2103.13392) (ICDM 2021), a `Scalarizer` that combines a linear + scalarization with a cosine-similarity penalty pulling the vector of values toward a preference + direction. + ## [0.15.0] - 2026-06-15 ### Added @@ -19,10 +26,6 @@ changelog does not include internal changes that do not affect the user. inner loop on a cross-batch matrix `A = J_1 @ J_2.T` (computed from two independent mini-batches using `autojac.jac`), with a direction-oriented regularizer pulling the descent direction toward a preference direction. 
-- Added `COSMOS` from [Scalable Pareto Front Approximation for Deep Multi-Objective - Learning](https://arxiv.org/pdf/2103.13392) (ICDM 2021), a `Scalarizer` that combines a linear - scalarization with a cosine-similarity penalty pulling the vector of values toward a preference - direction. - Added `DWA` (Dynamic Weight Average) from [End-to-End Multi-Task Learning with Attention](https://openaccess.thecvf.com/content_CVPR_2019/papers/Liu_End-To-End_Multi-Task_Learning_With_Attention_CVPR_2019_paper.pdf) (CVPR 2019), a stateful `Scalarizer` that weights each value by the relative rate at which its