From 69e88141e0dfa63f42941e08767a323da6136f27 Mon Sep 17 00:00:00 2001
From: ppraneth <pranethparuchuri@gmail.com>
Date: Tue, 16 Jun 2026 08:09:28 +0530
Subject: [PATCH 1/3] Add COSMOS scalarizer

---
 CHANGELOG.md                              |  4 ++
 docs/source/docs/scalarization/cosmos.rst |  7 ++
 docs/source/docs/scalarization/index.rst  |  1 +
 src/torchjd/scalarization/__init__.py     |  2 +
 src/torchjd/scalarization/_cosmos.py      | 74 +++++++++++++++++++++
 tests/unit/scalarization/test_cosmos.py   | 81 +++++++++++++++++++++++
 6 files changed, 169 insertions(+)
 create mode 100644 docs/source/docs/scalarization/cosmos.rst
 create mode 100644 src/torchjd/scalarization/_cosmos.py
 create mode 100644 tests/unit/scalarization/test_cosmos.py

diff --git a/CHANGELOG.md b/CHANGELOG.md
index 829e29ca..af56e41d 100644
--- a/CHANGELOG.md
+++ b/CHANGELOG.md
@@ -19,6 +19,10 @@ changelog does not include internal changes that do not affect the user.
   inner loop on a cross-batch matrix `A = J_1 @ J_2.T` (computed from two independent mini-batches
   using `autojac.jac`), with a direction-oriented regularizer pulling the descent direction toward
   a preference direction.
+- Added `COSMOS` from [Scalable Pareto Front Approximation for Deep Multi-Objective
+  Learning](https://arxiv.org/pdf/2103.13392) (ICDM 2021), a `Scalarizer` that combines a linear
+  scalarization with a cosine-similarity penalty pulling the vector of values toward a preference
+  direction.
 - Added `DWA` (Dynamic Weight Average) from [End-to-End Multi-Task Learning with
   Attention](https://openaccess.thecvf.com/content_CVPR_2019/papers/Liu_End-To-End_Multi-Task_Learning_With_Attention_CVPR_2019_paper.pdf)
   (CVPR 2019), a stateful `Scalarizer` that weights each value by the relative rate at which its
diff --git a/docs/source/docs/scalarization/cosmos.rst b/docs/source/docs/scalarization/cosmos.rst
new file mode 100644
index 00000000..9b3d9c1c
--- /dev/null
+++ b/docs/source/docs/scalarization/cosmos.rst
@@ -0,0 +1,7 @@
+:hide-toc:
+
+COSMOS
+======
+
+.. autoclass:: torchjd.scalarization.COSMOS
+    :members: __call__
diff --git a/docs/source/docs/scalarization/index.rst b/docs/source/docs/scalarization/index.rst
index d38708c0..76b98cd6 100644
--- a/docs/source/docs/scalarization/index.rst
+++ b/docs/source/docs/scalarization/index.rst
@@ -15,6 +15,7 @@ Abstract base class
     :maxdepth: 1
 
     constant.rst
+    cosmos.rst
     dwa.rst
     famo.rst
     geometric_mean.rst
diff --git a/src/torchjd/scalarization/__init__.py b/src/torchjd/scalarization/__init__.py
index f1d22029..a7a0c3fc 100644
--- a/src/torchjd/scalarization/__init__.py
+++ b/src/torchjd/scalarization/__init__.py
@@ -20,6 +20,7 @@
 """
 
 from ._constant import Constant
+from ._cosmos import COSMOS
 from ._dwa import DWA
 from ._famo import FAMO
 from ._geometric_mean import GeometricMean
@@ -33,6 +34,7 @@
 
 __all__ = [
     "Constant",
+    "COSMOS",
     "DWA",
     "FAMO",
     "GeometricMean",
diff --git a/src/torchjd/scalarization/_cosmos.py b/src/torchjd/scalarization/_cosmos.py
new file mode 100644
index 00000000..4371e08c
--- /dev/null
+++ b/src/torchjd/scalarization/_cosmos.py
@@ -0,0 +1,74 @@
+import torch
+from torch import Tensor
+
+from ._scalarizer_base import Scalarizer
+
+
+class COSMOS(Scalarizer):
+    r"""
+    :class:`~torchjd.scalarization.Scalarizer` that combines the input tensor of values using the
+    COSMOS scalarization, proposed in `Scalable Pareto Front Approximation for Deep Multi-Objective
+    Learning <https://arxiv.org/pdf/2103.13392>`_.
+
+    It returns a linear scalarization penalized by the cosine similarity between the values and the
+    preference vector:
+
+    .. math::
+        \sum_i r_i L_i - \lambda \frac{\sum_i r_i L_i}{\lVert r \rVert \, \lVert L \rVert},
+
+    where:
+
+    - :math:`L_i` is the :math:`i`-th input value (the :math:`i`-th objective);
+    - :math:`r_i` is its preference weight (the ``weights`` parameter);
+    - :math:`\lambda` is the cosine-similarity penalty coefficient (the ``lambda_`` parameter);
+    - the subtracted term is :math:`\lambda \cos(r, L)`, which rewards aligning the vector of values
+      with the preference direction and is what spreads the approximated Pareto front.
+
+    :param lambda_: The cosine-similarity penalty coefficient :math:`\lambda`. Must be non-negative.
+        A value of ``0`` reduces COSMOS to a plain linear scalarization. The paper uses values
+        ranging from ``0.01`` to ``8`` depending on the dataset, with no single best value.
+    :param weights: The preference vector :math:`r` applied to the values (in the paper, sampled on
+        the probability simplex). If ``None``, a uniform preference summing to one is used. If
+        provided, it must have the same shape as the values passed at call time.
+
+    .. note::
+        COSMOS divides by :math:`\lVert L \rVert`, so an all-zero vector of values produces ``nan``.
+        This is not enforced.
+
+    .. note::
+        The full COSMOS method also conditions the model on the preference vector by concatenating it
+        to the input; that is a modeling choice left to the user. This scalarizer only implements the
+        objective. The `libmoon <https://github.com/xzhang2523/libmoon>`_ reference normalizes the
+        linear term by :math:`\lVert r \rVert`; here the linear term is the raw weighted sum, as in
+        the paper and the official implementation.
+    """
+
+    def __init__(self, lambda_: float, weights: Tensor | None = None) -> None:
+        if lambda_ < 0.0:
+            raise ValueError(
+                f"Parameter `lambda_` should be non-negative. Found `lambda_ = {lambda_}`."
+            )
+
+        super().__init__()
+        self.lambda_ = lambda_
+        self.weights = weights
+
+    def forward(self, values: Tensor, /) -> Tensor:
+        if self.weights is not None and self.weights.shape != values.shape:
+            raise ValueError(
+                f"Parameter `weights` should have the same shape as `values`. Found "
+                f"`weights.shape = {tuple(self.weights.shape)}` and `values.shape = "
+                f"{tuple(values.shape)}`."
+            )
+
+        if self.weights is None:
+            weights = torch.full_like(values, 1.0 / values.numel())
+        else:
+            weights = self.weights
+
+        weighted_sum = (weights * values).sum()
+        cosine_similarity = weighted_sum / (weights.norm() * values.norm())
+        return weighted_sum - self.lambda_ * cosine_similarity
+
+    def __repr__(self) -> str:
+        return f"{self.__class__.__name__}(lambda_={self.lambda_}, weights={self.weights!r})"
diff --git a/tests/unit/scalarization/test_cosmos.py b/tests/unit/scalarization/test_cosmos.py
new file mode 100644
index 00000000..1e675b14
--- /dev/null
+++ b/tests/unit/scalarization/test_cosmos.py
@@ -0,0 +1,81 @@
+import torch
+from pytest import mark, raises
+from torch import Tensor
+from utils.tensors import tensor_
+
+from torchjd.scalarization import COSMOS
+
+from ._asserts import (
+    assert_grad_flow,
+    assert_permutation_invariant,
+    assert_returns_scalar,
+)
+from ._inputs import all_inputs
+
+
+def test_value_default() -> None:
+    # Uniform weights on equal values are perfectly aligned, so cos(r, L) = 1. The result is the
+    # weighted sum (1) minus lambda (1): 0.
+    out = COSMOS(lambda_=1.0)(tensor_([1.0, 1.0]))
+    torch.testing.assert_close(out, tensor_(0.0))
+
+
+def test_value_lambda_zero_is_linear_scalarization() -> None:
+    # With lambda = 0 there is no cosine penalty, so COSMOS is the (uniform) weighted sum.
+    out = COSMOS(lambda_=0.0)(tensor_([1.0, 2.0, 4.0]))
+    torch.testing.assert_close(out, tensor_(7.0 / 3.0))
+
+
+def test_value_with_weights() -> None:
+    # With lambda = 0, only the linear term remains: 2*3 + 1*4 = 10.
+    out = COSMOS(lambda_=0.0, weights=tensor_([2.0, 1.0]))(tensor_([3.0, 4.0]))
+    torch.testing.assert_close(out, tensor_(10.0))
+
+
+def test_full_formula() -> None:
+    values = tensor_([1.0, 2.0, 4.0])
+    weights = tensor_([0.5, 0.3, 0.2])
+    lambda_ = 2.0
+    weighted_sum = (weights * values).sum()
+    expected = weighted_sum - lambda_ * weighted_sum / (weights.norm() * values.norm())
+    torch.testing.assert_close(COSMOS(lambda_, weights=weights)(values), expected)
+
+
+@mark.parametrize("values", all_inputs)
+def test_expected_structure(values: Tensor) -> None:
+    assert_returns_scalar(COSMOS(lambda_=1.0), values)
+
+
+@mark.parametrize("values", all_inputs)
+def test_grad_flow(values: Tensor) -> None:
+    assert_grad_flow(COSMOS(lambda_=1.0), values)
+
+
+@mark.parametrize("values", all_inputs)
+def test_permutation_invariant(values: Tensor) -> None:
+    # With uniform weights, both the weighted sum and the cosine term are symmetric in the inputs.
+    assert_permutation_invariant(COSMOS(lambda_=1.0), values)
+
+
+def test_nan_for_all_zero_values() -> None:
+    # The cosine term divides by ||L||, so an all-zero vector of values produces nan.
+    out = COSMOS(lambda_=1.0)(tensor_([0.0, 0.0]))
+    assert out.isnan()
+
+
+@mark.parametrize("lambda_", [-1.0, -0.5])
+def test_raises_on_negative_lambda(lambda_: float) -> None:
+    with raises(ValueError):
+        COSMOS(lambda_=lambda_)
+
+
+def test_raises_on_weights_shape_mismatch() -> None:
+    scalarizer = COSMOS(lambda_=1.0, weights=tensor_([1.0, 1.0, 1.0]))
+    with raises(ValueError):
+        scalarizer(tensor_([1.0, 1.0]))
+
+
+def test_representations() -> None:
+    s = COSMOS(lambda_=0.5)
+    assert repr(s) == "COSMOS(lambda_=0.5, weights=None)"
+    assert str(s) == "COSMOS"

From b6f088a41bd40619aec3857f6bf615616d4bf79d Mon Sep 17 00:00:00 2001
From: ppraneth <pranethparuchuri@gmail.com>
Date: Wed, 17 Jun 2026 19:59:14 +0530
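Subject: [PATCH 2/3] COSMOS: require weights, use torch cosine_similarity

Make `weights` a required argument (the uniform default is removed) and
compute the penalty with `torch.nn.functional.cosine_similarity`, so an
all-zero vector of values now yields 0 instead of nan. Update the
docstring and the tests accordingly.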

---
 src/torchjd/scalarization/_cosmos.py    | 35 +++++++++-------------
 tests/unit/scalarization/test_cosmos.py | 39 +++++++++++++++----------
 2 files changed, 37 insertions(+), 37 deletions(-)

diff --git a/src/torchjd/scalarization/_cosmos.py b/src/torchjd/scalarization/_cosmos.py
index 4371e08c..be7bc3dc 100644
--- a/src/torchjd/scalarization/_cosmos.py
+++ b/src/torchjd/scalarization/_cosmos.py
@@ -1,5 +1,5 @@
-import torch
 from torch import Tensor
+from torch.nn.functional import cosine_similarity
 
 from ._scalarizer_base import Scalarizer
 
@@ -27,23 +27,21 @@ class COSMOS(Scalarizer):
     :param lambda_: The cosine-similarity penalty coefficient :math:`\lambda`. Must be non-negative.
         A value of ``0`` reduces COSMOS to a plain linear scalarization. The paper uses values
         ranging from ``0.01`` to ``8`` depending on the dataset, with no single best value.
-    :param weights: The preference vector :math:`r` applied to the values (in the paper, sampled on
-        the probability simplex). If ``None``, a uniform preference summing to one is used. If
-        provided, it must have the same shape as the values passed at call time.
-
-    .. note::
-        COSMOS divides by :math:`\lVert L \rVert`, so an all-zero vector of values produces ``nan``.
-        This is not enforced.
+    :param weights: The preference vector :math:`r` applied to the values. It must have the same
+        shape as the values passed at call time. To approximate the whole Pareto front rather than a
+        single trade-off, it should be re-sampled from a Dirichlet distribution and reassigned before
+        every call, as in the paper, e.g. for ``m`` objectives
+        ``cosmos.weights = torch.distributions.Dirichlet(torch.ones(m)).sample()`` (a uniform
+        distribution over the probability simplex; a concentration smaller than one spreads the
+        samples toward the corners of the simplex).
 
     .. note::
         The full COSMOS method also conditions the model on the preference vector by concatenating it
         to the input; that is a modeling choice left to the user. This scalarizer only implements the
-        objective. The `libmoon <https://github.com/xzhang2523/libmoon>`_ reference normalizes the
-        linear term by :math:`\lVert r \rVert`; here the linear term is the raw weighted sum, as in
-        the paper and the official implementation.
+        objective.
     """
 
-    def __init__(self, lambda_: float, weights: Tensor | None = None) -> None:
+    def __init__(self, lambda_: float, weights: Tensor) -> None:
         if lambda_ < 0.0:
             raise ValueError(
                 f"Parameter `lambda_` should be non-negative. Found `lambda_ = {lambda_}`."
@@ -54,21 +52,16 @@ def __init__(self, lambda_: float, weights: Tensor | None = None) -> None:
         self.weights = weights
 
     def forward(self, values: Tensor, /) -> Tensor:
-        if self.weights is not None and self.weights.shape != values.shape:
+        if self.weights.shape != values.shape:
             raise ValueError(
                 f"Parameter `weights` should have the same shape as `values`. Found "
                 f"`weights.shape = {tuple(self.weights.shape)}` and `values.shape = "
                 f"{tuple(values.shape)}`."
             )
 
-        if self.weights is None:
-            weights = torch.full_like(values, 1.0 / values.numel())
-        else:
-            weights = self.weights
-
-        weighted_sum = (weights * values).sum()
-        cosine_similarity = weighted_sum / (weights.norm() * values.norm())
-        return weighted_sum - self.lambda_ * cosine_similarity
+        weighted_sum = (self.weights * values).sum()
+        cosine = cosine_similarity(self.weights.flatten(), values.flatten(), dim=0)
+        return weighted_sum - self.lambda_ * cosine
 
     def __repr__(self) -> str:
         return f"{self.__class__.__name__}(lambda_={self.lambda_}, weights={self.weights!r})"
diff --git a/tests/unit/scalarization/test_cosmos.py b/tests/unit/scalarization/test_cosmos.py
index 1e675b14..d98369d1 100644
--- a/tests/unit/scalarization/test_cosmos.py
+++ b/tests/unit/scalarization/test_cosmos.py
@@ -1,6 +1,7 @@
 import torch
 from pytest import mark, raises
 from torch import Tensor
+from torch.nn.functional import cosine_similarity
 from utils.tensors import tensor_
 
 from torchjd.scalarization import COSMOS
@@ -13,16 +14,22 @@
 from ._inputs import all_inputs
 
 
-def test_value_default() -> None:
+def _uniform(values: Tensor) -> Tensor:
+    """Uniform preference vector matching the shape of `values`."""
+    return torch.full_like(values, 1.0 / values.numel())
+
+
+def test_value_aligned_gives_zero() -> None:
     # Uniform weights on equal values are perfectly aligned, so cos(r, L) = 1. The result is the
     # weighted sum (1) minus lambda (1): 0.
-    out = COSMOS(lambda_=1.0)(tensor_([1.0, 1.0]))
+    out = COSMOS(lambda_=1.0, weights=tensor_([0.5, 0.5]))(tensor_([1.0, 1.0]))
     torch.testing.assert_close(out, tensor_(0.0))
 
 
 def test_value_lambda_zero_is_linear_scalarization() -> None:
-    # With lambda = 0 there is no cosine penalty, so COSMOS is the (uniform) weighted sum.
-    out = COSMOS(lambda_=0.0)(tensor_([1.0, 2.0, 4.0]))
+    # With lambda = 0 there is no cosine penalty, so COSMOS is just the weighted sum.
+    weights = tensor_([1.0 / 3.0, 1.0 / 3.0, 1.0 / 3.0])
+    out = COSMOS(lambda_=0.0, weights=weights)(tensor_([1.0, 2.0, 4.0]))
     torch.testing.assert_close(out, tensor_(7.0 / 3.0))
 
 
@@ -36,37 +43,37 @@ def test_full_formula() -> None:
     values = tensor_([1.0, 2.0, 4.0])
     weights = tensor_([0.5, 0.3, 0.2])
     lambda_ = 2.0
-    weighted_sum = (weights * values).sum()
-    expected = weighted_sum - lambda_ * weighted_sum / (weights.norm() * values.norm())
+    expected = (weights * values).sum() - lambda_ * cosine_similarity(weights, values, dim=0)
     torch.testing.assert_close(COSMOS(lambda_, weights=weights)(values), expected)
 
 
 @mark.parametrize("values", all_inputs)
 def test_expected_structure(values: Tensor) -> None:
-    assert_returns_scalar(COSMOS(lambda_=1.0), values)
+    assert_returns_scalar(COSMOS(lambda_=1.0, weights=_uniform(values)), values)
 
 
 @mark.parametrize("values", all_inputs)
 def test_grad_flow(values: Tensor) -> None:
-    assert_grad_flow(COSMOS(lambda_=1.0), values)
+    assert_grad_flow(COSMOS(lambda_=1.0, weights=_uniform(values)), values)
 
 
 @mark.parametrize("values", all_inputs)
 def test_permutation_invariant(values: Tensor) -> None:
     # With uniform weights, both the weighted sum and the cosine term are symmetric in the inputs.
-    assert_permutation_invariant(COSMOS(lambda_=1.0), values)
+    assert_permutation_invariant(COSMOS(lambda_=1.0, weights=_uniform(values)), values)
 
 
-def test_nan_for_all_zero_values() -> None:
-    # The cosine term divides by ||L||, so an all-zero vector of values produces nan.
-    out = COSMOS(lambda_=1.0)(tensor_([0.0, 0.0]))
-    assert out.isnan()
+def test_zero_values_returns_zero() -> None:
+    # `cosine_similarity` is numerically stable for the zero vector, so all-zero values give 0 (no
+    # nan), regardless of lambda.
+    out = COSMOS(lambda_=1.0, weights=tensor_([0.5, 0.5]))(tensor_([0.0, 0.0]))
+    torch.testing.assert_close(out, tensor_(0.0))
 
 
 @mark.parametrize("lambda_", [-1.0, -0.5])
 def test_raises_on_negative_lambda(lambda_: float) -> None:
     with raises(ValueError):
-        COSMOS(lambda_=lambda_)
+        COSMOS(lambda_=lambda_, weights=tensor_([0.5, 0.5]))
 
 
 def test_raises_on_weights_shape_mismatch() -> None:
@@ -76,6 +83,6 @@ def test_raises_on_weights_shape_mismatch() -> None:
 
 
 def test_representations() -> None:
-    s = COSMOS(lambda_=0.5)
-    assert repr(s) == "COSMOS(lambda_=0.5, weights=None)"
+    s = COSMOS(lambda_=0.5, weights=torch.tensor([0.5, 0.5]))
+    assert repr(s) == "COSMOS(lambda_=0.5, weights=tensor([0.5000, 0.5000]))"
     assert str(s) == "COSMOS"

From 835f934bc2432337e4467610aa30be2b83753ba6 Mon Sep 17 00:00:00 2001
From: ppraneth <pranethparuchuri@gmail.com>
Date: Wed, 17 Jun 2026 21:12:27 +0530
Subject: [PATCH 3/3] Move COSMOS changelog entry to Unreleased

---
 CHANGELOG.md | 11 +++++++----
 1 file changed, 7 insertions(+), 4 deletions(-)

diff --git a/CHANGELOG.md b/CHANGELOG.md
index af56e41d..870f7949 100644
--- a/CHANGELOG.md
+++ b/CHANGELOG.md
@@ -8,6 +8,13 @@ changelog does not include internal changes that do not affect the user.
 
 ## [Unreleased]
 
+### Added
+
+- Added `COSMOS` from [Scalable Pareto Front Approximation for Deep Multi-Objective
+  Learning](https://arxiv.org/pdf/2103.13392) (ICDM 2021), a `Scalarizer` that combines a linear
+  scalarization with a cosine-similarity penalty pulling the vector of values toward a preference
+  direction.
+
 ## [0.15.0] - 2026-06-15
 
 ### Added
@@ -19,10 +26,6 @@ changelog does not include internal changes that do not affect the user.
   inner loop on a cross-batch matrix `A = J_1 @ J_2.T` (computed from two independent mini-batches
   using `autojac.jac`), with a direction-oriented regularizer pulling the descent direction toward
   a preference direction.
-- Added `COSMOS` from [Scalable Pareto Front Approximation for Deep Multi-Objective
-  Learning](https://arxiv.org/pdf/2103.13392) (ICDM 2021), a `Scalarizer` that combines a linear
-  scalarization with a cosine-similarity penalty pulling the vector of values toward a preference
-  direction.
 - Added `DWA` (Dynamic Weight Average) from [End-to-End Multi-Task Learning with
   Attention](https://openaccess.thecvf.com/content_CVPR_2019/papers/Liu_End-To-End_Multi-Task_Learning_With_Attention_CVPR_2019_paper.pdf)
   (CVPR 2019), a stateful `Scalarizer` that weights each value by the relative rate at which its