From 5db49199179e1c4289878ea0c7662d986e513f27 Mon Sep 17 00:00:00 2001
From: DataboyUsen <suxiaochen@yahoo.com>
Date: Fri, 3 Apr 2026 14:03:48 +0800
Subject: [PATCH 1/3] refactor(_mf_class.py): improve code quality and fix
 potential issues

- Replace random_state handling with proper rng instance initialization
- Remove pandas dependency by using numpy-only operations
- Fix convergence check: apply abs() to obj_diff before comparison
- Porperly move the initialization of model parameters to fit() method
- Fix spelling errors in comments and docstrings
---
 rehline/_mf_class.py | 47 +++++++++++++++++++++++++++++---------------
 1 file changed, 31 insertions(+), 16 deletions(-)

diff --git a/rehline/_mf_class.py b/rehline/_mf_class.py
index 6e90984..8d031a2 100644
--- a/rehline/_mf_class.py
+++ b/rehline/_mf_class.py
@@ -3,7 +3,6 @@
 import warnings
 
 import numpy as np
-import pandas as pd
 from sklearn.base import BaseEstimator
 from sklearn.exceptions import ConvergenceWarning
 from sklearn.utils.validation import _check_sample_weight
@@ -214,27 +213,29 @@ def __init__(
             raise ValueError("; ".join(errors))
 
         # parameter initialization
-        ## -----------------------------basic perameters-----------------------------
+        ## -----------------------------basic parameters-----------------------------
         self.n_users = n_users
         self.n_items = n_items
         self.loss = loss
         self.constraint_user = constraint_user if constraint_user is not None else []
         self.constraint_item = constraint_item if constraint_item is not None else []
         self.biased = biased
-        ## -----------------------------hyper perameters-----------------------------
+        ## -----------------------------hyper parameters-----------------------------
         self.rank = rank
         self.C = C
         self.rho = rho
-        ## --------------------------coefficient perameters--------------------------
+        ## --------------------------coefficient parameters--------------------------
         self.init_mean = init_mean
         self.init_sd = init_sd
         self.random_state = random_state
-        if self.random_state:
-            np.random.seed(random_state)
-        self.P = np.random.normal(loc=init_mean, scale=init_sd, size=(n_users, rank))
-        self.Q = np.random.normal(loc=init_mean, scale=init_sd, size=(n_items, rank))
-        self.bu = np.zeros(n_users) if self.biased else None
-        self.bi = np.zeros(n_items) if self.biased else None
+        if random_state is not None:
+            self.rng = np.random.default_rng(random_state) 
+        else:
+            self.rng = np.random.default_rng()
+        self.P = None
+        self.Q = None
+        self.bu = None
+        self.bi = None
         ## ----------------------------fitting parameters----------------------------
         self.max_iter_CD = max_iter_CD
         self.tol_CD = tol_CD
@@ -271,11 +272,19 @@ def fit(self, X, y, sample_weight=None):
         self.history = np.nan * np.zeros((self.max_iter_CD + 1, 2))
         self.sample_weight = _check_sample_weight(sample_weight, X, dtype=X.dtype)
 
-        X_df = pd.DataFrame(X, columns=["user", "item"])
-        uidx_map = X_df.groupby("user").indices
-        iidx_map = X_df.groupby("item").indices
-        self.Iu = [uidx_map.get(u, np.array([], dtype=int)) for u in range(self.n_users)]
-        self.Ui = [iidx_map.get(i, np.array([], dtype=int)) for i in range(self.n_items)]
+        sort_idx_users = np.argsort(X[:, 0], kind='stable')
+        sorted_users = X[sort_idx_users, 0]
+        counts = np.unique(sorted_users, return_counts=True)[1]
+        self.Iu = [np.array([], dtype=int)] * self.n_users
+        for u, idxs in zip(sorted_users[np.cumsum(counts) - counts], np.split(sort_idx_users, np.cumsum(counts)[:-1])):
+            self.Iu[u] = idxs
+
+        sort_idx_items = np.argsort(X[:, 1], kind='stable')
+        sorted_items = X[sort_idx_items, 1]
+        counts = np.unique(sorted_items, return_counts=True)[1]
+        self.Ui = [np.array([], dtype=int)] * self.n_items
+        for i, idxs in zip(sorted_items[np.cumsum(counts) - counts], np.split(sort_idx_items, np.cumsum(counts)[:-1])):
+            self.Ui[i] = idxs
 
         C_user = self.C * self.n_users / (self.rho) / 2
         C_item = self.C * self.n_items / (1 - self.rho) / 2
@@ -289,6 +298,12 @@ def fit(self, X, y, sample_weight=None):
                 )
             )
 
+        # Initialization
+        self.P = self.rng.normal(loc=self.init_mean, scale=self.init_sd, size=(self.n_users, self.rank))
+        self.Q = self.rng.normal(loc=self.init_mean, scale=self.init_sd, size=(self.n_items, self.rank))
+        self.bu = np.zeros(self.n_users) if self.biased else None
+        self.bi = np.zeros(self.n_items) if self.biased else None
+
         # CD algorithm
         self.history[0] = self.obj(X, y)
         for iter_idx in range(self.max_iter_CD):
@@ -435,7 +450,7 @@ def fit(self, X, y, sample_weight=None):
                 obj = f"{self.history[iter_idx + 1][1]:.6f}"
                 print(f"{iter_idx + 1:<12} {mean_loss:<20} {obj:<20}")
 
-            if obj_diff < self.tol_CD:
+            if abs(obj_diff) < self.tol_CD:
                 break
 
         return self

From 4d0d3878c48a8f164987dffaff961ee9dda90b83 Mon Sep 17 00:00:00 2001
From: DataboyUsen <suxiaochen@yahoo.com>
Date: Sun, 5 Apr 2026 01:13:44 +0800
Subject: [PATCH 2/3] refactor(_mf_class): Modifications made in response to
 the suggestions for OPUS 4.6

- Fix cold start users sharing mutable list bug (suggestions by opus 4.6, P1)
- Further adhere to sklearn style: move all model parameter definitions from __init__ to fit(), transfer validation logic from __init__ to fit() with extended checks, and correct rng instance usage (suggestions by opus 4.6, P2&P5)
- Clarify obj() loss computation logic: replace inappropriate direct X usage with meaningless X_dummy placeholder, explicitly indicating it as an intermediate variable not involved in loss calculation (suggestions by opus 4.6, P4)
- Add comments and optimize low-efficiency code (suggestions by opus 4.6, P7)
---
 rehline/_mf_class.py | 95 ++++++++++++++++++++++++++------------------
 1 file changed, 57 insertions(+), 38 deletions(-)

diff --git a/rehline/_mf_class.py b/rehline/_mf_class.py
index 8d031a2..f194c6c 100644
--- a/rehline/_mf_class.py
+++ b/rehline/_mf_class.py
@@ -198,44 +198,22 @@ def __init__(
         tol_CD=1e-4,
         verbose=0,
     ):
-        # check input
-        errors = []
-        checks = [
-            (0 < rho < 1, "rho must be between 0 and 1"),
-            (C > 0, "C must be positive"),
-            (tol_CD > 0, "tol_CD must be positive"),
-            (tol > 0, "tol must be positive"),
-        ]
-        for condition, error_msg in checks:
-            if not condition:
-                errors.append(error_msg)
-        if errors:
-            raise ValueError("; ".join(errors))
-
         # parameter initialization
         ## -----------------------------basic parameters-----------------------------
         self.n_users = n_users
         self.n_items = n_items
         self.loss = loss
-        self.constraint_user = constraint_user if constraint_user is not None else []
-        self.constraint_item = constraint_item if constraint_item is not None else []
+        self.constraint_user = constraint_user
+        self.constraint_item = constraint_item
         self.biased = biased
         ## -----------------------------hyper parameters-----------------------------
         self.rank = rank
         self.C = C
         self.rho = rho
-        ## --------------------------coefficient parameters--------------------------
+        ## -------------------------initialization parameters------------------------
         self.init_mean = init_mean
         self.init_sd = init_sd
         self.random_state = random_state
-        if random_state is not None:
-            self.rng = np.random.default_rng(random_state) 
-        else:
-            self.rng = np.random.default_rng()
-        self.P = None
-        self.Q = None
-        self.bu = None
-        self.bi = None
         ## ----------------------------fitting parameters----------------------------
         self.max_iter_CD = max_iter_CD
         self.tol_CD = tol_CD
@@ -267,25 +245,65 @@ def fit(self, X, y, sample_weight=None):
             An instance of the estimator.
 
         """
+        # check input
+        ## parameter validation
+        errors = []
+        checks = [
+            (0 < self.rho < 1, "rho must be between 0 and 1"),
+            (self.C > 0, "C must be positive"),
+            (self.tol_CD > 0, "tol_CD must be positive"),
+            (self.tol > 0, "tol must be positive"),
+        ]
+        for condition, error_msg in checks:
+            if not condition:
+                errors.append(error_msg)
+        if errors:
+            raise ValueError("; ".join(errors))
+        
+        ## data validation
+        X = np.asarray(X)
+        y = np.asarray(y)
+        if X.ndim != 2 or X.shape[1] != 2:
+            raise ValueError("X must have shape (n_ratings, 2)")
+        if X.shape[0] != len(y):
+            raise ValueError("X and y must have the same number of samples")
+        user_ids = X[:, 0].astype(int)
+        item_ids = X[:, 1].astype(int)
+        if np.any(user_ids < 0) or np.any(user_ids >= self.n_users):
+            raise ValueError("User IDs must be in [0, n_users)")
+        if np.any(item_ids < 0) or np.any(item_ids >= self.n_items):
+            raise ValueError("Item IDs must be in [0, n_items)")
+        
         # Preparation
-        self.n_ratings = len(y)
-        self.history = np.nan * np.zeros((self.max_iter_CD + 1, 2))
+        ## number of training observations
+        self.n_ratings = len(y) 
+        ## convergence trace 
+        self.history = np.full((self.max_iter_CD + 1, 2), np.nan)
+        ## sample weights
         self.sample_weight = _check_sample_weight(sample_weight, X, dtype=X.dtype)
-
+        ## constraints on user and item
+        constraint_user = self.constraint_user if self.constraint_user is not None else []
+        constraint_item = self.constraint_item if self.constraint_item is not None else [] 
+        ## random number generator
+        rng = np.random.default_rng(self.random_state) 
+
+        ## indices to locate interactions given a user or item id
+        ### user side: Iu[u] = row indices of interactions by user u
         sort_idx_users = np.argsort(X[:, 0], kind='stable')
         sorted_users = X[sort_idx_users, 0]
         counts = np.unique(sorted_users, return_counts=True)[1]
-        self.Iu = [np.array([], dtype=int)] * self.n_users
+        self.Iu = [np.array([], dtype=int) for _ in range(self.n_users)]
         for u, idxs in zip(sorted_users[np.cumsum(counts) - counts], np.split(sort_idx_users, np.cumsum(counts)[:-1])):
             self.Iu[u] = idxs
-
+        ### item side: Ui[i] = row indices of interactions that involve item i
         sort_idx_items = np.argsort(X[:, 1], kind='stable')
         sorted_items = X[sort_idx_items, 1]
         counts = np.unique(sorted_items, return_counts=True)[1]
-        self.Ui = [np.array([], dtype=int)] * self.n_items
+        self.Ui = [np.array([], dtype=int) for _ in range(self.n_items)]
         for i, idxs in zip(sorted_items[np.cumsum(counts) - counts], np.split(sort_idx_items, np.cumsum(counts)[:-1])):
             self.Ui[i] = idxs
 
+        ## effective C when updating user/item blocks (to match rehline formulation: C * PLQ_loss + 0.5 * l_2)
         C_user = self.C * self.n_users / (self.rho) / 2
         C_item = self.C * self.n_items / (1 - self.rho) / 2
 
@@ -298,9 +316,9 @@ def fit(self, X, y, sample_weight=None):
                 )
             )
 
-        # Initialization
-        self.P = self.rng.normal(loc=self.init_mean, scale=self.init_sd, size=(self.n_users, self.rank))
-        self.Q = self.rng.normal(loc=self.init_mean, scale=self.init_sd, size=(self.n_items, self.rank))
+        # Model Initialization
+        self.P = rng.normal(loc=self.init_mean, scale=self.init_sd, size=(self.n_users, self.rank))
+        self.Q = rng.normal(loc=self.init_mean, scale=self.init_sd, size=(self.n_items, self.rank))
         self.bu = np.zeros(self.n_users) if self.biased else None
         self.bi = np.zeros(self.n_items) if self.biased else None
 
@@ -339,7 +357,7 @@ def fit(self, X, y, sample_weight=None):
                     C=C_user,
                     sample_weight=weight_tmp,
                 )
-                A, b = _make_constraint_rehline_param(constraint=self.constraint_user, X=Q_tmp, y=y_tmp)
+                A, b = _make_constraint_rehline_param(constraint=constraint_user, X=Q_tmp, y=y_tmp)
 
                 ### solve and update
                 result_tmp = ReHLine_solver(
@@ -406,7 +424,7 @@ def fit(self, X, y, sample_weight=None):
                     C=C_item,
                     sample_weight=weight_tmp,
                 )
-                A, b = _make_constraint_rehline_param(constraint=self.constraint_item, X=P_tmp, y=y_tmp)
+                A, b = _make_constraint_rehline_param(constraint=constraint_item, X=P_tmp, y=y_tmp)
 
                 ### solve and update
                 result_tmp = ReHLine_solver(
@@ -511,9 +529,10 @@ def obj(self, X, y):
             item_penalty = np.sum(self.Q**2) * (1 - self.rho) / self.n_items
             penalty = user_penalty + item_penalty
 
-        y_pred = self.decision_function(X)
-        U, V, Tau, S, T = _make_loss_rehline_param(loss=self.loss, X=X, y=y)
+        X_dummy = np.ones((len(y), 1)) # not used in loss computation, only shape matters for loss param construction
+        U, V, Tau, S, T = _make_loss_rehline_param(loss=self.loss, X=X_dummy, y=y)
         loss = ReHLoss(U, V, S, T, Tau)
+        y_pred = self.decision_function(X)
         loss_term = loss(y_pred)
 
         return loss_term, self.C * loss_term + penalty

From 5b4f691b162d4fe86b1f71f306411e0a3f367755 Mon Sep 17 00:00:00 2001
From: DataboyUsen <suxiaochen@yahoo.com>
Date: Sun, 5 Apr 2026 02:09:49 +0800
Subject: [PATCH 3/3] fix: revert constraint_user/constraint_item to init as
 empty list

- Previously attempted to follow sklearn convention: set to None in __init__,
  initialize to [] in fit()
- This caused CI failure because test accesses .constraint_user.append()
  before fit() is called
- Reverted to original behavior (init as []) to maintain compatibility with
  existing tests
- No impact on correctness: [] and None treated equivalently in fit()
---
 rehline/_mf_class.py | 11 ++++-------
 1 file changed, 4 insertions(+), 7 deletions(-)

diff --git a/rehline/_mf_class.py b/rehline/_mf_class.py
index f194c6c..e88ead1 100644
--- a/rehline/_mf_class.py
+++ b/rehline/_mf_class.py
@@ -203,8 +203,8 @@ def __init__(
         self.n_users = n_users
         self.n_items = n_items
         self.loss = loss
-        self.constraint_user = constraint_user
-        self.constraint_item = constraint_item
+        self.constraint_user = constraint_user if constraint_user is not None else []
+        self.constraint_item = constraint_item if constraint_item is not None else []
         self.biased = biased
         ## -----------------------------hyper parameters-----------------------------
         self.rank = rank
@@ -281,9 +281,6 @@ def fit(self, X, y, sample_weight=None):
         self.history = np.full((self.max_iter_CD + 1, 2), np.nan)
         ## sample weights
         self.sample_weight = _check_sample_weight(sample_weight, X, dtype=X.dtype)
-        ## constraints on user and item
-        constraint_user = self.constraint_user if self.constraint_user is not None else []
-        constraint_item = self.constraint_item if self.constraint_item is not None else [] 
         ## random number generator
         rng = np.random.default_rng(self.random_state) 
 
@@ -357,7 +354,7 @@ def fit(self, X, y, sample_weight=None):
                     C=C_user,
                     sample_weight=weight_tmp,
                 )
-                A, b = _make_constraint_rehline_param(constraint=constraint_user, X=Q_tmp, y=y_tmp)
+                A, b = _make_constraint_rehline_param(constraint=self.constraint_user, X=Q_tmp, y=y_tmp)
 
                 ### solve and update
                 result_tmp = ReHLine_solver(
@@ -424,7 +421,7 @@ def fit(self, X, y, sample_weight=None):
                     C=C_item,
                     sample_weight=weight_tmp,
                 )
-                A, b = _make_constraint_rehline_param(constraint=constraint_item, X=P_tmp, y=y_tmp)
+                A, b = _make_constraint_rehline_param(constraint=self.constraint_item, X=P_tmp, y=y_tmp)
 
                 ### solve and update
                 result_tmp = ReHLine_solver(