From 5db49199179e1c4289878ea0c7662d986e513f27 Mon Sep 17 00:00:00 2001 From: DataboyUsen Date: Fri, 3 Apr 2026 14:03:48 +0800 Subject: [PATCH 1/3] refactor(_mf_class.py): improve code quality and fix potential issues - Replace random_state handling with proper rng instance initialization - Remove pandas dependency by using numpy-only operations - Fix convergence check: apply abs() to obj_diff before comparison - Porperly move the initialization of model parameters to fit() method - Fix spelling errors in comments and docstrings --- rehline/_mf_class.py | 47 +++++++++++++++++++++++++++++--------------- 1 file changed, 31 insertions(+), 16 deletions(-) diff --git a/rehline/_mf_class.py b/rehline/_mf_class.py index 6e90984..8d031a2 100644 --- a/rehline/_mf_class.py +++ b/rehline/_mf_class.py @@ -3,7 +3,6 @@ import warnings import numpy as np -import pandas as pd from sklearn.base import BaseEstimator from sklearn.exceptions import ConvergenceWarning from sklearn.utils.validation import _check_sample_weight @@ -214,27 +213,29 @@ def __init__( raise ValueError("; ".join(errors)) # parameter initialization - ## -----------------------------basic perameters----------------------------- + ## -----------------------------basic parameters----------------------------- self.n_users = n_users self.n_items = n_items self.loss = loss self.constraint_user = constraint_user if constraint_user is not None else [] self.constraint_item = constraint_item if constraint_item is not None else [] self.biased = biased - ## -----------------------------hyper perameters----------------------------- + ## -----------------------------hyper parameters----------------------------- self.rank = rank self.C = C self.rho = rho - ## --------------------------coefficient perameters-------------------------- + ## --------------------------coefficient parameters-------------------------- self.init_mean = init_mean self.init_sd = init_sd self.random_state = random_state - if self.random_state: - np.random.seed(random_state) - self.P = np.random.normal(loc=init_mean, scale=init_sd, size=(n_users, rank)) - self.Q = np.random.normal(loc=init_mean, scale=init_sd, size=(n_items, rank)) - self.bu = np.zeros(n_users) if self.biased else None - self.bi = np.zeros(n_items) if self.biased else None + if random_state is not None: + self.rng = np.random.default_rng(random_state) + else: + self.rng = np.random.default_rng() + self.P = None + self.Q = None + self.bu = None + self.bi = None ## ----------------------------fitting parameters---------------------------- self.max_iter_CD = max_iter_CD self.tol_CD = tol_CD @@ -271,11 +272,19 @@ def fit(self, X, y, sample_weight=None): self.history = np.nan * np.zeros((self.max_iter_CD + 1, 2)) self.sample_weight = _check_sample_weight(sample_weight, X, dtype=X.dtype) - X_df = pd.DataFrame(X, columns=["user", "item"]) - uidx_map = X_df.groupby("user").indices - iidx_map = X_df.groupby("item").indices - self.Iu = [uidx_map.get(u, np.array([], dtype=int)) for u in range(self.n_users)] - self.Ui = [iidx_map.get(i, np.array([], dtype=int)) for i in range(self.n_items)] + sort_idx_users = np.argsort(X[:, 0], kind='stable') + sorted_users = X[sort_idx_users, 0] + counts = np.unique(sorted_users, return_counts=True)[1] + self.Iu = [np.array([], dtype=int)] * self.n_users + for u, idxs in zip(sorted_users[np.cumsum(counts) - counts], np.split(sort_idx_users, np.cumsum(counts)[:-1])): + self.Iu[u] = idxs + + sort_idx_items = np.argsort(X[:, 1], kind='stable') + sorted_items = X[sort_idx_items, 1] + counts = np.unique(sorted_items, return_counts=True)[1] + self.Ui = [np.array([], dtype=int)] * self.n_items + for i, idxs in zip(sorted_items[np.cumsum(counts) - counts], np.split(sort_idx_items, np.cumsum(counts)[:-1])): + self.Ui[i] = idxs C_user = self.C * self.n_users / (self.rho) / 2 C_item = self.C * self.n_items / (1 - self.rho) / 2 @@ -289,6 +298,12 @@ def fit(self, X, y, sample_weight=None): ) ) + # Initialization + self.P = self.rng.normal(loc=self.init_mean, scale=self.init_sd, size=(self.n_users, self.rank)) + self.Q = self.rng.normal(loc=self.init_mean, scale=self.init_sd, size=(self.n_items, self.rank)) + self.bu = np.zeros(self.n_users) if self.biased else None + self.bi = np.zeros(self.n_items) if self.biased else None + # CD algorithm self.history[0] = self.obj(X, y) for iter_idx in range(self.max_iter_CD): @@ -435,7 +450,7 @@ def fit(self, X, y, sample_weight=None): obj = f"{self.history[iter_idx + 1][1]:.6f}" print(f"{iter_idx + 1:<12} {mean_loss:<20} {obj:<20}") - if obj_diff < self.tol_CD: + if abs(obj_diff) < self.tol_CD: break return self From 4d0d3878c48a8f164987dffaff961ee9dda90b83 Mon Sep 17 00:00:00 2001 From: DataboyUsen Date: Sun, 5 Apr 2026 01:13:44 +0800 Subject: [PATCH 2/3] refactor(_mf_class): Modifications made in response to the suggestions for OPUS 4.6 - Fix cold start users sharing mutable list bug (suggestions by opus 4.6, P1) - Further adhere to sklearn style: move all model parameter definitions from __init__ to fit(), transfer validation logic from __init__ to fit() with extended checks, and correct rng instance usage (suggestions by opus 4.6, P2&P5) - Clarify obj() loss computation logic: replace inappropriate direct X usage with meaningless X_dummy placeholder, explicitly indicating it as an intermediate variable not involved in loss calculation (suggestions by opus 4.6, P4) - Add comments and optimize low-efficiency code (suggestions by opus 4.6, P7) --- rehline/_mf_class.py | 95 ++++++++++++++++++++++++++------------------ 1 file changed, 57 insertions(+), 38 deletions(-) diff --git a/rehline/_mf_class.py b/rehline/_mf_class.py index 8d031a2..f194c6c 100644 --- a/rehline/_mf_class.py +++ b/rehline/_mf_class.py @@ -198,44 +198,22 @@ def __init__( tol_CD=1e-4, verbose=0, ): - # check input - errors = [] - checks = [ - (0 < rho < 1, "rho must be between 0 and 1"), - (C > 0, "C must be positive"), - (tol_CD > 0, "tol_CD must be positive"), - (tol > 0, "tol must be positive"), - ] - for condition, error_msg in checks: - if not condition: - errors.append(error_msg) - if errors: - raise ValueError("; ".join(errors)) - # parameter initialization ## -----------------------------basic parameters----------------------------- self.n_users = n_users self.n_items = n_items self.loss = loss - self.constraint_user = constraint_user if constraint_user is not None else [] - self.constraint_item = constraint_item if constraint_item is not None else [] + self.constraint_user = constraint_user + self.constraint_item = constraint_item self.biased = biased ## -----------------------------hyper parameters----------------------------- self.rank = rank self.C = C self.rho = rho - ## --------------------------coefficient parameters-------------------------- + ## -------------------------initialization parameters------------------------ self.init_mean = init_mean self.init_sd = init_sd self.random_state = random_state - if random_state is not None: - self.rng = np.random.default_rng(random_state) - else: - self.rng = np.random.default_rng() - self.P = None - self.Q = None - self.bu = None - self.bi = None ## ----------------------------fitting parameters---------------------------- self.max_iter_CD = max_iter_CD self.tol_CD = tol_CD @@ -267,25 +245,65 @@ def fit(self, X, y, sample_weight=None): An instance of the estimator. """ + # check input + ## parameter validation + errors = [] + checks = [ + (0 < self.rho < 1, "rho must be between 0 and 1"), + (self.C > 0, "C must be positive"), + (self.tol_CD > 0, "tol_CD must be positive"), + (self.tol > 0, "tol must be positive"), + ] + for condition, error_msg in checks: + if not condition: + errors.append(error_msg) + if errors: + raise ValueError("; ".join(errors)) + + ## data validation + X = np.asarray(X) + y = np.asarray(y) + if X.ndim != 2 or X.shape[1] != 2: + raise ValueError("X must have shape (n_ratings, 2)") + if X.shape[0] != len(y): + raise ValueError("X and y must have the same number of samples") + user_ids = X[:, 0].astype(int) + item_ids = X[:, 1].astype(int) + if np.any(user_ids < 0) or np.any(user_ids >= self.n_users): + raise ValueError("User IDs must be in [0, n_users)") + if np.any(item_ids < 0) or np.any(item_ids >= self.n_items): + raise ValueError("Item IDs must be in [0, n_items)") + # Preparation - self.n_ratings = len(y) - self.history = np.nan * np.zeros((self.max_iter_CD + 1, 2)) + ## number of training observations + self.n_ratings = len(y) + ## convergence trace + self.history = np.full((self.max_iter_CD + 1, 2), np.nan) + ## sample weights self.sample_weight = _check_sample_weight(sample_weight, X, dtype=X.dtype) - + ## constraints on user and item + constraint_user = self.constraint_user if self.constraint_user is not None else [] + constraint_item = self.constraint_item if self.constraint_item is not None else [] + ## random number generator + rng = np.random.default_rng(self.random_state) + + ## indices to locate interactions given a user or item id + ### user side: Iu[u] = row indices of interactions by user u sort_idx_users = np.argsort(X[:, 0], kind='stable') sorted_users = X[sort_idx_users, 0] counts = np.unique(sorted_users, return_counts=True)[1] - self.Iu = [np.array([], dtype=int)] * self.n_users + self.Iu = [np.array([], dtype=int) for _ in range(self.n_users)] for u, idxs in zip(sorted_users[np.cumsum(counts) - counts], np.split(sort_idx_users, np.cumsum(counts)[:-1])): self.Iu[u] = idxs - + ### item side: Ui[i] = row indices of interactions that involve item i sort_idx_items = np.argsort(X[:, 1], kind='stable') sorted_items = X[sort_idx_items, 1] counts = np.unique(sorted_items, return_counts=True)[1] - self.Ui = [np.array([], dtype=int)] * self.n_items + self.Ui = [np.array([], dtype=int) for _ in range(self.n_items)] for i, idxs in zip(sorted_items[np.cumsum(counts) - counts], np.split(sort_idx_items, np.cumsum(counts)[:-1])): self.Ui[i] = idxs + ## effective C when updating user/item blocks (to match rehline formulation: C * PLQ_loss + 0.5 * l_2) C_user = self.C * self.n_users / (self.rho) / 2 C_item = self.C * self.n_items / (1 - self.rho) / 2 @@ -298,9 +316,9 @@ def fit(self, X, y, sample_weight=None): ) ) - # Initialization - self.P = self.rng.normal(loc=self.init_mean, scale=self.init_sd, size=(self.n_users, self.rank)) - self.Q = self.rng.normal(loc=self.init_mean, scale=self.init_sd, size=(self.n_items, self.rank)) + # Model Initialization + self.P = rng.normal(loc=self.init_mean, scale=self.init_sd, size=(self.n_users, self.rank)) + self.Q = rng.normal(loc=self.init_mean, scale=self.init_sd, size=(self.n_items, self.rank)) self.bu = np.zeros(self.n_users) if self.biased else None self.bi = np.zeros(self.n_items) if self.biased else None @@ -339,7 +357,7 @@ def fit(self, X, y, sample_weight=None): C=C_user, sample_weight=weight_tmp, ) - A, b = _make_constraint_rehline_param(constraint=self.constraint_user, X=Q_tmp, y=y_tmp) + A, b = _make_constraint_rehline_param(constraint=constraint_user, X=Q_tmp, y=y_tmp) ### solve and update result_tmp = ReHLine_solver( @@ -406,7 +424,7 @@ def fit(self, X, y, sample_weight=None): C=C_item, sample_weight=weight_tmp, ) - A, b = _make_constraint_rehline_param(constraint=self.constraint_item, X=P_tmp, y=y_tmp) + A, b = _make_constraint_rehline_param(constraint=constraint_item, X=P_tmp, y=y_tmp) ### solve and update result_tmp = ReHLine_solver( @@ -511,9 +529,10 @@ def obj(self, X, y): item_penalty = np.sum(self.Q**2) * (1 - self.rho) / self.n_items penalty = user_penalty + item_penalty - y_pred = self.decision_function(X) - U, V, Tau, S, T = _make_loss_rehline_param(loss=self.loss, X=X, y=y) + X_dummy = np.ones((len(y), 1)) # not used in loss computation, only shape matters for loss param construction + U, V, Tau, S, T = _make_loss_rehline_param(loss=self.loss, X=X_dummy, y=y) loss = ReHLoss(U, V, S, T, Tau) + y_pred = self.decision_function(X) loss_term = loss(y_pred) return loss_term, self.C * loss_term + penalty From 5b4f691b162d4fe86b1f71f306411e0a3f367755 Mon Sep 17 00:00:00 2001 From: DataboyUsen Date: Sun, 5 Apr 2026 02:09:49 +0800 Subject: [PATCH 3/3] fix: revert constraint_user/constraint_item to init as empty list - Previously attempted to follow sklearn convention: set to None in __init__, initialize to [] in fit() - This caused CI failure because test accesses .constraint_user.append() before fit() is called - Reverted to original behavior (init as []) to maintain compatibility with existing tests - No impact on correctness: [] and None treated equivalently in fit() --- rehline/_mf_class.py | 11 ++++------- 1 file changed, 4 insertions(+), 7 deletions(-) diff --git a/rehline/_mf_class.py b/rehline/_mf_class.py index f194c6c..e88ead1 100644 --- a/rehline/_mf_class.py +++ b/rehline/_mf_class.py @@ -203,8 +203,8 @@ def __init__( self.n_users = n_users self.n_items = n_items self.loss = loss - self.constraint_user = constraint_user - self.constraint_item = constraint_item + self.constraint_user = constraint_user if constraint_user is not None else [] + self.constraint_item = constraint_item if constraint_item is not None else [] self.biased = biased ## -----------------------------hyper parameters----------------------------- self.rank = rank @@ -281,9 +281,6 @@ def fit(self, X, y, sample_weight=None): self.history = np.full((self.max_iter_CD + 1, 2), np.nan) ## sample weights self.sample_weight = _check_sample_weight(sample_weight, X, dtype=X.dtype) - ## constraints on user and item - constraint_user = self.constraint_user if self.constraint_user is not None else [] - constraint_item = self.constraint_item if self.constraint_item is not None else [] ## random number generator rng = np.random.default_rng(self.random_state) @@ -357,7 +354,7 @@ def fit(self, X, y, sample_weight=None): C=C_user, sample_weight=weight_tmp, ) - A, b = _make_constraint_rehline_param(constraint=constraint_user, X=Q_tmp, y=y_tmp) + A, b = _make_constraint_rehline_param(constraint=self.constraint_user, X=Q_tmp, y=y_tmp) ### solve and update result_tmp = ReHLine_solver( @@ -424,7 +421,7 @@ def fit(self, X, y, sample_weight=None): C=C_item, sample_weight=weight_tmp, ) - A, b = _make_constraint_rehline_param(constraint=constraint_item, X=P_tmp, y=y_tmp) + A, b = _make_constraint_rehline_param(constraint=self.constraint_item, X=P_tmp, y=y_tmp) ### solve and update result_tmp = ReHLine_solver(