@@ -364,6 +364,11 @@ def fit(self, X: np.ndarray, y: np.ndarray, bounds: Optional[List[Tuple[float, f
364364 Fits the Kriging model to training data X and y. This method is compatible
365365 with scikit-learn and uses differential evolution to optimize the hyperparameters
366366 (log(theta)).
367+ Fitting pipeline (Forrester, Ch. 2–3):
368+ - Set masks and θ dimensionality (isotropic → n_theta=1, else n_theta=k).
369+ - Assemble bounds on [log10 θ] (+ [log10 λ] for regression/reinterpolation; + p if enabled).
370+ - Maximize the concentrated likelihood (Ch. 3) to get [log10 θ, log10 λ, p].
371+ - Build R and store U, μ-related quantities implicitly via U (used at prediction).
367372
368373 Args:
369374 X (np.ndarray):
@@ -402,6 +407,8 @@ def fit(self, X: np.ndarray, y: np.ndarray, bounds: Optional[List[Tuple[float, f
402407 # Calculate and store min and max of X
403408 self .min_X = np .min (self .X_ , axis = 0 )
404409 self .max_X = np .max (self .X_ , axis = 0 )
410+
411+ # Bounds: [θ] for interpolation; [θ, λ] for regression/reinterpolation; [+ p] if optim_p
405412 if bounds is None :
406413 if self .method == "interpolation" :
407414 bounds = [(self .min_theta , self .max_theta )] * self .k
@@ -414,9 +421,11 @@ def fit(self, X: np.ndarray, y: np.ndarray, bounds: Optional[List[Tuple[float, f
414421 n_p = self .n_p if hasattr (self , "n_p" ) else self .k
415422 bounds += [(self .min_p , self .max_p )] * n_p
416423
424+ # ML optimization (Forrester, Ch. 3)
417425 self .logtheta_loglambda_p_ , _ = self .max_likelihood (bounds )
418426
419427 # store theta and Lambda in log scale
428+ # Store parameters on log10 scale; convert to linear only where needed
420429 self .theta = self .logtheta_loglambda_p_ [: self .n_theta ]
421430 if (self .method == "regression" ) or (self .method == "reinterpolation" ):
422431 # select the first n_theta values from logtheta_loglambda_p_:
@@ -431,6 +440,7 @@ def fit(self, X: np.ndarray, y: np.ndarray, bounds: Optional[List[Tuple[float, f
431440 raise ValueError ("method must be one of 'interpolation', 'regression', or 'reinterpolation'" )
432441
433442 # Once logtheta_lambda is found, compute the final correlation matrix
443+ # Finalize: compute R and its Cholesky at the found hyperparameters
434444 self .negLnLike , self .Psi_ , self .U_ = self .likelihood (self .logtheta_loglambda_p_ )
435445
436446 # Update log with the current values
@@ -441,6 +451,10 @@ def predict(self, X: np.ndarray, return_std=False, return_val: str = "y") -> np.
441451 """
442452 Predicts the Kriging response at a set of points X. This method is compatible
443453 with scikit-learn and returns predictions for the input points.
454+ Batch prediction wrapper around _pred:
455+ - Shapes normalized so X is (n_samples, k).
456+ - Forrester (Ch. 3/6): returns f̂(x), and optionally s(x) and EI
457+ (EI returned as −log10(EI) internally for stability).
444458
445459 Args:
446460 X (np.ndarray):
@@ -513,7 +527,13 @@ def build_Psi(self) -> None:
513527 correlation matrix as described in [Forr08a, p.57].
514528
515529 Notes:
516- Returns only the upper triangle, as the matrix is symmetric and the diagonal will be handled later.
530+ - Correlation follows the stationary Gaussian kernel used in Kriging:
531+ R = exp(-D), with D a weighted distance. See Forrester et al. (2008),
532+ Ch. 2, correlation modelling.
533+ - The code builds D as a sum of per-dimension distance contributions
534+ scaled by 10**theta (theta is stored in log10), then applies exp(-D).
535+ - Returns only the upper triangle; the symmetric and diagonal parts
536+ are handled by the caller.
517537
518538 Attributes:
519539 Psi (np.matrix): Correlation matrix Psi. Shape (n,n).
@@ -537,25 +557,30 @@ def build_Psi(self) -> None:
537557 """
538558 try :
539559 n , k = self .X_ .shape
560+ # weights: 10**theta; isotropic expands to k values
540561 theta10 = self ._get_theta10_from_logtheta ()
541562
542563 # Initialize the Psi matrix
543564 Psi = np .zeros ((n , n ), dtype = np .float64 )
544565
545566 # Calculate the distance matrix using ordered variables
567+ # Ordered/numeric part: squared Euclidean with weights (Forrester, Ch. 2)
546568 if self .ordered_mask .any ():
547569 X_ordered = self .X_ [:, self .ordered_mask ]
548570 D_ordered = squareform (pdist (X_ordered , metric = "sqeuclidean" , w = theta10 [self .ordered_mask ]))
549571 Psi += D_ordered
550572
573+ # Factorial part: categorical metric wrapped by exp(-D) (still stationary envelope)
551574 # Add the contribution of factor variables to the distance matrix
552575 if self .factor_mask .any ():
553576 X_factor = self .X_ [:, self .factor_mask ]
554577 D_factor = squareform (pdist (X_factor , metric = self .metric_factorial , w = theta10 [self .factor_mask ]))
555578 Psi += D_factor
556579
557- # Calculate correlation from distance
580+ # Correlation from distance: R = exp(-D) (Forrester, Ch. 2)
558581 Psi = np .exp (- Psi )
582+
583+ # Diagnostics (not in Forrester): inf/cond for stability
559584 # Check for infinite values
560585 self .inf_Psi = np .isinf (Psi ).any ()
561586 # Calculate condition number
@@ -572,6 +597,10 @@ def likelihood(self, x: np.ndarray) -> Tuple[float, np.ndarray, np.ndarray]:
572597 """
573598 Computes the negative of the concentrated log-likelihood for a given set
574599 of log(theta) parameters. Returns the negative log-likelihood, the correlation matrix Psi, and its Cholesky factor U.
600+ Negative concentrated log-likelihood (Forrester, Ch. 3):
601+ - Given R (here Psi with diagonal and nugget), μ = (1^T R^{-1} y)/(1^T R^{-1} 1),
602+ σ^2 = (r^T R^{-1} r)/n with r = y - 1·μ,
603+ - Concentrated −log L = (n/2) log(σ^2) + (1/2) log |R| (constants omitted).
575604
576605 Args:
577606 x (np.ndarray):
@@ -601,16 +630,18 @@ def likelihood(self, x: np.ndarray) -> Tuple[float, np.ndarray, np.ndarray]:
601630 X = self .X_
602631 y = self .y_ .flatten ()
603632
633+ # First entries are log10(theta); optionally followed by log10(lambda) and p
604634 self .theta = x [: self .n_theta ]
605635
606636 if (self .method == "regression" ) or (self .method == "reinterpolation" ):
607637 lambda_ = x [self .n_theta : self .n_theta + 1 ]
608- # lambda is in log scale, so transform it back:
638+ # nugget on linear scale; lambda is in log scale, so transform it back:
609639 lambda_ = 10.0 ** lambda_
610640 if self .optim_p :
611641 self .p_val = x [self .n_theta + 1 : self .n_theta + 1 + self .n_p ]
612642 elif self .method == "interpolation" :
613643 # use the original, untransformed eps:
644+ # small “nugget” for interpolation (Forrester, sec. on interpolation)
614645 lambda_ = self .eps
615646 if self .optim_p :
616647 self .p_val = x [self .n_theta : self .n_theta + self .n_p ]
@@ -620,34 +651,45 @@ def likelihood(self, x: np.ndarray) -> Tuple[float, np.ndarray, np.ndarray]:
620651 n = X .shape [0 ]
621652 one = np .ones (n )
622653
623- # Build the correlation matrix Psi
654+ # Build the correlation matrix Psi.
655+ # Build correlation R = upper + upper^T + I + λ I (Forrester, Ch. 3)
624656 Psi_upper_triangle = self .build_Psi ()
625657 Psi = Psi_upper_triangle + Psi_upper_triangle .T + np .eye (n ) + np .eye (n ) * lambda_
626658
659+ # Cholesky factorization R = U U^T
627660 try :
628661 U = np .linalg .cholesky (Psi )
629662 except LinAlgError :
663+ # Penalize ill-conditioning (implementation detail)
630664 return self .penalty , Psi , None
631665
666+ # log |R| via Cholesky (Forrester, Ch. 3)
632667 LnDetPsi = 2.0 * np .sum (np .log (np .abs (np .diag (U ))))
633668
669+ # R^{-1}y and R^{-1}1 via triangular solves
634670 temp_y = np .linalg .solve (U , y )
635671 temp_one = np .linalg .solve (U , one )
636672 vy = np .linalg .solve (U .T , temp_y )
637673 vone = np .linalg .solve (U .T , temp_one )
638674
675+ # μ̂ (ordinary Kriging constant mean) (Forrester, Ch. 3)
639676 mu = (one @ vy ) / (one @ vone )
677+
678+ # σ̂^2 (concentrated variance) (Forrester, Ch. 3)
640679 resid = y - one * mu
641680 tresid = np .linalg .solve (U , resid )
642681 tresid = np .linalg .solve (U .T , tresid )
643682 SigmaSqr = (resid @ tresid ) / n
644683
684+ # Concentrated −log L (Forrester, Ch. 3)
645685 negLnLike = (n / 2.0 ) * np .log (SigmaSqr ) + 0.5 * LnDetPsi
646686 return negLnLike , Psi , U
647687
648688 def build_psi_vec (self , x : np .ndarray ) -> None :
649689 """
650690 Build the psi vector required for predictive methods.
691+ ψ(x) := [exp(-D(x, x_i))]_{i=1..n}, i.e., correlation between a new x and the training sites
692+ using the same D as for R (Forrester, Ch. 2).
651693
652694 Args:
653695 x (ndarray): Point to calculate the psi vector for.
@@ -677,16 +719,19 @@ def build_psi_vec(self, x: np.ndarray) -> None:
677719 D = np .zeros (n )
678720
679721 # Compute ordered distance contributions
722+ # Ordered contributions to D(x, X) (weighted squared Euclidean)
680723 if self .ordered_mask .any ():
681724 X_ordered = self .X_ [:, self .ordered_mask ]
682725 x_ordered = x [self .ordered_mask ]
683726 D += cdist (x_ordered .reshape (1 , - 1 ), X_ordered , metric = "sqeuclidean" , w = theta10 [self .ordered_mask ]).ravel ()
684727 # Compute factor distance contributions
728+ # Factorial contributions (categorical distance)
685729 if self .factor_mask .any ():
686730 X_factor = self .X_ [:, self .factor_mask ]
687731 x_factor = x [self .factor_mask ]
688732 D += cdist (x_factor .reshape (1 , - 1 ), X_factor , metric = self .metric_factorial , w = theta10 [self .factor_mask ]).ravel ()
689733
734+ # Forrester, Ch. 2: R = exp(-D)
690735 psi = np .exp (- D )
691736 return psi
692737
@@ -697,6 +742,13 @@ def _pred(self, x: np.ndarray) -> float:
697742 """
698743 Computes a single-point Kriging prediction using the correlation matrix
699744 information. Internal helper method.
745+ Ordinary Kriging predictor (Forrester, Ch. 3):
746+ - f̂(x) = μ̂ + ψ(x)^T R^{-1} (y − 1 μ̂)
747+ - s^2(x) ~ σ̂^2 [1 + λ − ψ(x)^T R^{-1} ψ(x)]
748+ Note: The classic OK variance includes a mean-uncertainty term
749+ (1 − 1^T R^{-1} ψ)^2 / (1^T R^{-1} 1). This implementation uses a
750+ simplified form common in engineering Kriging codes.
751+ Expected improvement (EI) (Forrester, Ch. 6) computed for minimization.
700752
701753 Args:
702754 x (np.ndarray): 1D array of length k for the point at which to predict.
@@ -708,6 +760,7 @@ def _pred(self, x: np.ndarray) -> float:
708760 """
709761 y = self .y_ .flatten ()
710762
763+ # Load θ and λ (λ transformed to linear scale where needed)
711764 if (self .method == "regression" ) or (self .method == "reinterpolation" ):
712765 self .theta = self .logtheta_loglambda_p_ [: self .n_theta ]
713766 lambda_ = self .logtheta_loglambda_p_ [self .n_theta : self .n_theta + 1 ]
@@ -729,28 +782,36 @@ def _pred(self, x: np.ndarray) -> float:
729782 one = np .ones (n )
730783
731784 # Compute mu
785+ # μ̂ via backsolves (Forrester, Ch. 3)
732786 y_tilde = np .linalg .solve (U , y )
733787 y_tilde = np .linalg .solve (U .T , y_tilde )
734788 one_tilde = np .linalg .solve (U , one )
735789 one_tilde = np .linalg .solve (U .T , one_tilde )
736790 mu = (one @ y_tilde ) / (one @ one_tilde )
737791
792+ # Residual backsolve for R^{-1}(y − 1 μ̂)
738793 resid = y - one * mu
739794 resid_tilde = np .linalg .solve (U , resid )
740795 resid_tilde = np .linalg .solve (U .T , resid_tilde )
741796
742797 psi = self .build_psi_vec (x )
743798
744799 # Compute SigmaSqr and SSqr
800+ # σ̂^2 (Forrester, Ch. 3)
801+ # Note: identical to likelihood computation to maintain consistency
802+ # (up to small numerical differences).
745803 if (self .method == "interpolation" ) or (self .method == "regression" ):
746804 SigmaSqr = (resid @ resid_tilde ) / n
747805 # Compute SSqr
806+ # s^2(x) ≈ σ̂^2 [1 + λ − ψ^T R^{-1} ψ]
748807 psi_tilde = np .linalg .solve (U , psi )
749808 psi_tilde = np .linalg .solve (U .T , psi_tilde )
750809 # Eq. (3.1) in [forr08a] without lambda:
810+ # cf. Forrester (Ch. 3) variance structure; simplified here
751811 SSqr = SigmaSqr * (1 + lambda_ - psi @ psi_tilde )
752812 else :
753813 # method is "reinterpolation"
814+ # Reinterpolation: variance with nugget removed and ε added in R for the variance solve
754815 Psi_adjusted = self .Psi_ - np .eye (n ) * lambda_ + np .eye (n ) * self .eps
755816 SigmaSqr = (resid @ np .linalg .solve (U .T , np .linalg .solve (U , Psi_adjusted @ resid_tilde ))) / n
756817 # Compute Uint (Cholesky factor of the adjusted Psi matrix)
@@ -765,9 +826,12 @@ def _pred(self, x: np.ndarray) -> float:
765826 s = np .abs (SSqr ) ** 0.5
766827
767828 # Final prediction
829+ # f̂(x) (Forrester, Ch. 3)
768830 f = mu + psi @ resid_tilde
769831
770832 # Compute ExpImp
833+ # EI (minimization), following standard closed form (Forrester, Ch. 6).
834+ # Implementation uses erf for Φ and returns −log10(EI) for numerical stability.
771835 if self .return_ei :
772836 yBest = np .min (y )
773837 EITermOne = (yBest - f ) * (0.5 + 0.5 * erf ((1 / np .sqrt (2 )) * ((yBest - f ) / s )))
@@ -781,6 +845,8 @@ def max_likelihood(self, bounds: List[Tuple[float, float]]) -> Tuple[np.ndarray,
781845 """
782846 Maximizes the Kriging likelihood function using differential evolution
783847 over the range of log(theta) specified by bounds.
848+ Hyperparameter estimation via maximum likelihood (Forrester, Ch. 3).
849+ Objective is the concentrated −log L above, optimized with differential evolution.
784850
785851 Args:
786852 bounds (List[Tuple[float, float]]): Sequence of (low, high) bounds for log(theta).
@@ -807,6 +873,7 @@ def objective(logtheta_loglambda_p_: np.ndarray) -> float:
807873 neg_ln_like , _ , _ = self .likelihood (logtheta_loglambda_p_ )
808874 return neg_ln_like
809875
876+ # Use keyword args for easier test patching
810877 result = differential_evolution (func = objective , bounds = bounds , seed = self .seed )
811878 return result .x , result .fun
812879
0 commit comments