-
Notifications
You must be signed in to change notification settings - Fork 0
Expand file tree
/
Copy pathstat_utils.py
More file actions
91 lines (71 loc) · 3.51 KB
/
stat_utils.py
File metadata and controls
91 lines (71 loc) · 3.51 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
import numpy as np
def nan_covariance(data): #not sure how to initialise without a function like this, np.cov() does not work well as there is alot of NaN in the matrix
"""
replaces Nans by column means and calculates empirical covariance
:param data: data in (n,p) numpy array form, with Nans
:return: empirical covariance
"""
n, m = data.shape # this function basicaly computes covariance ignoring the mssing data
means = np.nanmean(data, axis=0)
data_filled = np.where(np.isnan(data), means, data)
return np.cov(data_filled, rowvar=False)
def log_likelihood1(X, mu, Sigma, distribution='gaussian'):
"""
Calculates complete (Gaussian by default) loglikelihood
:param X: (n,p) numpy
:param mu: (p,) numpy
:param Sigma: (p,p) numpy
:return: float loglikelihood
"""
if distribution == 'gaussian':
N, p = X.shape
mu = mu.reshape(1, -1) # Reshape mu to ensure correct matrix operations
X_centered = X - mu
Sigma_inv = np.linalg.inv(Sigma)
term = np.einsum('ij,ji->i', X_centered, np.dot(Sigma_inv, X_centered.T))
result = -N/2 * np.log(np.linalg.det(Sigma)) - 0.5 * np.sum(term)
else:
raise NotImplementedError
return result
def regularize_covariance(Sigma, eigenvalue_tol=1e-2, ridge_coefficient=5e-1, verbose=False, version2=False):
"""
regularizes covariance matrices with very small eigenvalues by adding lambda*In
:param Sigma: (p,p) numpy array (positive definite covariance matrix, must be possible to do eigenvalue decomposition)
:param eigenvalue_tol : threshold of the smallest eigenvalue that leads to regularization
:param ridge_coefficient : the lambda such that lambda*In gets added if the threshold is reached
:param version2: If "True", this executes a completely different version of the function, and the role of the parameters changes slightly. If the ration of smallest to biggest eigenvalue is smaller than eigenvalue_tol, we add ridge_coefficient times the biggest eigenvalue to the diagonal.
"""
eigenvalues, eigenvectors = np.linalg.eig(Sigma)
# eigenvectors are in the columns
p = Sigma.shape[0]
# I added this block which adds ridge*biggest_eigenvalue*Identity if the smallest eigenvalue is less than tol*biggest eigenvalue
if version2:
lambda1 = max(eigenvalues)
lambdan = min(eigenvalues)
if verbose:
print("Smallest | largest eigenvalue : ", lambdan, lambda1)
if lambdan / lambda1 < eigenvalue_tol:
if verbose:
print("We added :", ridge_coefficient * lambda1, " to the diagonal.")
Sigma = Sigma + ridge_coefficient * lambda1 * np.eye(p)
return Sigma
small_indices = np.where(eigenvalues < eigenvalue_tol)[0]
if verbose:
# print("The eigenvectors are the columns of :\n",eigenvectors)
print("The smallest eigenvalue is :", min(eigenvalues))
print("The eigenvalues below the tolerance of ", eigenvalue_tol, " are :\n", eigenvalues[small_indices])
if len(small_indices) > 0:
Sigma = Sigma + ridge_coefficient * np.eye(p)
if verbose:
print("We added ", ridge_coefficient, " to the diagonal.")
return Sigma
#Perhaps we need to change the name of this function to something more appropriate
def gaussian_MLE(data):
"""
Returns empirical mean and covariance.
"""
n = data.shape[0]
p = data.shape[1]
means = np.mean(data, axis=0)
Sigma=np.cov(data,rowvar=False)
return means, Sigma