Skip to content

Commit 56bc8ed

Browse files
0.29.24
1 parent 44d9c01 commit 56bc8ed

2 files changed

Lines changed: 309 additions & 1 deletion

File tree

pyproject.toml

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -7,7 +7,7 @@ build-backend = "setuptools.build_meta"
77

88
[project]
99
name = "spotpython"
10-
version = "0.29.23"
10+
version = "0.29.24"
1111
authors = [
1212
{ name="T. Bartz-Beielstein", email="tbb@bartzundbartz.de" }
1313
]

src/spotpython/utils/pca.py

Lines changed: 308 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -3,6 +3,308 @@
33
from sklearn.decomposition import PCA
44
from sklearn.preprocessing import StandardScaler
55
import matplotlib.pyplot as plt
6+
import seaborn as sns
7+
8+
9+
def get_pca(df, n_components=3) -> tuple:
    """
    Scale the numeric data and perform PCA.

    Non-numeric columns are dropped, the remaining columns are standardized
    with ``StandardScaler`` and the PCA is fitted on the standardized values.

    Args:
        df (pd.DataFrame): Input DataFrame.
        n_components (int):
            Number of principal components to compute. An integer that is
            larger than ``min(n_samples, n_numeric_features)`` is reduced to
            that limit, because PCA cannot extract more components than that.
            Non-integer values are passed to ``PCA`` unchanged.
            Defaults to 3.

    Returns:
        tuple:
            - pca (PCA): Fitted PCA object.
            - scaled_data (np.ndarray): Standardized numeric data the PCA was
              fitted on.
            - feature_names (pd.Index): Names of the numeric features.
            - sample_names (pd.Index): Index of the samples.
            - pca_data (pd.DataFrame): PCA-transformed data with one column
              per component, named "PC1", "PC2", ...

    Examples:
        >>> import pandas as pd
        >>> from spotpython.utils.pca import get_pca
        >>> df = pd.DataFrame({
        ...     "A": [1, 2, 3],
        ...     "B": [4, 5, 6],
        ...     "C": ["x", "y", "z"]  # Non-numeric column will be ignored
        ... })
        >>> pca, scaled_data, feature_names, sample_names, pca_data = get_pca(df)
        >>> print(feature_names)
        Index(['A', 'B'], dtype='object')
        >>> print(pca_data.shape)
        (3, 2)
    """
    numeric_df = df.select_dtypes(include=[np.number])
    feature_names = numeric_df.columns
    sample_names = df.index

    # Standardize first so that features on large scales do not dominate
    # the principal components.
    scaled_data = StandardScaler().fit_transform(numeric_df)

    # The default of 3 components would otherwise fail on small inputs
    # (e.g. two numeric columns), so cap integer requests at the feasible maximum.
    if isinstance(n_components, (int, np.integer)):
        n_components = int(min(n_components, *scaled_data.shape))

    pca = PCA(n_components=n_components)
    pca_scores = pca.fit_transform(scaled_data)
    pca_columns = [f"PC{i + 1}" for i in range(pca_scores.shape[1])]
    df_pca_components = pd.DataFrame(data=pca_scores, columns=pca_columns)
    return pca, scaled_data, feature_names, sample_names, df_pca_components
49+
50+
51+
def plot_pca_scree(pca, df_name="", max_scree=None, figsize=(12, 6)) -> None:
    """Draw a scree plot for a fitted PCA.

    The plot shows, for each principal component, the percentage of the total
    variance it explains (rounded to one decimal place). It is a common aid
    for choosing how many components to keep.

    Args:
        pca (sklearn.decomposition.PCA): Fitted PCA object providing
            ``explained_variance_ratio_``.
        df_name (str, optional): Dataset name appended to the plot title.
            Defaults to empty string.
        max_scree (int, optional): Maximum number of principal components to
            plot. If None, all components are plotted. Defaults to None.
        figsize (tuple, optional): Size of the figure as (width, height).
            Defaults to (12, 6).

    Returns:
        None: The function creates and displays a matplotlib plot.

    Examples:
        >>> from sklearn.decomposition import PCA
        >>> from sklearn.datasets import load_iris
        >>> from spotpython.utils.pca import plot_pca_scree
        >>>
        >>> X = load_iris().data
        >>> pca = PCA().fit(X)
        >>> plot_pca_scree(pca,
        ...                df_name="Iris Dataset",
        ...                max_scree=4,
        ...                figsize=(10, 5))
    """
    percent_explained = np.round(pca.explained_variance_ratio_ * 100, decimals=1)
    labels = [f"PC{number}" for number in range(1, len(percent_explained) + 1)]

    # Optionally keep only the leading components.
    if max_scree is not None:
        percent_explained = percent_explained[:max_scree]
        labels = labels[:max_scree]

    positions = range(1, len(percent_explained) + 1)

    plt.figure(figsize=figsize)
    plt.plot(positions, percent_explained, marker="o", linestyle="--")
    plt.xticks(positions, labels)
    plt.ylabel("Percentage of Explained Variance")
    plt.xlabel("Principal Component")
    plt.title(f"Scree Plot. {df_name}")
    plt.grid(True)
    plt.show()
109+
110+
111+
def plot_pca1vs2(pca, pca_data, df_name="", figsize=(12, 6)) -> None:
    """Scatter-plot the first principal component against the second.

    Every sample becomes one annotated point in the PC1/PC2 plane. The axis
    labels carry the percentage of variance explained by the respective
    component, rounded to one decimal place.

    Args:
        pca (sklearn.decomposition.PCA): Fitted PCA object providing
            ``explained_variance_ratio_``.
        pca_data (array-like): PCA-transformed data with samples as rows and
            principal components as columns. Must expose ``shape``.
        df_name (str, optional): Dataset name appended to the plot title.
            Defaults to empty string.
        figsize (tuple, optional): Size of the figure as (width, height).
            Defaults to (12, 6).

    Returns:
        None: The function creates and displays a matplotlib plot.

    Examples:
        >>> from sklearn.decomposition import PCA
        >>> from sklearn.datasets import load_iris
        >>> from spotpython.utils.pca import plot_pca1vs2
        >>>
        >>> X = load_iris().data
        >>> pca = PCA()
        >>> pca_data = pca.fit_transform(X)
        >>> plot_pca1vs2(pca,
        ...              pca_data,
        ...              df_name="Iris Dataset",
        ...              figsize=(10, 5))

    Note:
        - At least two principal components are required.
        - Point annotations are the index labels of the DataFrame built
          from ``pca_data``.
    """
    component_labels = [f"PC{position + 1}" for position in range(pca_data.shape[1])]
    scores = pd.DataFrame(pca_data, columns=component_labels)
    first_axis = scores["PC1"]
    second_axis = scores["PC2"]

    plt.figure(figsize=figsize)
    plt.scatter(first_axis, second_axis)
    # Label every point with its index entry.
    for label in scores.index:
        plt.annotate(label, (first_axis.loc[label], second_axis.loc[label]))

    pc1_percent = np.round(pca.explained_variance_ratio_[0] * 100, 1)
    pc2_percent = np.round(pca.explained_variance_ratio_[1] * 100, 1)
    plt.title(f"PCA Graph. {df_name}")
    plt.xlabel(f"PC1 - {pc1_percent}%")
    plt.ylabel(f"PC2 - {pc2_percent}%")
    plt.grid(True)
    plt.show()
167+
168+
169+
def get_pca_topk(pca, feature_names, k=10) -> tuple:
    """Return the k features with the largest absolute loadings on PC1 and PC2.

    For each of the first two principal components, the loading scores
    (rows of ``pca.components_``) are ranked by absolute value and the names
    of the k highest-ranked features are collected.

    Args:
        pca (sklearn.decomposition.PCA): Fitted PCA object providing
            ``components_`` with at least two components.
        feature_names (list-like): Names of the original features, in the
            same order as the columns used for fitting.
        k (int, optional): Number of top features to select for each
            principal component. Defaults to 10.

    Returns:
        tuple: A tuple containing two lists:
            - list[str]: Names of the k features most influential on PC1
            - list[str]: Names of the k features most influential on PC2

    Examples:
        >>> from sklearn.decomposition import PCA
        >>> from sklearn.datasets import load_iris
        >>> from spotpython.utils.pca import get_pca_topk
        >>>
        >>> iris = load_iris()
        >>> pca = PCA().fit(iris.data)
        >>> top_pc1, top_pc2 = get_pca_topk(pca,
        ...                                 feature_names=iris.feature_names,
        ...                                 k=2)
        >>> print("Top PC1 features:", top_pc1)
        >>> print("Top PC2 features:", top_pc2)

    Note:
        - The length of feature_names must match the number of features in
          the PCA input.
        - If k exceeds the number of features, all features are returned.
    """

    def _rank_features(component):
        # Order features by the magnitude of their loading, largest first.
        magnitudes = pd.Series(component, index=feature_names).abs()
        ranked = magnitudes.sort_values(ascending=False)
        return ranked.head(k).index.tolist()

    top_pc1 = _rank_features(pca.components_[0])
    top_pc2 = _rank_features(pca.components_[1])
    return top_pc1, top_pc2
227+
228+
229+
def get_loading_scores(pca, feature_names) -> pd.DataFrame:
    """Build the loading scores matrix of a fitted PCA.

    The result shows how each original feature contributes to each
    principal component.

    Args:
        pca (sklearn.decomposition.PCA): Fitted PCA object providing
            ``components_`` and ``n_components_``.
        feature_names (list-like): Names of the original features, in the
            same order as the columns used for fitting.

    Returns:
        pd.DataFrame: Loading scores with features as rows and principal
            components ("PC1", "PC2", ...) as columns.

    Example:
        >>> from sklearn.decomposition import PCA
        >>> from sklearn.datasets import load_iris
        >>> from spotpython.utils.pca import get_loading_scores
        >>>
        >>> iris = load_iris()
        >>> pca = PCA().fit(iris.data)
        >>> scores_df = get_loading_scores(pca, iris.feature_names)
        >>> print(scores_df)
    """
    component_labels = [f"PC{position + 1}" for position in range(pca.n_components_)]
    # components_ holds one row per component; transpose to one row per feature.
    per_feature_loadings = pca.components_.T
    return pd.DataFrame(per_feature_loadings, index=feature_names, columns=component_labels)
265+
266+
267+
def plot_loading_scores(loading_scores, figsize=(12, 8)) -> None:
    """Draw an annotated heatmap of PCA loading scores.

    Each cell shows the loading of one original feature on one principal
    component, printed with two decimals and colored on the "coolwarm"
    palette.

    Args:
        loading_scores (pd.DataFrame): Loading scores matrix with features as
            rows and principal components as columns.
        figsize (tuple, optional): Size of the figure as (width, height).
            Defaults to (12, 8).

    Returns:
        None: The function creates and displays a matplotlib plot.

    Example:
        >>> from sklearn.decomposition import PCA
        >>> from sklearn.datasets import load_iris
        >>> from spotpython.utils.pca import get_loading_scores, plot_loading_scores
        >>>
        >>> iris = load_iris()
        >>> pca = PCA().fit(iris.data)
        >>> scores_df = get_loading_scores(pca, iris.feature_names)
        >>>
        >>> plot_loading_scores(scores_df, figsize=(10, 6))
    """
    heatmap_options = {
        "annot": True,
        "fmt": ".2f",
        "cmap": "coolwarm",
        "cbar_kws": {"label": "Loading Score"},
        "linewidths": 0.5,
    }

    plt.figure(figsize=figsize)
    sns.heatmap(loading_scores, **heatmap_options)
    plt.title("PCA Loading Scores Heatmap")
    plt.xlabel("Principal Components")
    plt.ylabel("Original Features")
    plt.tight_layout()
    plt.show()
6308

7309

8310
def pca_analysis(
@@ -16,6 +318,12 @@ def pca_analysis(
16318
"""
17319
Perform PCA analysis on a DataFrame with specified scaling.
18320
321+
Notes:
322+
Deprecation Warning:
323+
This function is deprecated and will be removed in a future version.
324+
Use `get_pca`, `plot_pca_scree`, `plot_pca1vs2`, and `get_pca_topk`
325+
instead for more modular control over PCA analysis.
326+
19327
Args:
20328
df (pd.DataFrame):
21329
The input data frame to perform PCA on.

0 commit comments

Comments
 (0)