Skip to content

Commit 491afa8

Browse files
0.29.18
pca
1 parent c090401 commit 491afa8

2 files changed

Lines changed: 99 additions & 1 deletion

File tree

pyproject.toml

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -7,7 +7,7 @@ build-backend = "setuptools.build_meta"
77

88
[project]
99
name = "spotpython"
10-
version = "0.29.17"
10+
version = "0.29.18"
1111
authors = [
1212
{ name="T. Bartz-Beielstein", email="tbb@bartzundbartz.de" }
1313
]

src/spotpython/utils/pca.py

Lines changed: 98 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,98 @@
1+
import numpy as np
2+
import pandas as pd
3+
from sklearn.decomposition import PCA
4+
from sklearn.preprocessing import StandardScaler
5+
import matplotlib.pyplot as plt
6+
7+
8+
def pca_analysis(
    df,
    df_name="",
    k=10,
    scaler=StandardScaler(),
    max_scree=None,
    figsize=(12, 6),
) -> tuple:
    """
    Perform PCA analysis on a DataFrame with specified scaling.

    The data is scaled with ``scaler``, a full PCA is fitted, and two figures
    are shown via matplotlib: a scree plot of the explained variance per
    component and a scatter plot of the samples in the PC1/PC2 plane
    (each point annotated with its index label).

    Args:
        df (pd.DataFrame):
            The input data frame to perform PCA on.
        df_name (str):
            The name of the data frame. Used in the plot titles.
        k (int):
            The number of top features to select based on the absolute
            value of their loading on PC1 and on PC2.
        scaler (obj):
            An instance of a Scaler from sklearn (e.g., StandardScaler()).
            It is fitted on ``df`` by this function. Note that the default
            instance is created once at definition time and is therefore
            shared (and re-fitted) across calls that rely on the default.
        max_scree (int):
            The maximum number of principal components to plot in the scree plot.
            Default is None, which means all components will be plotted.
            This only limits the scree plot; the PC1/PC2 axis labels of the
            PCA graph always use the full explained-variance vector.
        figsize (tuple):
            The size of the figure for the plots (width, height).

    Returns:
        tuple: Two pd.Index objects containing the names of the top k features
            most influential on PC1 and PC2, respectively.

    Raises:
        ValueError: If the PCA yields fewer than two principal components,
            because the PC1/PC2 plot and the PC2 loadings cannot be computed.

    Examples:
        >>> import pandas as pd
        >>> from spotpython.utils.pca import pca_analysis
        >>> df = pd.DataFrame({
        ...     "A": [1, 2, 3],
        ...     "B": [1, 2, 3],
        ...     "C": [4, 5, 6]
        ... })
        >>> pca_analysis(df)
    """
    # Scale the data
    scaled_data = scaler.fit_transform(df)
    feature_names = df.columns
    sample_names = df.index

    # Perform PCA
    pca = PCA()
    pca.fit(scaled_data)
    pca_data = pca.transform(scaled_data)

    # Percentage of explained variance for every component. This full vector
    # is kept intact so that the PC1/PC2 axis labels below stay valid even
    # when the scree plot is truncated.
    per_var = np.round(pca.explained_variance_ratio_ * 100, decimals=1)
    full_labels = ["PC" + str(x) for x in range(1, len(per_var) + 1)]

    # Everything after the scree plot needs PC1 and PC2; fail early with a
    # clear message instead of an opaque attribute/index error later on.
    if len(per_var) < 2:
        raise ValueError(
            "pca_analysis requires at least two principal components, "
            f"but the data yields only {len(per_var)}."
        )

    # Limit the number of PCs in the scree plot (without touching per_var)
    if max_scree is not None:
        scree_var = per_var[:max_scree]
        scree_labels = full_labels[:max_scree]
    else:
        scree_var = per_var
        scree_labels = full_labels

    # Scree plot
    plt.figure(figsize=figsize)  # Set the figure size for the scree plot
    plt.bar(x=range(1, len(scree_var) + 1), height=scree_var, tick_label=scree_labels)
    plt.ylabel("Percentage of Explained Variance")
    plt.xlabel("Principal Component")
    plt.title(f"Scree Plot. {df_name}")
    plt.show()

    # PCA plot
    plt.figure(figsize=figsize)  # Set the figure size for the PCA plot
    pca_df = pd.DataFrame(pca_data, index=sample_names, columns=full_labels)

    plt.scatter(pca_df["PC1"], pca_df["PC2"])
    plt.title(f"PCA Graph. {df_name}")
    plt.xlabel(f"PC1 - {per_var[0]}%")
    plt.ylabel(f"PC2 - {per_var[1]}%")

    # Iterate positionally rather than via .loc so that duplicated index
    # labels still yield one scalar coordinate pair per sample.
    for sample, pc1, pc2 in zip(pca_df.index, pca_df["PC1"].to_numpy(), pca_df["PC2"].to_numpy()):
        plt.annotate(sample, (pc1, pc2))

    plt.show()

    # Determine top k features influencing PC1 and PC2 (by absolute loading)
    loading_scores_pc1 = pd.Series(pca.components_[0], index=feature_names)
    loading_scores_pc2 = pd.Series(pca.components_[1], index=feature_names)

    sorted_loading_scores_pc1 = loading_scores_pc1.abs().sort_values(ascending=False)
    sorted_loading_scores_pc2 = loading_scores_pc2.abs().sort_values(ascending=False)

    top_k_features_pc1 = sorted_loading_scores_pc1.head(k).index
    top_k_features_pc2 = sorted_loading_scores_pc2.head(k).index

    return top_k_features_pc1, top_k_features_pc2

0 commit comments

Comments
 (0)