33from sklearn .decomposition import PCA
44from sklearn .preprocessing import StandardScaler
55import matplotlib .pyplot as plt
6+ import seaborn as sns
7+
8+
def get_pca(df, n_components=3) -> tuple:
    """Standardize the numeric columns of a DataFrame and run PCA on them.

    Non-numeric columns are dropped. The remaining columns are scaled to zero
    mean and unit variance with ``StandardScaler`` and the PCA is fitted on the
    scaled values, so features measured on large scales do not dominate the
    components.

    Args:
        df (pd.DataFrame): Input DataFrame.
        n_components (int):
            Number of principal components to compute. An integer larger than
            ``min(n_samples, n_features)`` is reduced to that limit, because
            PCA cannot extract more components than that. Non-integer values
            (e.g. ``None`` or a float) are passed to ``PCA`` unchanged.
            Defaults to 3.

    Returns:
        tuple:
            - pca (PCA): Fitted PCA object.
            - scaled_data (np.ndarray): Standardized numeric data the PCA was
              fitted on.
            - feature_names (pd.Index): Names of the numeric features.
            - sample_names (pd.Index): Index of the samples (``df.index``).
            - pca_data (pd.DataFrame): PCA scores, one row per sample and one
              column per component, labelled "PC1", "PC2", ...

    Examples:
        >>> import pandas as pd
        >>> from spotpython.utils.pca import get_pca
        >>> df = pd.DataFrame({
        ...     "A": [1, 2, 3],
        ...     "B": [4, 5, 6],
        ...     "C": ["x", "y", "z"]  # Non-numeric column will be ignored
        ... })
        >>> pca, scaled_data, feature_names, sample_names, pca_data = get_pca(df)
        >>> print(feature_names)
        Index(['A', 'B'], dtype='object')
        >>> print(pca_data.shape)
        (3, 2)
    """
    numeric_df = df.select_dtypes(include=[np.number])
    feature_names = numeric_df.columns
    sample_names = df.index

    # Standardize first: this is what the documented contract promises.
    scaled_data = StandardScaler().fit_transform(numeric_df)

    # An integer request above min(n_samples, n_features) makes PCA raise,
    # which would break the default n_components=3 on two-feature inputs.
    if isinstance(n_components, (int, np.integer)):
        n_components = min(n_components, min(scaled_data.shape))

    pca = PCA(n_components=n_components)
    pca_scores = pca.fit_transform(scaled_data)

    # Labels carry no padding so that "PC1"/"PC2" lookups on the result work.
    pca_columns = [f"PC{i + 1}" for i in range(pca_scores.shape[1])]
    df_pca_components = pd.DataFrame(data=pca_scores, columns=pca_columns)

    return pca, scaled_data, feature_names, sample_names, df_pca_components
49+
50+
def plot_pca_scree(pca, df_name="", max_scree=None, figsize=(12, 6)) -> None:
    """Draw a scree plot of the variance explained by each principal component.

    The explained variance ratios of the fitted PCA are converted to
    percentages (rounded to one decimal place) and plotted against the
    component number, which helps to decide how many components to keep.

    Args:
        pca (sklearn.decomposition.PCA): Fitted PCA object providing
            ``explained_variance_ratio_``.
        df_name (str, optional): Dataset name appended to the plot title.
            Defaults to empty string.
        max_scree (int, optional): Plot only the first ``max_scree``
            components. If None, all components are plotted. Defaults to None.
        figsize (tuple, optional): Figure size as (width, height).
            Defaults to (12, 6).

    Returns:
        None: The function creates and displays a matplotlib plot.

    Examples:
        >>> from sklearn.decomposition import PCA
        >>> from sklearn.datasets import load_iris
        >>> from spotpython.utils.pca import plot_pca_scree
        >>> X = load_iris().data
        >>> pca = PCA().fit(X)
        >>> plot_pca_scree(pca,
        ...                df_name="Iris Dataset",
        ...                max_scree=4,
        ...                figsize=(10, 5))
    """
    variance_pct = np.round(pca.explained_variance_ratio_ * 100, decimals=1)
    tick_labels = [f"PC{number}" for number in range(1, len(variance_pct) + 1)]

    # Optionally keep only the leading components.
    if max_scree is not None:
        variance_pct = variance_pct[:max_scree]
        tick_labels = tick_labels[:max_scree]

    positions = range(1, len(variance_pct) + 1)

    plt.figure(figsize=figsize)
    plt.plot(positions, variance_pct, marker="o", linestyle="--")
    plt.xticks(positions, tick_labels)
    plt.ylabel("Percentage of Explained Variance")
    plt.xlabel("Principal Component")
    plt.title(f"Scree Plot. {df_name} ")
    plt.grid(True)
    plt.show()
109+
110+
def plot_pca1vs2(pca, pca_data, df_name="", figsize=(12, 6)) -> None:
    """Scatter-plot the first principal component against the second.

    The PCA-transformed data is wrapped in a DataFrame with columns "PC1",
    "PC2", ...; every row becomes one annotated point. The axis labels show
    the percentage of variance explained by PC1 and PC2.

    Args:
        pca (sklearn.decomposition.PCA): Fitted PCA object providing
            ``explained_variance_ratio_``.
        pca_data (array-like): PCA-transformed data with a ``shape``
            attribute; rows are samples, columns are principal components.
        df_name (str, optional): Dataset name appended to the plot title.
            Defaults to empty string.
        figsize (tuple, optional): Figure size as (width, height).
            Defaults to (12, 6).

    Returns:
        None: The function creates and displays a matplotlib plot.

    Examples:
        >>> from sklearn.decomposition import PCA
        >>> from sklearn.datasets import load_iris
        >>> from spotpython.utils.pca import plot_pca1vs2
        >>> X = load_iris().data
        >>> pca = PCA()
        >>> pca_data = pca.fit_transform(X)
        >>> plot_pca1vs2(pca,
        ...              pca_data,
        ...              df_name="Iris Dataset",
        ...              figsize=(10, 5))

    Note:
        - At least two principal components are required.
        - Point annotations are taken from the index of the DataFrame built
          from ``pca_data``.
        - The explained variance percentages are rounded to 1 decimal place.
    """
    component_count = pca_data.shape[1]
    column_labels = [f"PC{number}" for number in range(1, component_count + 1)]
    scores = pd.DataFrame(pca_data, columns=column_labels)

    plt.figure(figsize=figsize)
    plt.scatter(scores["PC1"], scores["PC2"])

    # Label every point with its index value.
    for label in scores.index:
        position = (scores.PC1.loc[label], scores.PC2.loc[label])
        plt.annotate(label, position)

    plt.title(f"PCA Graph. {df_name} ")
    pc1_pct = np.round(pca.explained_variance_ratio_[0] * 100, 1)
    plt.xlabel(f"PC1 - {pc1_pct} %")
    pc2_pct = np.round(pca.explained_variance_ratio_[1] * 100, 1)
    plt.ylabel(f"PC2 - {pc2_pct} %")
    plt.grid(True)
    plt.show()
167+
168+
def get_pca_topk(pca, feature_names, k=10) -> tuple:
    """Return the k features with the largest absolute loadings on PC1 and PC2.

    For each of the first two rows of ``pca.components_`` the loading scores
    are labelled with ``feature_names``, ranked by absolute value in
    descending order, and the names of the first ``k`` entries are collected.

    Args:
        pca (sklearn.decomposition.PCA): Fitted PCA object providing
            ``components_`` with at least two components.
        feature_names (list-like): Names of the original features, in the same
            order as the columns the PCA was fitted on.
        k (int, optional): Number of features to return per component.
            Defaults to 10.

    Returns:
        tuple: A tuple containing two lists:
            - list[str]: Names of the k features most influential on PC1
            - list[str]: Names of the k features most influential on PC2

    Examples:
        >>> from sklearn.decomposition import PCA
        >>> from sklearn.datasets import load_iris
        >>> from spotpython.utils.pca import get_pca_topk
        >>> iris = load_iris()
        >>> pca = PCA().fit(iris.data)
        >>> top_pc1, top_pc2 = get_pca_topk(pca,
        ...                                 feature_names=iris.feature_names,
        ...                                 k=2)
        >>> print("Top PC1 features:", top_pc1)
        >>> print("Top PC2 features:", top_pc2)

    Note:
        - The length of feature_names must match the number of features in
          the PCA input.
        - If k exceeds the number of features, all features are returned.
    """

    def _strongest(loadings):
        # Rank one component's loadings by magnitude and keep the top k names.
        magnitudes = pd.Series(loadings, index=feature_names).abs()
        ranked = magnitudes.sort_values(ascending=False)
        return ranked.head(k).index.tolist()

    first_component = pca.components_[0]
    second_component = pca.components_[1]
    return _strongest(first_component), _strongest(second_component)
227+
228+
def get_loading_scores(pca, feature_names) -> pd.DataFrame:
    """Build the loading scores matrix of a fitted PCA as a DataFrame.

    The matrix is the transpose of ``pca.components_``: each row is an
    original feature and each column a principal component, so a cell shows
    how strongly that feature contributes to that component.

    Args:
        pca (sklearn.decomposition.PCA): Fitted PCA object providing the
            ``components_`` attribute.
        feature_names (list-like): Names of the original features, in the same
            order as the columns the PCA was fitted on.

    Returns:
        pd.DataFrame: Loading scores with features as rows (indexed by
            ``feature_names``) and principal components as columns, labelled
            "PC1", "PC2", ...

    Example:
        >>> from sklearn.decomposition import PCA
        >>> from sklearn.datasets import load_iris
        >>> from spotpython.utils.pca import get_loading_scores
        >>>
        >>> # Load and prepare iris dataset
        >>> iris = load_iris()
        >>> X = iris.data
        >>> feature_names = iris.feature_names
        >>>
        >>> # Fit PCA
        >>> pca = PCA()
        >>> pca.fit(X)
        >>>
        >>> # Compute and print loading scores
        >>> scores_df = get_loading_scores(pca, feature_names)
        >>> print(scores_df)
    """
    components = pca.components_
    # Derive the component count from the matrix itself so labels always match
    # its rows; labels carry no padding, consistent with "PC1"/"PC2" elsewhere.
    pc_labels = [f"PC{i + 1}" for i in range(components.shape[0])]
    return pd.DataFrame(components.T, columns=pc_labels, index=feature_names)
265+
266+
def plot_loading_scores(loading_scores, figsize=(12, 8)) -> None:
    """Show PCA loading scores as an annotated heatmap.

    Each cell is annotated with its loading score (two decimals) and coloured
    with the "coolwarm" colormap.

    Args:
        loading_scores (pd.DataFrame): Loading scores matrix with features as
            rows and principal components as columns.
        figsize (tuple, optional): Figure size as (width, height).
            Defaults to (12, 8).

    Returns:
        None: The function creates and displays a matplotlib plot.

    Example:
        >>> from sklearn.decomposition import PCA
        >>> from sklearn.datasets import load_iris
        >>> from spotpython.utils.pca import get_loading_scores, plot_loading_scores
        >>>
        >>> # Fit PCA on the iris dataset and get loading scores
        >>> iris = load_iris()
        >>> pca = PCA()
        >>> pca.fit(iris.data)
        >>> scores_df = get_loading_scores(pca, iris.feature_names)
        >>>
        >>> # Create heatmap
        >>> plot_loading_scores(scores_df, figsize=(10, 6))
    """
    heatmap_options = {
        "annot": True,
        "fmt": ".2f",
        "cmap": "coolwarm",
        "cbar_kws": {"label": "Loading Score"},
        "linewidths": 0.5,
    }

    plt.figure(figsize=figsize)
    sns.heatmap(loading_scores, **heatmap_options)
    plt.title("PCA Loading Scores Heatmap")
    plt.xlabel("Principal Components")
    plt.ylabel("Original Features")
    plt.tight_layout()
    plt.show()
6308
7309
8310def pca_analysis (
@@ -16,6 +318,12 @@ def pca_analysis(
16318 """
17319 Perform PCA analysis on a DataFrame with specified scaling.
18320
321+ Notes:
322+ Deprecation Warning:
323+ This function is deprecated and will be removed in a future version.
324+ Use `get_pca`, `plot_pca_scree`, `plot_pca1vs2`, and `get_pca_topk`
325+ instead for more modular control over PCA analysis.
326+
19327 Args:
20328 df (pd.DataFrame):
21329 The input data frame to perform PCA on.
0 commit comments