sequential-parameter-optimization
diff --git a/‎notebooks/00_spotPython_tests.ipynb‎
Lines changed: 22 additions & 182 deletions b/‎notebooks/00_spotPython_tests.ipynb‎
Lines changed: 22 additions & 182 deletions
diff --git a/‎pyproject.toml‎
Lines changed: 2 additions & 1 deletion b/‎pyproject.toml‎
Lines changed: 2 additions & 1 deletion
diff --git a/‎src/spotpython/utils/stats.py‎
Lines changed: 272 additions & 0 deletions b/‎src/spotpython/utils/stats.py‎
Lines changed: 272 additions & 0 deletions
diff --git a/‎test/test_get_all_lm.py‎
Lines changed: 48 additions & 0 deletions b/‎test/test_get_all_lm.py‎
Lines changed: 48 additions & 0 deletions
diff --git a/‎test/test_get_vars_from_formula.py‎
Lines changed: 31 additions & 0 deletions b/‎test/test_get_vars_from_formula.py‎
Lines changed: 31 additions & 0 deletions
@@ -7,7 +7,7 @@ build-backend = "setuptools.build_meta"
 
 [project]
 name = "spotpython"
-version = "0.24.31"
+version = "0.24.32"
 authors = [
   { name="T. Bartz-Beielstein", email="tbb@bartzundbartz.de" }
 ]
@@ -48,6 +48,7 @@ dependencies = [
   "scipy",
   "spotriver>=0.4.1",
   "seaborn",
+  "statsmodels",
   "tabulate",
   "tensorboard",
   "torch",
 
@@ -2,6 +2,11 @@
 import numpy as np
 from scipy.stats import norm, t
 from numpy.linalg import pinv, inv, LinAlgError
+import copy
+import itertools
+import matplotlib.pyplot as plt
+import seaborn as sns
+from statsmodels.formula.api import ols
 
 
 def cov_to_cor(covariance_matrix) -> np.ndarray:
@@ -199,3 +204,270 @@ def pairwise_semi_partial_correlation(x, y, z, method="pearson"):
         "gp": spcor_result["gp"],
         "method": method,
     }
+
+
+def get_all_vars_from_formula(formula) -> list:
+    """Extract the variable names from a simple additive model formula.
+
+    The formula is split at ``~`` and the right-hand side is split at ``+``.
+    Surrounding whitespace is stripped from every name, and empty terms
+    (for example from a trailing ``+``) are dropped. No other formula syntax
+    (interactions, transformations, ``-`` terms) is interpreted.
+
+    Args:
+        formula (str): A formula of the form ``"y ~ x1 + x2"``.
+
+    Returns:
+        list: The dependent variable followed by the independent variables,
+            in the order in which they appear in the formula.
+
+    Raises:
+        ValueError: If the formula does not contain exactly one ``~``.
+
+    Examples:
+        >>> from spotpython.utils.stats import get_all_vars_from_formula
+        >>> get_all_vars_from_formula("y ~ x1 + x2")
+        ['y', 'x1', 'x2']
+        >>> get_all_vars_from_formula("y ~ ")
+        ['y']
+    """
+    parts = formula.split("~")
+    if len(parts) != 2:
+        raise ValueError(f"Formula must contain exactly one '~', got: {formula!r}")
+    dependent, independent = parts
+    # Drop empty terms so that "y ~ " and "y ~ x1 + " do not yield '' entries
+    terms = [term.strip() for term in independent.split("+")]
+    return [dependent.strip()] + [term for term in terms if term]
+
+
+def _lm_summary_row(label, model) -> list:
+    """Build one result row from a fitted model, reading the entry at position 1 of its parameter vector."""
+    # conf_int() has one row per parameter; row 1 holds the (lower, upper) bounds
+    conf_low, conf_high = model.conf_int().iloc[1]
+    return [label, model.params.iloc[1], conf_low, conf_high, model.pvalues.iloc[1], model.aic, len(model.resid)]
+
+
+def fit_all_lm(basic, xlist, data, remove_na=True) -> dict:
+    """Fit a linear regression model for all possible combinations of independent variables.
+
+    First the basic model is fitted, then one model per non-empty combination
+    of the variables in ``xlist``, each obtained by appending the combination
+    to the basic formula. For every model the parameter at position 1 of the
+    fitted parameter vector is reported (presumably the first term after the
+    intercept, i.e. the first predictor of ``basic`` -- confirm for formulas
+    without an intercept or with categorical terms).
+
+    Args:
+        basic (str): The basic model formula, e.g. ``"y ~ x1"``.
+        xlist (list): A list of additional independent variables.
+        data (pandas.DataFrame): The data frame containing the variables.
+            It is not modified.
+        remove_na (bool): Whether to remove rows with missing values in any
+            of the used columns before fitting.
+
+    Returns:
+        dict: A dictionary with the keys
+            * ``"estimate"``: a pandas.DataFrame with the columns ``variables``,
+              ``estimate``, ``conf_low``, ``conf_high``, ``p``, ``aic`` and ``n``.
+              The first row is labelled ``"basic"``; the other rows are labelled
+              with the comma-separated names of the added variables.
+            * ``"xlist"``: the ``xlist`` argument.
+            * ``"fun"``: the string ``"all_lm"``.
+            * ``"basic"``: the basic model formula.
+            * ``"family"``: the string ``"lm"``.
+
+    Examples:
+        >>> from spotpython.utils.stats import fit_all_lm
+        >>> import pandas as pd
+        >>> data = pd.DataFrame({
+        ...     'y': [1, 2, 3],
+        ...     'x1': [4, 5, 6],
+        ...     'x2': [7, 8, 9]
+        ... })
+        >>> result = fit_all_lm("y ~ x1", ["x2"], data)
+        >>> list(result["estimate"]["variables"])
+        ['basic', 'x2']
+    """
+    columns = ["variables", "estimate", "conf_low", "conf_high", "p", "aic", "n"]
+    extra_vars = list(xlist)
+    # Selecting columns with a list yields a new DataFrame, so the caller's data stays untouched
+    data = data[get_all_vars_from_formula(basic) + extra_vars]
+    if remove_na:
+        data = data.dropna()
+
+    # Basic model first, then every non-empty combination ordered by size
+    rows = [_lm_summary_row("basic", ols(basic, data=data).fit())]
+    for size in range(1, len(extra_vars) + 1):
+        for comb in itertools.combinations(extra_vars, size):
+            model = ols(f"{basic} + {' + '.join(comb)}", data=data).fit()
+            rows.append(_lm_summary_row(", ".join(comb), model))
+
+    estimates = pd.DataFrame(rows, columns=columns)
+    return {"estimate": estimates, "xlist": xlist, "fun": "all_lm", "basic": basic, "family": "lm"}
+
+
+def plot_coeff_vs_pvals(data, xlabels=None, xlim=(0, 1), xlab="p-value", ylim=None, ylab=None, xscale_log=True, yscale_log=False, title=None, show=True) -> None:
+    """Scatter the coefficient estimates produced by fit_all_lm against their p-values.
+
+    The p-values are power-transformed before plotting so that p = 0.05 is
+    drawn at x = 0.5, which is where the dashed vertical line is placed.
+    The input dictionary is deep-copied, so it is not modified.
+
+    Args:
+        data (dict):
+            A dictionary containing the estimated coefficients, p-values, and other information.
+            Generated by the fit_all_lm function. The keys "estimate" and "fun" are read.
+        xlabels (list):
+            The p-values to label on the x-axis. Defaults to
+            [0, 0.001, 0.01, 0.05, 0.2, 0.5, 1].
+        xlim (tuple):
+            A tuple of the x-axis limits.
+        xlab (str):
+            The x-axis label.
+        ylim (tuple):
+            A tuple of the y-axis limits. If None, limits are derived from
+            the largest absolute estimate.
+        ylab (str):
+            The y-axis label. If None, it is chosen based on data["fun"].
+        xscale_log (bool):
+            Whether to use a log scale on the x-axis.
+        yscale_log (bool):
+            Whether to use a log scale on the y-axis.
+        title (str):
+            The plot title.
+        show (bool):
+            Whether to display the plot.
+
+    Returns:
+        None
+
+    Notes:
+        * Based on the R package 'allestimates' by Zhiqiang Wang, see https://cran.r-project.org/package=allestimates
+
+    References:
+        Wang, Z. (2007). Two Postestimation Commands for Assessing Confounding Effects in Epidemiological Studies. The Stata Journal, 7(2), 183-196. https://doi.org/10.1177/1536867X0700700203
+
+    Examples:
+        >>> from spotpython.utils.stats import plot_coeff_vs_pvals, fit_all_lm
+        >>> import pandas as pd
+        >>> data = pd.DataFrame({
+        >>>     'y': [1, 2, 3],
+        >>>     'x1': [4, 5, 6],
+        >>>     'x2': [7, 8, 9]
+        >>> })
+        >>> estimates = fit_all_lm("y ~ x1", ["x2"], data)
+        >>> plot_coeff_vs_pvals(estimates)
+    """
+    data = copy.deepcopy(data)
+    tick_labels = [0, 0.001, 0.01, 0.05, 0.2, 0.5, 1] if xlabels is None else xlabels
+    # Exponent of the power transform that sends p = 0.05 to 0.5
+    exponent = np.log(0.5) / np.log(0.05)
+    tick_positions = np.power(tick_labels, exponent)
+
+    estimates = data["estimate"]
+    is_lm = data["fun"] == "all_lm"
+    if ylab is None:
+        ylab = "Coefficient" if is_lm else "Effect estimates"
+    # Reference line: 0 for linear-model coefficients, 1 otherwise
+    reference = 0 if is_lm else 1
+
+    estimates["p_value"] = np.power(estimates["p"], exponent)
+    if ylim is None:
+        bound = max(estimates["estimate"].max(), abs(estimates["estimate"].min()))
+        if is_lm:
+            ylim = (-bound, bound)
+        else:
+            ylim = (1 / bound, bound)
+
+    plt.figure(figsize=(10, 6))
+    sns.scatterplot(data=estimates, x="p_value", y="estimate")
+    if xscale_log:
+        plt.xscale("log")
+    if yscale_log:
+        plt.yscale("log")
+    plt.xticks(ticks=tick_positions, labels=tick_labels)
+    plt.axvline(x=0.5, linestyle="--")
+    plt.axhline(y=reference, linestyle="--")
+    plt.xlim(xlim)
+    plt.ylim(ylim)
+    plt.xlabel(xlab)
+    plt.ylabel(ylab)
+    if title:
+        plt.title(title)
+    plt.grid(True)
+    if show:
+        plt.show()
+
+
+def plot_coeff_vs_pvals_by_included(data, xlabels=None, xlim=(0, 1), xlab="P value", ylim=None, ylab=None, yscale_log=False, title=None, grid=True, ncol=2, show=True) -> None:
+    """
+    Generates a panel of scatter plots with effect estimates of all possible models against p-values.
+    Uses a dictionary generated by the fit_all_lm function.
+    There is one panel per variable in ``data["xlist"]``; in each panel the models
+    that include that variable are coloured differently from those that do not.
+    The input dictionary and its DataFrame are not modified.
+
+    Args:
+        data (dict): A dictionary, generated by the fit_all_lm function, containing the following keys:
+            - estimate (pd.DataFrame): A DataFrame containing the estimates. Its
+              ``variables`` column holds the model labels, with the names of the
+              included variables separated by ", ".
+            - xlist (list): A list of variables.
+            - fun (str): The function name.
+        xlabels (list): The p-values to label on the x-axis.
+            Defaults to [0, 0.001, 0.01, 0.05, 0.2, 0.5, 1].
+        xlim (tuple): The x-axis limits.
+        xlab (str): The x-axis label.
+        ylim (tuple): The y-axis limits. If None, limits are derived from the largest absolute estimate.
+        ylab (str): The y-axis label. If None, it is chosen based on ``data["fun"]``.
+        yscale_log (bool): Whether to use a log scale on the y-axis. Default is False.
+        title (str): The title of the plot.
+        grid (bool): Whether to display gridlines. Default is True.
+        ncol (int): Number of columns in the plot grid. Default is 2.
+        show (bool): Whether to display the plot. Default is True.
+
+    Returns:
+        None
+
+    Notes:
+        * Based on the R package 'allestimates' by Zhiqiang Wang, see https://cran.r-project.org/package=allestimates
+
+    References:
+        Wang, Z. (2007). Two Postestimation Commands for Assessing Confounding Effects in Epidemiological Studies. The Stata Journal, 7(2), 183-196. https://doi.org/10.1177/1536867X0700700203
+
+    Examples:
+        >>> import pandas as pd
+        >>> from spotpython.utils.stats import plot_coeff_vs_pvals_by_included
+        >>> data = {
+        ...     "estimate": pd.DataFrame({
+        ...         "variables": ["Crude", "AL", "AM", "AN", "AO"],
+        ...         "estimate": [0.5, 0.6, 0.7, 0.8, 0.9],
+        ...         "conf_low": [0.1, 0.2, 0.3, 0.4, 0.5],
+        ...         "conf_high": [0.9, 1.0, 1.1, 1.2, 1.3],
+        ...         "p": [0.01, 0.02, 0.03, 0.04, 0.05],
+        ...         "aic": [100, 200, 300, 400, 500],
+        ...         "n": [10, 20, 30, 40, 50]
+        ...     }),
+        ...     "xlist": ["AL", "AM", "AN", "AO"],
+        ...     "fun": "all_lm"
+        ... }
+        >>> plot_coeff_vs_pvals_by_included(data)
+    """
+    if xlabels is None:
+        xlabels = [0, 0.001, 0.01, 0.05, 0.2, 0.5, 1]
+    # Exponent of the power transform that sends p = 0.05 to 0.5
+    exponent = np.log(0.5) / np.log(0.05)
+    xbreaks = np.power(xlabels, exponent)
+
+    # Work on a copy so that the caller's DataFrame does not gain a "p_value" column
+    result_df = data["estimate"].copy()
+    fun = data["fun"]
+    if ylab is None:
+        ylab = {"all_lm": "Coefficient", "poisson": "Rate ratio", "binomial": "Odds ratio"}.get(fun, "Effect estimates")
+
+    # Reference line: 0 for linear-model coefficients, 1 otherwise
+    hline = 0 if fun == "all_lm" else 1
+
+    result_df["p_value"] = np.power(result_df["p"], exponent)
+    if ylim is None:
+        maxv = max(result_df["estimate"].max(), abs(result_df["estimate"].min()))
+        if fun == "all_lm":
+            ylim = (-maxv, maxv)
+        else:
+            ylim = (1 / maxv, maxv)
+
+    # Mark inclusion by exact name: split the ", "-joined labels instead of using a
+    # regex/substring search, so that e.g. "x1" is not matched by a model containing "x10"
+    included_names = result_df["variables"].str.split(", ")
+    mark_df = pd.DataFrame(
+        {x: included_names.apply(lambda names, x=x: int(x in names)) for x in data["xlist"]},
+        index=result_df.index,
+    )
+    df_scatter = pd.concat([result_df, mark_df], axis=1)
+
+    # Melt the DataFrame for plotting: one row per (model, variable) pair
+    df_long = df_scatter.melt(id_vars=["variables", "estimate", "conf_low", "conf_high", "p", "aic", "n", "p_value"], value_vars=data["xlist"], var_name="variable", value_name="inclusion")
+    df_long["inclusion"] = df_long["inclusion"].apply(lambda x: "Included" if x > 0 else "Not included")
+
+    # Plotting
+    g = sns.FacetGrid(df_long, col="variable", hue="inclusion", palette={"Included": "blue", "Not included": "orange"}, col_wrap=ncol, height=4, sharex=False, sharey=False)
+    g.map(sns.scatterplot, "p_value", "estimate")
+    g.add_legend()
+    for ax in g.axes.flat:
+        ax.set_xticks(xbreaks)
+        ax.set_xticklabels(xlabels)
+        ax.set_xlim(xlim)
+        ax.set_ylim(ylim)
+        ax.axvline(x=0.5, linestyle="--", linewidth=1.5, color="black")  # Black dashed vertical line
+        ax.axhline(y=hline, linestyle="--", linewidth=1.5, color="black")  # Black dashed horizontal line
+        if grid:
+            ax.grid(True)
+    if yscale_log:
+        g.set(yscale="log")
+    g.set_axis_labels(xlab, ylab)
+    g.set_titles("{col_name}")
+    if title:
+        # Leave room above the panels for the figure-level title
+        plt.subplots_adjust(top=0.9)
+        g.figure.suptitle(title)
+    if show:
+        plt.show()
@@ -0,0 +1,48 @@
+import pytest
+import pandas as pd
+from spotpython.utils.stats import fit_all_lm
+
+def test_fit_all_lm():
+    """Check the model labels and metadata returned by fit_all_lm for three small data sets."""
+    # Each scenario: (data columns, extra predictors, keyword arguments,
+    #                 expected model labels, expected n of the basic model or None)
+    scenarios = [
+        # Basic model with one independent variable
+        (
+            {'y': [1, 2, 3], 'x1': [4, 5, 6], 'x2': [7, 8, 9]},
+            ["x2"],
+            {},
+            ['basic', 'x2'],
+            None,
+        ),
+        # Model with multiple independent variables
+        (
+            {'y': [1, 2, 3, 4], 'x1': [4, 5, 6, 7], 'x2': [7, 8, 9, 10], 'x3': [10, 11, 12, 13]},
+            ["x2", "x3"],
+            {},
+            ['basic', 'x2', 'x3', 'x2, x3'],
+            None,
+        ),
+        # Model with missing values: the incomplete row must be dropped
+        (
+            {'y': [1, 2, None, 4], 'x1': [4, 5, 6, 7], 'x2': [7, 8, 9, 10]},
+            ["x2"],
+            {"remove_na": True},
+            ['basic', 'x2'],
+            3,
+        ),
+    ]
+
+    for columns, extras, kwargs, expected_vars, expected_n in scenarios:
+        result = fit_all_lm("y ~ x1", extras, pd.DataFrame(columns), **kwargs)
+        assert list(result['estimate']['variables']) == expected_vars
+        assert result['fun'] == 'all_lm'
+        assert result['basic'] == 'y ~ x1'
+        assert result['family'] == 'lm'
+        if expected_n is not None:
+            assert result['estimate']['n'].iloc[0] == expected_n  # Check if missing values were removed
+
+if __name__ == "__main__":
+    # Run only the tests of this file (a bare pytest.main() collects from the
+    # current working directory) and hand pytest's exit code to the shell.
+    raise SystemExit(pytest.main([__file__]))
@@ -0,0 +1,31 @@
+import pytest
+from spotpython.utils.stats import get_all_vars_from_formula
+
+def test_get_all_vars_from_formula():
+    """Check get_all_vars_from_formula against a table of formulas and expected variable lists."""
+    cases = [
+        # Simple formula
+        ("y ~ x1 + x2", ['y', 'x1', 'x2']),
+        # Formula with extra spaces
+        ("  y  ~  x1  +  x2  ", ['y', 'x1', 'x2']),
+        # Formula with multiple independent variables
+        ("y ~ x1 + x2 + x3 + x4", ['y', 'x1', 'x2', 'x3', 'x4']),
+        # Formula with no independent variables
+        ("y ~ ", ['y']),
+        # Formula with only one independent variable
+        ("y ~ x1", ['y', 'x1']),
+    ]
+    for formula, expected_vars in cases:
+        assert get_all_vars_from_formula(formula) == expected_vars
+
+if __name__ == "__main__":
+    # Run only the tests of this file (a bare pytest.main() collects from the
+    # current working directory) and hand pytest's exit code to the shell.
+    raise SystemExit(pytest.main([__file__]))