0.14.46

bartzbeielstein · bartzbeielstein · commit 7c2999323061 · 2024-07-14T15:35:54.000+02:00
csvdataset updated
diff --git a/notebooks/00_spotPython_tests.ipynb b/notebooks/00_spotPython_tests.ipynb
@@ -4399,25 +4399,14 @@
     },
     {
       "cell_type": "code",
-      "execution_count": 1,
-      "metadata": {},
-      "outputs": [
-        {
-          "ename": "NameError",
-          "evalue": "name 'MockDataSet' is not defined",
-          "output_type": "error",
-          "traceback": [
-            "\u001b[0;31m---------------------------------------------------------------------------\u001b[0m",
-            "\u001b[0;31mNameError\u001b[0m                                 Traceback (most recent call last)",
-            "Cell \u001b[0;32mIn[1], line 2\u001b[0m\n\u001b[1;32m      1\u001b[0m \u001b[38;5;28;01mfrom\u001b[39;00m \u001b[38;5;21;01mspotPython\u001b[39;00m\u001b[38;5;21;01m.\u001b[39;00m\u001b[38;5;21;01mutils\u001b[39;00m\u001b[38;5;21;01m.\u001b[39;00m\u001b[38;5;21;01minit\u001b[39;00m \u001b[38;5;28;01mimport\u001b[39;00m get_feature_names\n\u001b[0;32m----> 2\u001b[0m fun_control \u001b[38;5;241m=\u001b[39m {\u001b[38;5;124m\"\u001b[39m\u001b[38;5;124mdata_set\u001b[39m\u001b[38;5;124m\"\u001b[39m: \u001b[43mMockDataSet\u001b[49m(names\u001b[38;5;241m=\u001b[39m[\u001b[38;5;124m\"\u001b[39m\u001b[38;5;124mfeature1\u001b[39m\u001b[38;5;124m\"\u001b[39m, \u001b[38;5;124m\"\u001b[39m\u001b[38;5;124mfeature2\u001b[39m\u001b[38;5;124m\"\u001b[39m, \u001b[38;5;124m\"\u001b[39m\u001b[38;5;124mfeature3\u001b[39m\u001b[38;5;124m\"\u001b[39m])}\n\u001b[1;32m      3\u001b[0m get_feature_names(fun_control)\n",
-            "\u001b[0;31mNameError\u001b[0m: name 'MockDataSet' is not defined"
-          ]
-        }
-      ],
-      "source": [
-        "from spotPython.utils.init import get_feature_names\n",
-        "fun_control = {\"data_set\": MockDataSet(names=[\"feature1\", \"feature2\", \"feature3\"])}\n",
-        "get_feature_names(fun_control)\n"
+      "execution_count": 4,
+      "metadata": {},
+      "outputs": [],
+      "source": [
+        "from sklearn.datasets import load_diabetes\n",
+        "data = load_diabetes(return_X_y=False, as_frame=True)\n",
+        "# saving the data to a csv file\n",
+        "data.frame.to_csv('~/data.csv', index=False)"
       ]
     },
     {
diff --git a/pyproject.toml b/pyproject.toml
@@ -7,7 +7,7 @@ build-backend = "setuptools.build_meta"
 
 [project]
 name = "spotpython"
-version = "0.14.45"
+version = "0.14.46"
 authors = [
   { name="T. Bartz-Beielstein", email="tbb@bartzundbartz.de" }
 ]
diff --git a/src/spotPython/data/csvdataset.py b/src/spotPython/data/csvdataset.py
@@ -8,38 +8,6 @@
 class CSVDataset(Dataset):
     """
     A PyTorch Dataset for handling CSV data.
-
-    Args:
-        filename (str): The path to the CSV file. Defaults to "data.csv".
-        directory (str): The path to the directory where the CSV file is stored. Defaults to None.
-        feature_type (torch.dtype): The data type of the features. Defaults to torch.float.
-        target_column (str): The name of the target column. Defaults to "y".
-        target_type (torch.dtype): The data type of the targets. Defaults to torch.long.
-        train (bool): Whether the dataset is for training or not. Defaults to True.
-        rmNA (bool): Whether to remove rows with NA values or not. Defaults to True.
-        dropId (bool): Whether to drop the "id" column or not. Defaults to False.
-        **desc (Any): Additional keyword arguments.
-
-    Attributes:
-        data (Tensor): The data features.
-        targets (Tensor): The data targets.
-
-    Examples:
-        >>> from torch.utils.data import DataLoader
-            from spotPython.data.csvdataset import CSVDataset
-            import torch
-            dataset = CSVDataset(csv_file='data.csv', target_column='prognosis', feature_type=torch.long)
-            # Set batch size for DataLoader
-            batch_size = 5
-            # Create DataLoader
-            dataloader = DataLoader(dataset, batch_size=batch_size, shuffle=False)
-            # Iterate over the data in the DataLoader
-            for batch in dataloader:
-                inputs, targets = batch
-                print(f"Batch Size: {inputs.size(0)}")
-                print("---------------")
-                print(f"Inputs: {inputs}")
-                print(f"Targets: {targets}")
     """
 
     def __init__(
@@ -48,10 +16,12 @@ def __init__(
         directory: None = None,
         feature_type: torch.dtype = torch.float,
         target_column: str = "y",
-        target_type: torch.dtype = torch.long,
+        target_type: torch.dtype = torch.float,
         train: bool = True,
         rmNA=True,
         dropId=False,
+        oe=OrdinalEncoder(),
+        le=LabelEncoder(),
         **desc,
     ) -> None:
         super().__init__()
@@ -63,6 +33,8 @@ def __init__(
         self.train = train
         self.rmNA = rmNA
         self.dropId = dropId
+        self.oe = oe
+        self.le = le
         self.data, self.targets = self._load_data()
 
     @property
@@ -78,30 +50,48 @@ def _repr_content(self):
         return content
 
     def _load_data(self) -> tuple:
-        # print(f"Loading data from {self.path}")
         df = pd.read_csv(self.path, index_col=False)
-        # rm rows with NA
+
+        # Remove rows with NA if specified
         if self.rmNA:
             df = df.dropna()
-        if self.dropId:
-            df = df.drop(columns=["id"])
 
-        oe = OrdinalEncoder()
-        # Apply LabelEncoder to string columns
-        le = LabelEncoder()
-        # df = df.apply(lambda col: le.fit_transform(col) if col.dtypes == object else col)
+        # Drop the id column if specified
+        if self.dropId and "id" in df.columns:
+            df = df.drop(columns=["id"])
 
         # Split DataFrame into feature and target DataFrames
         feature_df = df.drop(columns=[self.target_column])
-        feature_df = oe.fit_transform(feature_df)
+
+        # Identify non-numerical columns in the feature DataFrame
+        non_numerical_columns = feature_df.select_dtypes(exclude=["number"]).columns.tolist()
+
+        # Apply OrdinalEncoder to non-numerical feature columns
+        if non_numerical_columns:
+            if self.oe is None:
+                raise ValueError(
+                    f"\n!!! non_numerical_columns in data: {non_numerical_columns}"
+                    "\nOrdinalEncoder object oe must be provided for encoding non-numerical columns"
+                )
+            feature_df[non_numerical_columns] = self.oe.fit_transform(feature_df[non_numerical_columns])
+
         target_df = df[self.target_column]
-        # only apply LabelEncoder to target column if it is a string
-        if target_df.dtype == object:
-            target_df = le.fit_transform(target_df)
 
-        # Convert DataFrames to PyTorch tensors
-        feature_tensor = torch.tensor(feature_df, dtype=self.feature_type)
-        target_tensor = torch.tensor(target_df, dtype=self.target_type)
+        # Check if the target column is non-numerical using dtype
+        if not pd.api.types.is_numeric_dtype(target_df):
+            if self.le is None:
+                raise ValueError(
+                    f"\n!!! The target column '{self.target_column}' is non-numerical"
+                    "\nLabelEncoder object le must be provided for encoding non-numerical target"
+                )
+            target_df = self.le.fit_transform(target_df)
+
+        # Convert DataFrames to NumPy arrays and then to PyTorch tensors
+        feature_array = feature_df.to_numpy()
+        target_array = target_df
+
+        feature_tensor = torch.tensor(feature_array, dtype=self.feature_type)
+        target_tensor = torch.tensor(target_array, dtype=self.target_type)
 
         return feature_tensor, target_tensor
 
diff --git a/src/spotPython/data/diabetes.py b/src/spotPython/data/diabetes.py
@@ -11,20 +11,19 @@ class Diabetes(Dataset):
     as well as the response of interest,
     a quantitative measure of disease progression one year after baseline.
     Number of Instances: 442
-    Number of Attributes:
-    First 10 columns are numeric predictive values.
+    Number of Attributes: First 10 columns are numeric predictive values.
     Target: Column 11 is a quantitative measure of disease progression one year after baseline.
     Attribute Information:
-    * age age in years
-    * sex
-    * bmi body mass index
-    * bp average blood pressure
-    * s1 tc, total serum cholesterol
-    * s2 ldl, low-density lipoproteins
-    * s3 hdl, high-density lipoproteins
-    * s4 tch, total cholesterol / HDL
-    * s5 ltg, possibly log of serum triglycerides level
-    * s6 glu, blood sugar level
+        * age age in years
+        * sex
+        * bmi body mass index
+        * bp average blood pressure
+        * s1 tc, total serum cholesterol
+        * s2 ldl, low-density lipoproteins
+        * s3 hdl, high-density lipoproteins
+        * s4 tch, total cholesterol / HDL
+        * s5 ltg, possibly log of serum triglycerides level
+        * s6 glu, blood sugar level
 
     Args:
         feature_type (torch.dtype): The data type of the features. Defaults to torch.float.
diff --git a/src/spotPython/hyperdict/light_hyper_dict.json b/src/spotPython/hyperdict/light_hyper_dict.json
@@ -307,5 +307,102 @@
             "lower": 0,
             "upper": 2
         }
+    },
+    "NNLinearRegressor": {
+        "l1": {
+            "type": "int",
+            "default": 3,
+            "transform": "transform_power_2_int",
+            "lower": 3,
+            "upper": 8
+        },
+        "epochs": {
+            "type": "int",
+            "default": 4,
+            "transform": "transform_power_2_int",
+            "lower": 4,
+            "upper": 9
+        },
+        "batch_size": {
+            "type": "int",
+            "default": 4,
+            "transform": "transform_power_2_int",
+            "lower": 1,
+            "upper": 4
+        },
+        "act_fn": {
+            "levels": [
+                "Sigmoid",
+                "Tanh",
+                "ReLU",
+                "LeakyReLU",
+                "ELU",
+                "Swish"
+            ],
+            "type": "factor",
+            "default": "ReLU",
+            "transform": "None",
+            "class_name": "spotPython.torch.activation",
+            "core_model_parameter_type": "instance()",
+            "lower": 0,
+            "upper": 5
+        },
+        "optimizer": {
+            "levels": [
+                "Adadelta",
+                "Adagrad",
+                "Adam",
+                "AdamW",
+                "SparseAdam",
+                "Adamax",
+                "ASGD",
+                "NAdam",
+                "RAdam",
+                "RMSprop",
+                "Rprop",
+                "SGD"
+            ],
+            "type": "factor",
+            "default": "SGD",
+            "transform": "None",
+            "class_name": "torch.optim",
+            "core_model_parameter_type": "str",
+            "lower": 0,
+            "upper": 11
+        },
+        "dropout_prob": {
+            "type": "float",
+            "default": 0.01,
+            "transform": "None",
+            "lower": 0.0,
+            "upper": 0.25
+        },
+        "lr_mult": {
+            "type": "float",
+            "default": 1.0,
+            "transform": "None",
+            "lower": 0.1,
+            "upper": 10.0
+        },
+        "patience": {
+            "type": "int",
+            "default": 2,
+            "transform": "transform_power_2_int",
+            "lower": 2,
+            "upper": 6
+        },
+        "initialization": {
+            "levels": [
+                "Default",
+                "Kaiming",
+                "Xavier"
+            ],
+            "type": "factor",
+            "default": "Default",
+            "transform": "None",
+            "core_model_parameter_type": "str",
+            "lower": 0,
+            "upper": 2
+        }
     }
 }
diff --git a/test/test_csv_dataset.py b/test/test_csv_dataset.py
@@ -0,0 +1,51 @@
+from spotPython.data.csvdataset import CSVDataset
+import pytest
+import torch
+from sklearn.preprocessing import OrdinalEncoder, LabelEncoder
+
+
+def create_mock_csv(data: str, filename: str):
+    with open(filename, "w") as f:
+        f.write(data)
+
+
+@pytest.fixture
+def mock_csv_file(tmp_path):
+    data = """id,feature1,feature2,feature3,y
+              1,A,10.1,100.1,positive
+              2,B,20.2,200.2,negative
+              3,C,30.3,300.3,positive
+              4,D,40.4,400.4,negative
+              5,E,50.5,500.5,positive"""
+    filename = tmp_path / "data.csv"
+    create_mock_csv(data, filename)
+    return filename
+
+
+def test_csvdataset_remove_na(mock_csv_file):
+    # Add a row with NA values
+    data_with_na = """id,feature1,feature2,feature3,y
+                     1,A,10.1,100.1,positive
+                     2,B,20.2,200.2,negative
+                     3,C,30.3,300.3,positive
+                     4,D,,400.4,negative
+                     5,E,50.5,500.5,positive"""
+    temp_dir = mock_csv_file.parent
+    filename_na = temp_dir / "data_with_na.csv"
+    create_mock_csv(data_with_na, filename_na)
+
+    dataset = CSVDataset(filename=filename_na, target_column="y", rmNA=True, oe=OrdinalEncoder(), le=LabelEncoder())
+    assert len(dataset) == 4  # One row with NA should be removed
+    assert dataset.data.shape[0] == 4  # Four rows left
+
+
+def test_csvdataset_non_numerical_target(mock_csv_file):
+    dataset = CSVDataset(
+        filename=mock_csv_file, target_column="y", target_type=torch.long, oe=OrdinalEncoder(), le=LabelEncoder()
+    )
+    assert len(set(dataset.targets.tolist())) == 2  # There should be two unique target classes after label encoding
+
+
+def test_csvdataset_len(mock_csv_file):
+    dataset = CSVDataset(filename=mock_csv_file, target_column="y", oe=OrdinalEncoder(), le=LabelEncoder())
+    assert len(dataset) == 5  # Check the correct length

Original file line number	Diff line number	Diff line change
`@@ -7,7 +7,7 @@ build-backend = "setuptools.build_meta"`
`7`	`7`
`8`	`8`	`[project]`
`9`	`9`	`name = "spotpython"`
`10`		`-version = "0.14.45"`
	`10`	`+version = "0.14.46"`
`11`	`11`	`authors = [`
`12`	`12`	`{ name="T. Bartz-Beielstein", email="tbb@bartzundbartz.de" }`
`13`	`13`	`]`