Skip to content

Commit 7c29993

Browse files
0.14.46
csvdataset updated
1 parent edb8139 commit 7c29993

6 files changed

Lines changed: 206 additions & 80 deletions

File tree

notebooks/00_spotPython_tests.ipynb

Lines changed: 8 additions & 19 deletions
Original file line numberDiff line numberDiff line change
@@ -4399,25 +4399,14 @@
43994399
},
44004400
{
44014401
"cell_type": "code",
4402-
"execution_count": 1,
4403-
"metadata": {},
4404-
"outputs": [
4405-
{
4406-
"ename": "NameError",
4407-
"evalue": "name 'MockDataSet' is not defined",
4408-
"output_type": "error",
4409-
"traceback": [
4410-
"\u001b[0;31m---------------------------------------------------------------------------\u001b[0m",
4411-
"\u001b[0;31mNameError\u001b[0m Traceback (most recent call last)",
4412-
"Cell \u001b[0;32mIn[1], line 2\u001b[0m\n\u001b[1;32m 1\u001b[0m \u001b[38;5;28;01mfrom\u001b[39;00m \u001b[38;5;21;01mspotPython\u001b[39;00m\u001b[38;5;21;01m.\u001b[39;00m\u001b[38;5;21;01mutils\u001b[39;00m\u001b[38;5;21;01m.\u001b[39;00m\u001b[38;5;21;01minit\u001b[39;00m \u001b[38;5;28;01mimport\u001b[39;00m get_feature_names\n\u001b[0;32m----> 2\u001b[0m fun_control \u001b[38;5;241m=\u001b[39m {\u001b[38;5;124m\"\u001b[39m\u001b[38;5;124mdata_set\u001b[39m\u001b[38;5;124m\"\u001b[39m: \u001b[43mMockDataSet\u001b[49m(names\u001b[38;5;241m=\u001b[39m[\u001b[38;5;124m\"\u001b[39m\u001b[38;5;124mfeature1\u001b[39m\u001b[38;5;124m\"\u001b[39m, \u001b[38;5;124m\"\u001b[39m\u001b[38;5;124mfeature2\u001b[39m\u001b[38;5;124m\"\u001b[39m, \u001b[38;5;124m\"\u001b[39m\u001b[38;5;124mfeature3\u001b[39m\u001b[38;5;124m\"\u001b[39m])}\n\u001b[1;32m 3\u001b[0m get_feature_names(fun_control)\n",
4413-
"\u001b[0;31mNameError\u001b[0m: name 'MockDataSet' is not defined"
4414-
]
4415-
}
4416-
],
4417-
"source": [
4418-
"from spotPython.utils.init import get_feature_names\n",
4419-
"fun_control = {\"data_set\": MockDataSet(names=[\"feature1\", \"feature2\", \"feature3\"])}\n",
4420-
"get_feature_names(fun_control)\n"
4402+
"execution_count": 4,
4403+
"metadata": {},
4404+
"outputs": [],
4405+
"source": [
4406+
"from sklearn.datasets import load_diabetes\n",
4407+
"data = load_diabetes(return_X_y=False, as_frame=True)\n",
4408+
"# saving the data to a csv file\n",
4409+
"data.frame.to_csv('~/data.csv', index=False)"
44214410
]
44224411
},
44234412
{

pyproject.toml

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -7,7 +7,7 @@ build-backend = "setuptools.build_meta"
77

88
[project]
99
name = "spotpython"
10-
version = "0.14.45"
10+
version = "0.14.46"
1111
authors = [
1212
{ name="T. Bartz-Beielstein", email="tbb@bartzundbartz.de" }
1313
]

src/spotPython/data/csvdataset.py

Lines changed: 38 additions & 48 deletions
Original file line numberDiff line numberDiff line change
@@ -8,38 +8,6 @@
88
class CSVDataset(Dataset):
99
"""
1010
A PyTorch Dataset for handling CSV data.
11-
12-
Args:
13-
filename (str): The path to the CSV file. Defaults to "data.csv".
14-
directory (str): The path to the directory where the CSV file is stored. Defaults to None.
15-
feature_type (torch.dtype): The data type of the features. Defaults to torch.float.
16-
target_column (str): The name of the target column. Defaults to "y".
17-
target_type (torch.dtype): The data type of the targets. Defaults to torch.long.
18-
train (bool): Whether the dataset is for training or not. Defaults to True.
19-
rmNA (bool): Whether to remove rows with NA values or not. Defaults to True.
20-
dropId (bool): Whether to drop the "id" column or not. Defaults to False.
21-
**desc (Any): Additional keyword arguments.
22-
23-
Attributes:
24-
data (Tensor): The data features.
25-
targets (Tensor): The data targets.
26-
27-
Examples:
28-
>>> from torch.utils.data import DataLoader
29-
from spotPython.data.csvdataset import CSVDataset
30-
import torch
31-
dataset = CSVDataset(csv_file='data.csv', target_column='prognosis', feature_type=torch.long)
32-
# Set batch size for DataLoader
33-
batch_size = 5
34-
# Create DataLoader
35-
dataloader = DataLoader(dataset, batch_size=batch_size, shuffle=False)
36-
# Iterate over the data in the DataLoader
37-
for batch in dataloader:
38-
inputs, targets = batch
39-
print(f"Batch Size: {inputs.size(0)}")
40-
print("---------------")
41-
print(f"Inputs: {inputs}")
42-
print(f"Targets: {targets}")
4311
"""
4412

4513
def __init__(
@@ -48,10 +16,12 @@ def __init__(
4816
directory: None = None,
4917
feature_type: torch.dtype = torch.float,
5018
target_column: str = "y",
51-
target_type: torch.dtype = torch.long,
19+
target_type: torch.dtype = torch.float,
5220
train: bool = True,
5321
rmNA=True,
5422
dropId=False,
23+
oe=OrdinalEncoder(),
24+
le=LabelEncoder(),
5525
**desc,
5626
) -> None:
5727
super().__init__()
@@ -63,6 +33,8 @@ def __init__(
6333
self.train = train
6434
self.rmNA = rmNA
6535
self.dropId = dropId
36+
self.oe = oe
37+
self.le = le
6638
self.data, self.targets = self._load_data()
6739

6840
@property
@@ -78,30 +50,48 @@ def _repr_content(self):
7850
return content
7951

8052
def _load_data(self) -> tuple:
81-
# print(f"Loading data from {self.path}")
8253
df = pd.read_csv(self.path, index_col=False)
83-
# rm rows with NA
54+
55+
# Remove rows with NA if specified
8456
if self.rmNA:
8557
df = df.dropna()
86-
if self.dropId:
87-
df = df.drop(columns=["id"])
8858

89-
oe = OrdinalEncoder()
90-
# Apply LabelEncoder to string columns
91-
le = LabelEncoder()
92-
# df = df.apply(lambda col: le.fit_transform(col) if col.dtypes == object else col)
59+
# Drop the id column if specified
60+
if self.dropId and "id" in df.columns:
61+
df = df.drop(columns=["id"])
9362

9463
# Split DataFrame into feature and target DataFrames
9564
feature_df = df.drop(columns=[self.target_column])
96-
feature_df = oe.fit_transform(feature_df)
65+
66+
# Identify non-numerical columns in the feature DataFrame
67+
non_numerical_columns = feature_df.select_dtypes(exclude=["number"]).columns.tolist()
68+
69+
# Apply OrdinalEncoder to non-numerical feature columns
70+
if non_numerical_columns:
71+
if self.oe is None:
72+
raise ValueError(
73+
f"\n!!! non_numerical_columns in data: {non_numerical_columns}"
74+
"\nOrdinalEncoder object oe must be provided for encoding non-numerical columns"
75+
)
76+
feature_df[non_numerical_columns] = self.oe.fit_transform(feature_df[non_numerical_columns])
77+
9778
target_df = df[self.target_column]
98-
# only apply LabelEncoder to target column if it is a string
99-
if target_df.dtype == object:
100-
target_df = le.fit_transform(target_df)
10179

102-
# Convert DataFrames to PyTorch tensors
103-
feature_tensor = torch.tensor(feature_df, dtype=self.feature_type)
104-
target_tensor = torch.tensor(target_df, dtype=self.target_type)
80+
# Check if the target column is non-numerical using dtype
81+
if not pd.api.types.is_numeric_dtype(target_df):
82+
if self.le is None:
83+
raise ValueError(
84+
f"\n!!! The target column '{self.target_column}' is non-numerical"
85+
"\nLabelEncoder object le must be provided for encoding non-numerical target"
86+
)
87+
target_df = self.le.fit_transform(target_df)
88+
89+
# Convert DataFrames to NumPy arrays and then to PyTorch tensors
90+
feature_array = feature_df.to_numpy()
91+
target_array = target_df
92+
93+
feature_tensor = torch.tensor(feature_array, dtype=self.feature_type)
94+
target_tensor = torch.tensor(target_array, dtype=self.target_type)
10595

10696
return feature_tensor, target_tensor
10797

src/spotPython/data/diabetes.py

Lines changed: 11 additions & 12 deletions
Original file line numberDiff line numberDiff line change
@@ -11,20 +11,19 @@ class Diabetes(Dataset):
1111
as well as the response of interest,
1212
a quantitative measure of disease progression one year after baseline.
1313
Number of Instances: 442
14-
Number of Attributes:
15-
First 10 columns are numeric predictive values.
14+
Number of Attributes: First 10 columns are numeric predictive values.
1615
Target: Column 11 is a quantitative measure of disease progression one year after baseline.
1716
Attribute Information:
18-
* age age in years
19-
* sex
20-
* bmi body mass index
21-
* bp average blood pressure
22-
* s1 tc, total serum cholesterol
23-
* s2 ldl, low-density lipoproteins
24-
* s3 hdl, high-density lipoproteins
25-
* s4 tch, total cholesterol / HDL
26-
* s5 ltg, possibly log of serum triglycerides level
27-
* s6 glu, blood sugar level
17+
* age age in years
18+
* sex
19+
* bmi body mass index
20+
* bp average blood pressure
21+
* s1 tc, total serum cholesterol
22+
* s2 ldl, low-density lipoproteins
23+
* s3 hdl, high-density lipoproteins
24+
* s4 tch, total cholesterol / HDL
25+
* s5 ltg, possibly log of serum triglycerides level
26+
* s6 glu, blood sugar level
2827
2928
Args:
3029
feature_type (torch.dtype): The data type of the features. Defaults to torch.float.

src/spotPython/hyperdict/light_hyper_dict.json

Lines changed: 97 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -307,5 +307,102 @@
307307
"lower": 0,
308308
"upper": 2
309309
}
310+
},
311+
"NNLinearRegressor": {
312+
"l1": {
313+
"type": "int",
314+
"default": 3,
315+
"transform": "transform_power_2_int",
316+
"lower": 3,
317+
"upper": 8
318+
},
319+
"epochs": {
320+
"type": "int",
321+
"default": 4,
322+
"transform": "transform_power_2_int",
323+
"lower": 4,
324+
"upper": 9
325+
},
326+
"batch_size": {
327+
"type": "int",
328+
"default": 4,
329+
"transform": "transform_power_2_int",
330+
"lower": 1,
331+
"upper": 4
332+
},
333+
"act_fn": {
334+
"levels": [
335+
"Sigmoid",
336+
"Tanh",
337+
"ReLU",
338+
"LeakyReLU",
339+
"ELU",
340+
"Swish"
341+
],
342+
"type": "factor",
343+
"default": "ReLU",
344+
"transform": "None",
345+
"class_name": "spotPython.torch.activation",
346+
"core_model_parameter_type": "instance()",
347+
"lower": 0,
348+
"upper": 5
349+
},
350+
"optimizer": {
351+
"levels": [
352+
"Adadelta",
353+
"Adagrad",
354+
"Adam",
355+
"AdamW",
356+
"SparseAdam",
357+
"Adamax",
358+
"ASGD",
359+
"NAdam",
360+
"RAdam",
361+
"RMSprop",
362+
"Rprop",
363+
"SGD"
364+
],
365+
"type": "factor",
366+
"default": "SGD",
367+
"transform": "None",
368+
"class_name": "torch.optim",
369+
"core_model_parameter_type": "str",
370+
"lower": 0,
371+
"upper": 11
372+
},
373+
"dropout_prob": {
374+
"type": "float",
375+
"default": 0.01,
376+
"transform": "None",
377+
"lower": 0.0,
378+
"upper": 0.25
379+
},
380+
"lr_mult": {
381+
"type": "float",
382+
"default": 1.0,
383+
"transform": "None",
384+
"lower": 0.1,
385+
"upper": 10.0
386+
},
387+
"patience": {
388+
"type": "int",
389+
"default": 2,
390+
"transform": "transform_power_2_int",
391+
"lower": 2,
392+
"upper": 6
393+
},
394+
"initialization": {
395+
"levels": [
396+
"Default",
397+
"Kaiming",
398+
"Xavier"
399+
],
400+
"type": "factor",
401+
"default": "Default",
402+
"transform": "None",
403+
"core_model_parameter_type": "str",
404+
"lower": 0,
405+
"upper": 2
406+
}
310407
}
311408
}

test/test_csv_dataset.py

Lines changed: 51 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,51 @@
1+
from spotPython.data.csvdataset import CSVDataset
2+
import pytest
3+
import torch
4+
from sklearn.preprocessing import OrdinalEncoder, LabelEncoder
5+
6+
7+
def create_mock_csv(data: str, filename: str):
8+
with open(filename, "w") as f:
9+
f.write(data)
10+
11+
12+
@pytest.fixture
13+
def mock_csv_file(tmp_path):
14+
data = """id,feature1,feature2,feature3,y
15+
1,A,10.1,100.1,positive
16+
2,B,20.2,200.2,negative
17+
3,C,30.3,300.3,positive
18+
4,D,40.4,400.4,negative
19+
5,E,50.5,500.5,positive"""
20+
filename = tmp_path / "data.csv"
21+
create_mock_csv(data, filename)
22+
return filename
23+
24+
25+
def test_csvdataset_remove_na(mock_csv_file):
26+
# Add a row with NA values
27+
data_with_na = """id,feature1,feature2,feature3,y
28+
1,A,10.1,100.1,positive
29+
2,B,20.2,200.2,negative
30+
3,C,30.3,300.3,positive
31+
4,D,,400.4,negative
32+
5,E,50.5,500.5,positive"""
33+
temp_dir = mock_csv_file.parent
34+
filename_na = temp_dir / "data_with_na.csv"
35+
create_mock_csv(data_with_na, filename_na)
36+
37+
dataset = CSVDataset(filename=filename_na, target_column="y", rmNA=True, oe=OrdinalEncoder(), le=LabelEncoder())
38+
assert len(dataset) == 4 # One row with NA should be removed
39+
assert dataset.data.shape[0] == 4 # Four rows left
40+
41+
42+
def test_csvdataset_non_numerical_target(mock_csv_file):
43+
dataset = CSVDataset(
44+
filename=mock_csv_file, target_column="y", target_type=torch.long, oe=OrdinalEncoder(), le=LabelEncoder()
45+
)
46+
assert len(set(dataset.targets.tolist())) == 2 # There should be two unique target classes after label encoding
47+
48+
49+
def test_csvdataset_len(mock_csv_file):
50+
dataset = CSVDataset(filename=mock_csv_file, target_column="y", oe=OrdinalEncoder(), le=LabelEncoder())
51+
assert len(dataset) == 5 # Check the correct length

0 commit comments

Comments
 (0)