Skip to content

Commit 54489cf

Browse files
committed
update scaler
1 parent c585668 commit 54489cf

2 files changed

Lines changed: 69 additions & 29 deletions

File tree

notebooks/00_spotPython_tests.ipynb

Lines changed: 58 additions & 20 deletions
Original file line numberDiff line numberDiff line change
@@ -3793,25 +3793,52 @@
37933793
},
37943794
{
37953795
"cell_type": "code",
3796-
"execution_count": null,
3796+
"execution_count": 1,
37973797
"metadata": {},
3798-
"outputs": [],
3798+
"outputs": [
3799+
{
3800+
"name": "stdout",
3801+
"output_type": "stream",
3802+
"text": [
3803+
"LightDataModule.setup(): stage: None\n",
3804+
"train_size: 0.25, val_size: 0.25 used for train & val data.\n",
3805+
"test_size: 0.5 used for test dataset.\n",
3806+
"test_size: 0.5 used for predict dataset.\n",
3807+
"Training set size: 3\n"
3808+
]
3809+
}
3810+
],
37993811
"source": [
38003812
"from spotPython.data.lightdatamodule import LightDataModule\n",
38013813
"from spotPython.data.csvdataset import CSVDataset\n",
38023814
"from spotPython.data.pkldataset import PKLDataset\n",
3815+
"from spotPython.utils.scaler import TorchStandardScaler\n",
38033816
"import torch\n",
3804-
"dataset = CSVDataset(csv_file='data.csv', target_column='prognosis', feature_type=torch.long)\n",
3805-
"data_module = LightDataModule(dataset=dataset, batch_size=5, test_size=0.5)\n",
3817+
"\n",
3818+
"scaler=TorchStandardScaler()\n",
3819+
"\n",
3820+
"dataset = CSVDataset(csv_file='data.csv', target_column='prognosis', feature_type=torch.float64)\n",
3821+
"data_module = LightDataModule(dataset=dataset, batch_size=5, test_size=0.5, scaler=scaler)\n",
38063822
"data_module.setup()\n",
38073823
"print(f\"Training set size: {len(data_module.data_train)}\")\n"
38083824
]
38093825
},
38103826
{
38113827
"cell_type": "code",
3812-
"execution_count": null,
3828+
"execution_count": 2,
38133829
"metadata": {},
3814-
"outputs": [],
3830+
"outputs": [
3831+
{
3832+
"data": {
3833+
"text/plain": [
3834+
"0.19878798965729408"
3835+
]
3836+
},
3837+
"execution_count": 2,
3838+
"metadata": {},
3839+
"output_type": "execute_result"
3840+
}
3841+
],
38153842
"source": [
38163843
"from sklearn.datasets import load_diabetes\n",
38173844
"diabetes = load_diabetes()\n",
@@ -3821,9 +3848,21 @@
38213848
},
38223849
{
38233850
"cell_type": "code",
3824-
"execution_count": null,
3851+
"execution_count": 4,
38253852
"metadata": {},
3826-
"outputs": [],
3853+
"outputs": [
3854+
{
3855+
"name": "stdout",
3856+
"output_type": "stream",
3857+
"text": [
3858+
"Batch Size: 1\n",
3859+
"---------------\n",
3860+
"Inputs: tensor([[ 0.0381, 0.0507, 0.0617, 0.0219, -0.0442, -0.0348, -0.0434, -0.0026,\n",
3861+
" 0.0199, -0.0176]])\n",
3862+
"Targets: tensor([151.])\n"
3863+
]
3864+
}
3865+
],
38273866
"source": [
38283867
"from spotPython.data.lightdatamodule import LightDataModule\n",
38293868
"from spotPython.data.csvdataset import CSVDataset\n",
@@ -3908,7 +3947,7 @@
39083947
},
39093948
{
39103949
"cell_type": "code",
3911-
"execution_count": 1,
3950+
"execution_count": 6,
39123951
"metadata": {},
39133952
"outputs": [
39143953
{
@@ -3923,15 +3962,14 @@
39233962
"Validation set size: 5160\n",
39243963
"Test set size: 10320\n",
39253964
"LightDataModule.train_dataloader(). data_train size: 5160\n",
3926-
"[tensor([[ 5.6063e+00, 1.6000e+01, 6.4174e+00, 9.6957e-01, 1.5250e+03,\n",
3927-
" 3.3152e+00, 3.7450e+01, -1.2190e+02],\n",
3928-
" [ 3.3462e+00, 3.4000e+01, 3.9503e+00, 9.8619e-01, 8.0500e+02,\n",
3929-
" 2.2238e+00, 3.4020e+01, -1.1841e+02]]), tensor([3.2050, 3.0700])]\n",
3965+
"[tensor([[-0.2677, -0.2508, -0.2664, -0.2752, 2.1991, -0.2714, -0.2160, -0.4747],\n",
3966+
" [-0.2714, -0.2216, -0.2704, -0.2752, 1.0301, -0.2732, -0.2216, -0.4690]],\n",
3967+
" grad_fn=<StackBackward0>), tensor([3.2050, 3.0700])]\n",
39303968
"LightDataModule.train_dataloader(). data_train size: 5160\n",
3931-
"[[ 5.6062999e+00 1.6000000e+01 6.4173913e+00 9.6956521e-01\n",
3932-
" 1.5250000e+03 3.3152175e+00 3.7450001e+01 -1.2190000e+02]\n",
3933-
" [ 3.3462000e+00 3.4000000e+01 3.9502761e+00 9.8618782e-01\n",
3934-
" 8.0500000e+02 2.2237568e+00 3.4020000e+01 -1.1841000e+02]]\n"
3969+
"[[-0.267703 -0.25082865 -0.26638618 -0.2752308 2.1990557 -0.2714226\n",
3970+
" -0.21600425 -0.47471142]\n",
3971+
" [-0.2713723 -0.22160538 -0.27039158 -0.2752038 1.0301248 -0.27319458\n",
3972+
" -0.2215729 -0.46904534]]\n"
39353973
]
39363974
}
39373975
],
@@ -3940,15 +3978,15 @@
39403978
"from spotPython.data.california_housing import CaliforniaHousing\n",
39413979
"import torch\n",
39423980
"dataset = CaliforniaHousing(feature_type=torch.float32, target_type=torch.float32)\n",
3943-
"data_module = LightDataModule(dataset=dataset, batch_size=2, test_size=0.5)\n",
3981+
"data_module = LightDataModule(dataset=dataset, batch_size=2, test_size=0.5, scaler=scaler)\n",
39443982
"data_module.setup()\n",
39453983
"print(f\"Training set size: {len(data_module.data_train)}\")\n",
39463984
"print(f\"Validation set size: {len(data_module.data_val)}\")\n",
39473985
"print(f\"Test set size: {len(data_module.data_test)}\")\n",
39483986
"# print the first batch of the training set from data_module.data_train\n",
39493987
"print(next(iter(data_module.train_dataloader())))\n",
39503988
"# print the first batch of the training set from data_module.data_train as a numpy array\n",
3951-
"print(next(iter(data_module.train_dataloader()))[0].numpy())\n"
3989+
"print(next(iter(data_module.train_dataloader()))[0].detach().numpy())\n"
39523990
]
39533991
},
39543992
{
@@ -4050,7 +4088,7 @@
40504088
"name": "python",
40514089
"nbconvert_exporter": "python",
40524090
"pygments_lexer": "ipython3",
4053-
"version": "3.11.7"
4091+
"version": "3.11.8"
40544092
}
40554093
},
40564094
"nbformat": 4,

src/spotPython/data/lightdatamodule.py

Lines changed: 11 additions & 9 deletions
Original file line numberDiff line numberDiff line change
@@ -54,10 +54,12 @@ class LightDataModule(L.LightningDataModule):
5454
Examples:
5555
>>> from spotPython.data.lightdatamodule import LightDataModule
5656
from spotPython.data.csvdataset import CSVDataset
57+
from spotPython.utils.scaler import TorchStandardScaler
5758
import torch
5859
# data.csv is simple csv file with 11 samples
5960
dataset = CSVDataset(csv_file='data.csv', target_column='prognosis', feature_type=torch.long)
60-
data_module = LightDataModule(dataset=dataset, batch_size=5, test_size=0.5)
61+
scaler = TorchStandardScaler()
62+
data_module = LightDataModule(dataset=dataset, batch_size=5, test_size=0.5, scaler=scaler)
6163
data_module.setup()
6264
print(f"Training set size: {len(data_module.data_train)}")
6365
print(f"Validation set size: {len(data_module.data_val)}")
@@ -152,13 +154,13 @@ def setup(self, stage: Optional[str] = None) -> None:
152154
train_val_data = torch.cat([self.data_train[i][0] for i in range(len(self.data_train))])
153155
self.scaler.fit(train_val_data)
154156
self.data_train = [(self.scaler.transform(data), target) for data, target in self.data_train]
155-
data_tensors_train = [torch.tensor(data, dtype=torch.float32) for data, target in self.data_train]
156-
target_tensors_train = [torch.tensor(target, dtype=torch.float32) for data, target in self.data_train]
157+
data_tensors_train = [data.clone().detach().requires_grad_(True) for data, target in self.data_train]
158+
target_tensors_train = [target.clone().detach() for data, target in self.data_train]
157159
self.data_train = TensorDataset(torch.stack(data_tensors_train), torch.stack(target_tensors_train))
158160
#print(self.data_train)
159161
self.data_val = [(self.scaler.transform(data), target) for data, target in self.data_val]
160-
data_tensors_val = [torch.tensor(data, dtype=torch.float32) for data, target in self.data_val]
161-
target_tensors_val = [torch.tensor(target, dtype=torch.float32) for data, target in self.data_val]
162+
data_tensors_val = [data.clone().detach().requires_grad_(True) for data, target in self.data_val]
163+
target_tensors_val = [target.clone().detach() for data, target in self.data_val]
162164
self.data_val = TensorDataset(torch.stack(data_tensors_val), torch.stack(target_tensors_val))
163165

164166
# Assign test dataset for use in dataloader(s)
@@ -169,8 +171,8 @@ def setup(self, stage: Optional[str] = None) -> None:
169171
self.data_test, _ = random_split(self.data_full, [test_size, full_train_size], generator=generator_test)
170172
if self.scaler is not None:
171173
self.data_test = [(self.scaler.transform(data), target) for data, target in self.data_test]
172-
data_tensors_test = [torch.tensor(data, dtype=torch.float32) for data, target in self.data_test]
173-
target_tensors_test = [torch.tensor(target, dtype=torch.float32) for data, target in self.data_test]
174+
data_tensors_test = [data.clone().detach().requires_grad_(True) for data, target in self.data_test]
175+
target_tensors_test = [target.clone().detach() for data, target in self.data_test]
174176
self.data_test = TensorDataset(torch.stack(data_tensors_test), torch.stack(target_tensors_test))
175177

176178
# if stage == "predict" or stage is None:
@@ -192,8 +194,8 @@ def setup(self, stage: Optional[str] = None) -> None:
192194
)
193195
if self.scaler is not None:
194196
self.data_predict = [(self.scaler.transform(data), target) for data, target in self.data_predict]
195-
data_tensors_predict= [torch.tensor(data, dtype=torch.float32) for data, target in self.data_predict]
196-
target_tensors_predict = [torch.tensor(target, dtype=torch.float32) for data, target in self.data_predict]
197+
data_tensors_predict= [data.clone().detach().requires_grad_(True) for data, target in self.data_predict]
198+
target_tensors_predict = [target.clone().detach() for data, target in self.data_predict]
197199
self.data_predict = TensorDataset(torch.stack(data_tensors_predict), torch.stack(target_tensors_predict))
198200

199201
def train_dataloader(self) -> DataLoader:

0 commit comments

Comments
 (0)