csv and pkl datasets moved to data

bartzbeielstein · bartzbeielstein · commit 6e73687f8b2f · 2023-11-10T16:22:48.000+01:00
diff --git a/MANIFEST.in b/MANIFEST.in
@@ -1,4 +1,3 @@
 include src/spotPython/data/*.csv
 include src/spotPython/data/*.json
-
-
+include src/spotPython/data/*.pkl
diff --git a/notebooks/00_spotPython_tests.ipynb b/notebooks/00_spotPython_tests.ipynb
@@ -137,30 +137,71 @@
     },
     {
       "cell_type": "code",
-      "execution_count": null,
-      "metadata": {},
-      "outputs": [],
+      "execution_count": 1,
+      "metadata": {},
+      "outputs": [
+        {
+          "name": "stdout",
+          "output_type": "stream",
+          "text": [
+            "Loading data from /Users/bartz/miniforge3/envs/spotCondaEnv/lib/python3.11/site-packages/spotPython/data/data.csv\n",
+            "torch.Size([11, 64])\n",
+            "torch.Size([11])\n"
+          ]
+        }
+      ],
       "source": [
-        "from spotPython.light.csvdataset import CSVDataset\n",
-        "dataset = CSVDataset(csv_file='./data/spotPython/data.csv', target_column='prognosis')\n",
+        "from spotPython.data.csvdataset import CSVDataset\n",
+        "# dataset = CSVDataset(csv_file='./data/spotPython/data.csv', target_column='prognosis')\n",
+        "dataset = CSVDataset(target_column='prognosis')\n",
         "print(dataset.data.shape)\n",
         "print(dataset.targets.shape)            "
       ]
     },
     {
       "cell_type": "code",
-      "execution_count": null,
-      "metadata": {},
-      "outputs": [],
+      "execution_count": 5,
+      "metadata": {},
+      "outputs": [
+        {
+          "data": {
+            "text/plain": [
+              "'Split: Train'"
+            ]
+          },
+          "execution_count": 5,
+          "metadata": {},
+          "output_type": "execute_result"
+        }
+      ],
       "source": [
         "dataset.extra_repr()"
       ]
     },
     {
       "cell_type": "code",
-      "execution_count": null,
-      "metadata": {},
-      "outputs": [],
+      "execution_count": 6,
+      "metadata": {},
+      "outputs": [
+        {
+          "name": "stdout",
+          "output_type": "stream",
+          "text": [
+            "Batch Size: 3\n",
+            "---------------\n",
+            "Inputs: tensor([[1, 1, 1, 1, 0, 1, 0, 1, 0, 1, 0, 0, 1, 1, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1,\n",
+            "         1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 1,\n",
+            "         0, 0, 1, 0, 0, 0, 0, 0, 1, 0, 1, 0, 0, 0, 0, 0],\n",
+            "        [0, 1, 0, 0, 0, 1, 1, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 0, 0, 1, 1, 1, 0, 0,\n",
+            "         1, 0, 0, 0, 0, 0, 0, 0, 1, 1, 0, 0, 1, 0, 0, 1, 0, 0, 0, 1, 1, 0, 0, 1,\n",
+            "         0, 0, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0],\n",
+            "        [1, 1, 1, 0, 0, 1, 1, 1, 0, 0, 1, 1, 0, 0, 1, 1, 1, 1, 1, 0, 1, 1, 1, 1,\n",
+            "         1, 0, 1, 1, 1, 0, 1, 0, 1, 0, 1, 1, 0, 1, 1, 0, 0, 0, 1, 0, 1, 1, 1, 0,\n",
+            "         1, 1, 1, 1, 1, 1, 0, 1, 1, 0, 0, 1, 1, 0, 1, 1]])\n",
+            "Targets: tensor([6, 8, 3])\n"
+          ]
+        }
+      ],
       "source": [
         "from torch.utils.data import DataLoader\n",
         "# Set batch size for DataLoader\n",
@@ -203,9 +244,9 @@
       "metadata": {},
       "outputs": [],
       "source": [
-        "from spotPython.light.csvdataset import CSVDataset\n",
+        "from spotPython.data.csvdataset import CSVDataset\n",
         "import torch\n",
-        "dataset = CSVDataset(csv_file='./data/spotPython/data.csv', target_column='prognosis', feature_type=torch.long)"
+        "dataset = CSVDataset(csv_file='data.csv', target_column='prognosis', feature_type=torch.long)"
       ]
     },
     {
@@ -353,35 +394,60 @@
     },
     {
       "cell_type": "code",
-      "execution_count": null,
+      "execution_count": 4,
       "metadata": {},
       "outputs": [],
       "source": [
-        "# from spotPython.light.pkldataset import PKLDataset\n",
-        "# import torch\n",
-        "# dataset = PKLDataset(pkl_file='./data/spotPython/data.pkl', target_column='prognosis', feature_type=torch.long)"
+        "from spotPython.data.pkldataset import PKLDataset\n",
+        "import torch\n",
+        "dataset = PKLDataset(target_column='prognosis', feature_type=torch.long)"
       ]
     },
     {
       "cell_type": "code",
-      "execution_count": null,
-      "metadata": {},
-      "outputs": [],
+      "execution_count": 3,
+      "metadata": {},
+      "outputs": [
+        {
+          "name": "stdout",
+          "output_type": "stream",
+          "text": [
+            "Batch Size: 5\n",
+            "---------------\n",
+            "Inputs: tensor([[1, 0, 0, 0, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0,\n",
+            "         0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,\n",
+            "         0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0],\n",
+            "        [1, 0, 0, 0, 1, 1, 1, 1, 0, 1, 0, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0,\n",
+            "         0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 1, 0, 0, 1, 0, 0, 0, 1, 0, 0,\n",
+            "         0, 0, 0, 0, 1, 0, 1, 0, 0, 0, 0, 0, 0, 1, 0, 0],\n",
+            "        [1, 1, 1, 1, 0, 1, 0, 1, 0, 1, 0, 0, 1, 1, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1,\n",
+            "         1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 1,\n",
+            "         0, 0, 1, 0, 0, 0, 0, 0, 1, 0, 1, 0, 0, 0, 0, 0],\n",
+            "        [1, 1, 0, 1, 1, 0, 0, 0, 0, 0, 0, 0, 1, 0, 1, 0, 0, 0, 1, 0, 1, 0, 1, 0,\n",
+            "         1, 0, 1, 1, 0, 0, 1, 1, 1, 0, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,\n",
+            "         0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0],\n",
+            "        [0, 0, 1, 0, 0, 1, 0, 0, 0, 0, 1, 1, 1, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0,\n",
+            "         0, 1, 0, 0, 1, 0, 0, 0, 1, 1, 1, 0, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0,\n",
+            "         0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]])\n",
+            "Targets: tensor([ 0,  1,  6,  9, 10])\n"
+          ]
+        }
+      ],
       "source": [
-        "# from torch.utils.data import DataLoader\n",
-        "# # Set batch size for DataLoader\n",
-        "# batch_size = 5\n",
-        "# # Create DataLoader\n",
-        "# dataloader = DataLoader(dataset, batch_size=batch_size, shuffle=False)\n",
+        "from torch.utils.data import DataLoader\n",
+        "# Set batch size for DataLoader\n",
+        "batch_size = 5\n",
+        "# Create DataLoader\n",
+        "dataloader = DataLoader(dataset, batch_size=batch_size, shuffle=False)\n",
         "\n",
-        "# # Iterate over the data in the DataLoader\n",
-        "# for batch in dataloader:\n",
-        "#     inputs, targets = batch\n",
-        "#     print(f\"Batch Size: {inputs.size(0)}\")\n",
-        "#     print(\"---------------\")\n",
-        "#     print(f\"Inputs: {inputs}\")\n",
-        "#     print(f\"Targets: {targets}\")\n",
-        "#     break"
+        "# Iterate over the data in the DataLoader\n",
+        "for batch in dataloader:\n",
+        "    inputs, targets = batch\n",
+        "    print(f\"Batch Size: {inputs.size(0)}\")\n",
+        "    print(\"---------------\")\n",
+        "    print(f\"Inputs: {inputs}\")\n",
+        "    print(f\"Targets: {targets}\")\n",
+        "    break"
       ]
     },
     {
diff --git a/pyproject.toml b/pyproject.toml
@@ -7,7 +7,7 @@ build-backend = "setuptools.build_meta"
 
 [project]
 name = "spotPython"
-version = "0.6.37"
+version = "0.6.38"
 authors = [
   { name="T. Bartz-Beielstein", email="tbb@bartzundbartz.de" }
 ]
diff --git a/src/spotPython/data/csvdataset.py b/src/spotPython/data/csvdataset.py
@@ -2,41 +2,82 @@
 import pandas as pd
 from torch.utils.data import Dataset
 from sklearn.preprocessing import LabelEncoder
+import pathlib
 
 
 class CSVDataset(Dataset):
     """
     A PyTorch Dataset for handling CSV data.
 
     Args:
-        csv_file (str): The path to the CSV file. Defaults to "./data/spotPython/data.csv".
+        filename (str): The name of the CSV file; it is joined onto `directory` to build the full path. Defaults to "data.csv".
+        directory (str): The directory that contains the CSV file. Defaults to None, in which case the file is looked up in the directory of this module.
+        feature_type (torch.dtype): The data type of the features. Defaults to torch.float.
+        target_column (str): The name of the target column. Defaults to "y".
+        target_type (torch.dtype): The data type of the targets. Defaults to torch.long.
         train (bool): Whether the dataset is for training or not. Defaults to True.
+        rmNA (bool): Whether to remove rows with NA values or not. Defaults to True.
+        **desc: Additional keyword arguments.
 
     Attributes:
         data (Tensor): The data features.
         targets (Tensor): The data targets.
+
+    Examples:
+        >>> from torch.utils.data import DataLoader
+            from spotPython.data.csvdataset import CSVDataset
+            import torch
+            dataset = CSVDataset(filename='data.csv', target_column='prognosis', feature_type=torch.long)
+            # Set batch size for DataLoader
+            batch_size = 5
+            # Create DataLoader
+            dataloader = DataLoader(dataset, batch_size=batch_size, shuffle=False)
+
+            # Iterate over the data in the DataLoader
+            for batch in dataloader:
+                inputs, targets = batch
+                print(f"Batch Size: {inputs.size(0)}")
+                print("---------------")
+                print(f"Inputs: {inputs}")
+                print(f"Targets: {targets}")
     """
 
     def __init__(
         self,
-        csv_file: str = "./data/spotPython/data.csv",
+        filename: str = "data.csv",
+        directory: None = None,
         feature_type: torch.dtype = torch.float,
         target_column: str = "y",
         target_type: torch.dtype = torch.long,
         train: bool = True,
         rmNA=True,
+        **desc,
     ) -> None:
         super().__init__()
-        self.csv_file = csv_file
+        self.filename = filename
+        self.directory = directory
         self.feature_type = feature_type
         self.target_type = target_type
         self.target_column = target_column
         self.train = train
         self.rmNA = rmNA
         self.data, self.targets = self._load_data()
 
+    @property
+    def path(self):
+        if self.directory:
+            return pathlib.Path(self.directory).joinpath(self.filename)
+        return pathlib.Path(__file__).parent.joinpath(self.filename)
+
+    @property
+    def _repr_content(self):
+        content = super()._repr_content
+        content["Path"] = str(self.path)
+        return content
+
     def _load_data(self) -> tuple:
-        df = pd.read_csv(self.csv_file, index_col=False)
+        print(f"Loading data from {self.path}")
+        df = pd.read_csv(self.path, index_col=False)
         # rm rows with NA
         if self.rmNA:
             df = df.dropna()
@@ -66,7 +107,7 @@ def __getitem__(self, idx: int) -> tuple:
 
         Examples:
             >>> from spotPython.light.csvdataset import CSVDataset
-                dataset = CSVDataset(csv_file='./data/spotPython/data.csv', target_column='prognosis')
+                dataset = CSVDataset(filename='data.csv', target_column='prognosis')
                 print(dataset.data.shape)
                 print(dataset.targets.shape)
                 torch.Size([11, 65])
diff --git a/src/spotPython/data/data.csv b/src/spotPython/data/data.csv
@@ -0,0 +1,12 @@
+sudden_fever,headache,mouth_bleed,nose_bleed,muscle_pain,joint_pain,vomiting,rash,diarrhea,hypotension,pleural_effusion,ascites,gastro_bleeding,swelling,nausea,chills,myalgia,digestion_trouble,fatigue,skin_lesions,stomach_pain,orbital_pain,neck_pain,weakness,back_pain,weight_loss,gum_bleed,jaundice,coma,diziness,inflammation,red_eyes,loss_of_appetite,urination_loss,slow_heart_rate,abdominal_pain,light_sensitivity,yellow_skin,yellow_eyes,facial_distortion,microcephaly,rigor,bitter_tongue,convulsion,anemia,cocacola_urine,hypoglycemia,prostraction,hyperpyrexia,stiff_neck,irritability,confusion,tremor,paralysis,lymph_swells,breathing_restriction,toe_inflammation,finger_inflammation,lips_irritation,itchiness,ulcers,toenail_loss,speech_problem,bullseye_rash,prognosis
+1,0,0,0,1,1,1,1,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,Chikungunya
+1,0,0,0,1,1,1,1,0,1,0,1,1,1,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,1,0,0,1,0,0,0,1,0,0,0,0,0,0,1,0,1,0,0,0,0,0,0,1,0,0,Dengue
+1,1,1,1,0,1,0,1,0,1,0,0,1,1,0,0,0,0,1,1,1,1,1,1,1,1,1,1,1,1,0,0,0,0,0,1,0,0,0,0,0,0,1,0,0,0,0,1,0,0,1,0,0,0,0,0,1,0,1,0,0,0,0,0,Rift Valley fever
+1,1,0,1,1,0,0,0,0,0,0,0,1,0,1,0,0,0,1,0,1,0,1,0,1,0,1,1,0,0,1,1,1,0,1,1,1,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,Yellow Fever
+0,0,1,0,0,1,0,0,0,0,1,1,1,0,0,0,0,0,0,1,0,0,0,0,0,1,0,0,1,0,0,0,1,1,1,0,1,1,1,1,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,Zika
+1,1,0,1,1,0,0,1,1,1,1,1,1,0,0,0,0,0,0,1,0,1,0,1,0,1,0,0,0,0,1,1,0,1,1,0,1,1,0,1,1,1,1,1,1,1,1,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,Malaria
+0,0,0,1,1,1,1,0,0,0,0,0,1,0,1,1,1,0,1,0,1,0,0,1,0,0,1,0,1,1,1,0,0,0,0,1,1,0,1,0,0,1,0,0,0,1,0,0,0,0,1,0,1,0,0,0,0,0,0,0,0,0,0,0,Japanese encephalitis
+0,1,0,0,0,1,1,0,0,0,0,0,0,0,1,1,1,0,0,1,1,1,0,0,1,0,0,0,0,0,0,0,1,1,0,0,1,0,0,1,0,0,0,1,1,0,0,1,0,0,1,1,1,1,1,0,0,0,0,0,0,0,0,0,West Nile fever
+1,1,1,1,1,0,1,1,0,1,1,0,1,1,1,1,1,0,1,0,1,1,1,1,1,1,1,1,1,1,1,0,1,0,1,1,0,1,1,1,1,0,1,0,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,0,0,0,0,0,Plague
+0,1,0,0,0,0,0,1,1,0,0,0,0,1,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,1,0,0,1,0,0,0,1,0,0,0,0,0,1,0,0,0,0,1,1,1,0,0,Tungiasis
+1,1,1,0,0,1,1,1,0,0,1,1,0,0,1,1,1,1,1,0,1,1,1,1,1,0,1,1,1,0,1,0,1,0,1,1,0,1,1,0,0,0,1,0,1,1,1,0,1,1,1,1,1,1,0,1,1,0,0,1,1,0,1,1,Lyme disease
diff --git a/src/spotPython/data/pkldataset.py b/src/spotPython/data/pkldataset.py
@@ -2,41 +2,76 @@
 import pandas as pd
 from torch.utils.data import Dataset
 from sklearn.preprocessing import LabelEncoder
+import pathlib
 
 
 class PKLDataset(Dataset):
     """
     A PyTorch Dataset for handling pickle (*.pkl) data.
 
     Args:
-        pkl_file (str): The path to the pkl file. Defaults to "./data/spotPython/data.pkl".
+        filename (str): The name of the pkl file, looked up in `directory` or, if that is None, in the directory of this module. Defaults to "data.pkl".
         train (bool): Whether the dataset is for training or not. Defaults to True.
 
     Attributes:
         data (Tensor): The data features.
         targets (Tensor): The data targets.
+
+    Examples:
+        >>> from spotPython.data.pkldataset import PKLDataset
+            import torch
+            from torch.utils.data import DataLoader
+            dataset = PKLDataset(target_column='prognosis', feature_type=torch.long)
+            # Set batch size for DataLoader
+            batch_size = 5
+            # Create DataLoader
+            dataloader = DataLoader(dataset, batch_size=batch_size, shuffle=False)
+
+            # Iterate over the data in the DataLoader
+            for batch in dataloader:
+                inputs, targets = batch
+                print(f"Batch Size: {inputs.size(0)}")
+                print("---------------")
+                print(f"Inputs: {inputs}")
+                print(f"Targets: {targets}")
+                break
     """
 
     def __init__(
         self,
-        pkl_file: str = "./data/spotPython/data.pkl",
+        filename: str = "data.pkl",
+        directory: None = None,
         feature_type: torch.dtype = torch.float,
         target_column: str = "y",
         target_type: torch.dtype = torch.long,
         train: bool = True,
         rmNA=True,
+        **desc,
     ) -> None:
         super().__init__()
-        self.pkl_file = pkl_file
+        self.filename = filename
+        self.directory = directory
         self.feature_type = feature_type
         self.target_type = target_type
         self.target_column = target_column
         self.train = train
         self.rmNA = rmNA
         self.data, self.targets = self._load_data()
 
+    @property
+    def path(self):
+        if self.directory:
+            return pathlib.Path(self.directory).joinpath(self.filename)
+        return pathlib.Path(__file__).parent.joinpath(self.filename)
+
+    @property
+    def _repr_content(self):
+        content = super()._repr_content
+        content["Path"] = str(self.path)
+        return content
+
     def _load_data(self) -> tuple:
-        with open(self.pkl_file, "rb") as f:
+        with open(self.path, "rb") as f:
             df = pd.read_pickle(f)
         # rm rows with NA
         if self.rmNA:
@@ -67,7 +102,7 @@ def __getitem__(self, idx: int) -> tuple:
 
         Examples:
             >>> from spotPython.light.pkldataset import pklDataset
-                dataset = pklDataset(pkl_file='./data/spotPython/data.pkl', target_column='prognosis')
+                dataset = pklDataset(filename='data.pkl', target_column='prognosis')
                 print(dataset.data.shape)
                 print(dataset.targets.shape)
                 torch.Size([11, 65])
diff --git a/test/test_csvdataset.py b/test/test_csvdataset.py
diff --git a/test/test_pkldataset.py b/test/test_pkldataset.py

Original file line number	Diff line number	Diff line change
`@@ -7,7 +7,7 @@ build-backend = "setuptools.build_meta"`
`7`	`7`
`8`	`8`	`[project]`
`9`	`9`	`name = "spotPython"`
`10`		`-version = "0.6.37"`
	`10`	`+version = "0.6.38"`
`11`	`11`	`authors = [`
`12`	`12`	`{ name="T. Bartz-Beielstein", email="tbb@bartzundbartz.de" }`
`13`	`13`	`]`