google-deepmind · huangyz0918 · Jun 8, 2026 · Jun 8, 2026
diff --git a/proeval/README.md b/proeval/README.md
@@ -307,7 +307,61 @@ for i in range(5):
 
 ---
 
-## 3. LLMPredictor — LLM Evaluation
+## 3. Dataset — Bring Your Own Data
+
+`Dataset` bundles **questions + ground truths + a `DatasetConfig`** in a single
+object that the predictor (and, later, the sampler) operates on. Use it whenever
+you want to evaluate models on data that isn't already wired into
+`DATASET_CONFIGS`.
+
+### Constructors
+
+```python
+from proeval import Dataset, DATASET_CONFIGS, LLMPredictor
+
+# (a) Built-in: load one of the 9 datasets shipped with ProEval
+ds = Dataset.from_builtin("svamp")
+
+# (b) From in-memory lists (simplest custom case)
+ds = Dataset.from_lists(
+    name="my_yesno",
+    questions=["Is the sky blue?", "Is fire cold?"],
+    ground_truths=["yes", "no"],
+    prompt_template=lambda q: f"{q} Respond JSON: {{'answer': 'yes'|'no'}}",
+    extract_prediction=lambda d: d["answer"],
+    extract_ground_truth=lambda gt: str(gt).lower(),
+    compare_predictions=lambda p, g: 0.0 if str(p).lower() == g else 1.0,
+)
+
+# (c) From a CSV file
+ds = Dataset.from_csv(
+    "my_data.csv",
+    question_col="question",
+    ground_truth_col="answer",
+    config=DATASET_CONFIGS["strategyqa"],   # reuse an existing config
+)
+```
+
+If you already have a built-in `DatasetConfig` that fits your scoring needs,
+pass it via `config=...` and skip the four eval-function arguments
+(`prompt_template`, `extract_prediction`, `extract_ground_truth`,
+`compare_predictions`).
+
+### Predict
+
+```python
+predictor = LLMPredictor(model="google/gemma-3-4b-it")
+
+# Either call works — predictor.predict_dataset(ds) simply delegates to ds.predict(predictor)
+results = ds.predict(predictor, parallel=True, workers=10)
+results = predictor.predict_dataset(ds, parallel=True, workers=10)
+# results: list of (question, ground_truth, raw_response, prediction, score)
+```
+
+`Dataset` also supports `len(ds)`, as well as `ds[i]` and iteration, both of
+which return `(question, ground_truth)` tuples.
+
+---
+
+## 4. LLMPredictor — LLM Evaluation
 
 Evaluate LLMs on supported datasets with structured JSON parsing, retry logic, and parallel batching.
 
@@ -445,7 +499,7 @@ csv_mgr.save()                             # Write DataFrame to CSV
 
 ---
 
-## 4. EncoderTrainer — Train a Neural Encoder
+## 5. EncoderTrainer — Train a Neural Encoder
 
 Train a neural encoder for cross-benchmark BQ prior (Setting 1).
 
@@ -494,7 +548,7 @@ Use `--checkpoint-path path/to/encoder.pth` to specify the exact save location.
 
 ---
 
-## 5. Utility Functions
+## 6. Utility Functions
 
 ### Data Loading
 
@@ -540,7 +594,7 @@ resolve_model_name("claude35_sonnet")  # → "anthropic/claude-3.5-sonnet"
 
 ---
 
-## 6. Experiment CLI Scripts
+## 7. Experiment CLI Scripts
 
 All experiment scripts live in `experiment/` and are run as Python modules:
 

diff --git a/proeval/__init__.py b/proeval/__init__.py
@@ -41,6 +41,7 @@
 from proeval.evaluator import DATASET_CONFIGS, DatasetConfig, LLMPredictor, OpenRouterClient
 from proeval.generator import TopicAwareGenerator
 from proeval.sampler import BQPriorSampler, BQSampler, SamplingResult
+from proeval.utils import Dataset
 
 __all__ = [
     "BQPriorSampler",
@@ -49,6 +50,7 @@
     "TopicAwareGenerator",
     "LLMPredictor",
     "OpenRouterClient",
+    "Dataset",
     "DatasetConfig",
     "DATASET_CONFIGS",
 ]
diff --git a/proeval/evaluator/predictor.py b/proeval/evaluator/predictor.py
@@ -510,6 +510,30 @@ def predict_batch(
             results.append((q, gt, raw, pred if pred is not None else "PARSE_ERROR", score))
         return results
 
+    def predict_dataset(
+        self,
+        dataset,
+        parallel: bool = True,
+        workers: int = 10,
+        max_parse_retries: int = 3,
+        show_progress: bool = True,
+        skip_error: bool = False,
+    ) -> List[Tuple[Any, Any, str, Any, float]]:
+        """Run predictions over a :class:`~proeval.utils.Dataset`.
+
+        Thin convenience wrapper — delegates to
+        :meth:`proeval.utils.Dataset.predict`. Lets callers write
+        ``predictor.predict_dataset(ds)`` instead of ``ds.predict(predictor)``.
+
+        This method contains no prediction logic of its own: it hands this
+        predictor and every option to ``dataset.predict`` and returns the
+        result untouched.
+
+        Args:
+            dataset: Object to evaluate. Only its ``predict`` method is used
+                here; it is called with this predictor as the first positional
+                argument.
+            parallel: Passed through unchanged as ``parallel=``.
+            workers: Passed through unchanged as ``workers=``.
+            max_parse_retries: Passed through unchanged as
+                ``max_parse_retries=``.
+            show_progress: Passed through unchanged as ``show_progress=``.
+            skip_error: Passed through unchanged as ``skip_error=``.
+
+        Returns:
+            Whatever ``dataset.predict`` returns, annotated as a list of
+            5-tuples. NOTE(review): presumably ``(question, ground_truth,
+            raw_response, prediction, score)`` like ``predict_batch`` —
+            confirm against ``Dataset.predict``.
+        """
+        # The predictor goes first positionally; options are forwarded by
+        # keyword so they cannot be misordered against Dataset.predict.
+        return dataset.predict(
+            self,
+            parallel=parallel,
+            workers=workers,
+            max_parse_retries=max_parse_retries,
+            show_progress=show_progress,
+            skip_error=skip_error,
+        )
+
     def predict_batch_parallel(
         self,
         questions: List[str],

diff --git a/proeval/utils/__init__.py b/proeval/utils/__init__.py
@@ -12,15 +12,17 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
 
-"""ProEval utilities — metrics (no plotting).
+"""ProEval utilities — Dataset wrapper, metrics, and model-name map.
 
 Public API::
 
+    from proeval.utils import Dataset
     from proeval.utils import topic_entropy, embedding_coverage, failure_rate
     from proeval.utils import compute_samples_to_threshold, print_results_table
     from proeval.utils import MODEL_NAME_MAP
 """
 
+from proeval.utils.dataset import Dataset
 from proeval.utils.metrics import (
     compute_all_metrics,
     compute_samples_to_threshold,
@@ -35,6 +37,7 @@
 from proeval.utils.model_names import MODEL_NAME_MAP
 
 __all__ = [
+    "Dataset",
     "topic_entropy",
     "embedding_coverage",
     "overall_diversity",