Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
62 changes: 58 additions & 4 deletions proeval/README.md
Original file line number Diff line number Diff line change
Expand Up @@ -307,7 +307,61 @@ for i in range(5):

---

## 3. LLMPredictor — LLM Evaluation
## 3. Dataset — Bring Your Own Data

`Dataset` bundles **questions + ground truths + a `DatasetConfig`** in a single
object that the predictor (and, later, the sampler) operates on. Use it whenever
you want to evaluate models on data that isn't already wired into
`DATASET_CONFIGS`.

### Constructors

```python
from proeval import Dataset, DATASET_CONFIGS, LLMPredictor

# (a) Built-in: load one of the 9 datasets shipped with ProEval
ds = Dataset.from_builtin("svamp")

# (b) From in-memory lists (simplest custom case)
ds = Dataset.from_lists(
name="my_yesno",
questions=["Is the sky blue?", "Is fire cold?"],
ground_truths=["yes", "no"],
prompt_template=lambda q: f"{q} Respond JSON: {{'answer': 'yes'|'no'}}",
extract_prediction=lambda d: d["answer"],
extract_ground_truth=lambda gt: str(gt).lower(),
compare_predictions=lambda p, g: 0.0 if str(p).lower() == g else 1.0,
)

# (c) From a CSV file
ds = Dataset.from_csv(
"my_data.csv",
question_col="question",
ground_truth_col="answer",
config=DATASET_CONFIGS["strategyqa"], # reuse an existing config
)
```

If you already have a built-in `DatasetConfig` that fits your scoring needs,
pass it via `config=...` and skip the four eval-function arguments
(`prompt_template`, `extract_prediction`, `extract_ground_truth`, and
`compare_predictions`).

### Predict

```python
predictor = LLMPredictor(model="google/gemma-3-4b-it")

# Either direction works — they're equivalent
results = ds.predict(predictor, parallel=True, workers=10)
results = predictor.predict_dataset(ds, parallel=True, workers=10)
# results: list of (question, ground_truth, raw_response, prediction, score)
```

`Dataset` also supports `len(ds)`, indexing (`ds[i]`), and iteration; indexing
and iteration both yield `(question, ground_truth)` tuples.

---

## 4. LLMPredictor — LLM Evaluation

Evaluate LLMs on supported datasets with structured JSON parsing, retry logic, and parallel batching.

Expand Down Expand Up @@ -445,7 +499,7 @@ csv_mgr.save() # Write DataFrame to CSV

---

## 4. EncoderTrainer — Train a Neural Encoder
## 5. EncoderTrainer — Train a Neural Encoder

Train a neural encoder for cross-benchmark BQ prior (Setting 1).

Expand Down Expand Up @@ -494,7 +548,7 @@ Use `--checkpoint-path path/to/encoder.pth` to specify the exact save location.

---

## 5. Utility Functions
## 6. Utility Functions

### Data Loading

Expand Down Expand Up @@ -540,7 +594,7 @@ resolve_model_name("claude35_sonnet") # → "anthropic/claude-3.5-sonnet"

---

## 6. Experiment CLI Scripts
## 7. Experiment CLI Scripts

All experiment scripts live in `experiment/` and are run as Python modules:

Expand Down
2 changes: 2 additions & 0 deletions proeval/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -41,6 +41,7 @@
from proeval.evaluator import DATASET_CONFIGS, DatasetConfig, LLMPredictor, OpenRouterClient
from proeval.generator import TopicAwareGenerator
from proeval.sampler import BQPriorSampler, BQSampler, SamplingResult
from proeval.utils import Dataset

__all__ = [
"BQPriorSampler",
Expand All @@ -49,6 +50,7 @@
"TopicAwareGenerator",
"LLMPredictor",
"OpenRouterClient",
"Dataset",
"DatasetConfig",
"DATASET_CONFIGS",
]
24 changes: 24 additions & 0 deletions proeval/evaluator/predictor.py
Original file line number Diff line number Diff line change
Expand Up @@ -510,6 +510,30 @@ def predict_batch(
results.append((q, gt, raw, pred if pred is not None else "PARSE_ERROR", score))
return results

def predict_dataset(
    self,
    dataset,
    parallel: bool = True,
    workers: int = 10,
    max_parse_retries: int = 3,
    show_progress: bool = True,
    skip_error: bool = False,
) -> List[Tuple[Any, Any, str, Any, float]]:
    """Evaluate a :class:`~proeval.utils.Dataset` with this predictor.

    This is the predictor-first spelling of ``dataset.predict(predictor)``:
    it performs no work of its own and simply calls ``dataset.predict`` with
    this predictor as the first positional argument and every option below
    forwarded, unchanged, as a keyword argument.

    Args:
        dataset: Object exposing a ``predict(predictor, **options)`` method.
        parallel: Forwarded to ``dataset.predict`` as ``parallel``.
        workers: Forwarded to ``dataset.predict`` as ``workers``.
        max_parse_retries: Forwarded to ``dataset.predict`` as
            ``max_parse_retries``.
        show_progress: Forwarded to ``dataset.predict`` as ``show_progress``.
        skip_error: Forwarded to ``dataset.predict`` as ``skip_error``.

    Returns:
        Whatever ``dataset.predict`` returns; annotated as a list of
        5-tuples.
    """
    # Gather the options once so the delegation below stays a single call.
    forwarded_options = {
        "parallel": parallel,
        "workers": workers,
        "max_parse_retries": max_parse_retries,
        "show_progress": show_progress,
        "skip_error": skip_error,
    }
    return dataset.predict(self, **forwarded_options)

def predict_batch_parallel(
self,
questions: List[str],
Expand Down
5 changes: 4 additions & 1 deletion proeval/utils/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -12,15 +12,17 @@
# See the License for the specific language governing permissions and
# limitations under the License.

"""ProEval utilities — metrics (no plotting).
"""ProEval utilities — Dataset wrapper, metrics.

Public API::

from proeval.utils import Dataset
from proeval.utils import topic_entropy, embedding_coverage, failure_rate
from proeval.utils import compute_samples_to_threshold, print_results_table
from proeval.utils import MODEL_NAME_MAP
"""

from proeval.utils.dataset import Dataset
from proeval.utils.metrics import (
compute_all_metrics,
compute_samples_to_threshold,
Expand All @@ -35,6 +37,7 @@
from proeval.utils.model_names import MODEL_NAME_MAP

__all__ = [
"Dataset",
"topic_entropy",
"embedding_coverage",
"overall_diversity",
Expand Down
Loading