From 184a3d96287a75b0748c161114b2a980b741deb6 Mon Sep 17 00:00:00 2001 From: bartzbeielstein <32470350+bartzbeielstein@users.noreply.github.com> Date: Mon, 8 Jun 2026 00:08:42 +0200 Subject: [PATCH] feat(configurator): convert ConfigMulti and ConfigEntsoe to dataclasses @dataclass with field(default_factory=...) for the mutable/Timedelta defaults and __post_init__ -> validate_config(). _PARAM_NAMES is now derived from dataclasses.fields() instead of a hand-maintained tuple, removing the field-vs-param drift hazard. get_params()/set_params() surface (periods__* deep notation, ValueError on unknown params, round-trip stability) unchanged; only __eq__/__repr__ are newly auto-generated -> minor bump. Regenerated config reference pages. Refs: GitLab #4 / Option 3 (RunState migration ADR) Co-Authored-By: Claude Opus 4.8 (1M context) --- ...onfigurator.config_entsoe.ConfigEntsoe.qmd | 130 +++--- .../configurator.config_multi.ConfigMulti.qmd | 104 ++--- .../configurator/_base_config.py | 6 +- .../configurator/config_entsoe.py | 431 ++++++------------ .../configurator/config_multi.py | 429 ++++++----------- 5 files changed, 375 insertions(+), 725 deletions(-) diff --git a/docs/reference/configurator.config_entsoe.ConfigEntsoe.qmd b/docs/reference/configurator.config_entsoe.ConfigEntsoe.qmd index ea56a48e..4d8bbbc4 100644 --- a/docs/reference/configurator.config_entsoe.ConfigEntsoe.qmd +++ b/docs/reference/configurator.config_entsoe.ConfigEntsoe.qmd @@ -3,11 +3,11 @@ ```python configurator.config_entsoe.ConfigEntsoe( country_code='DE', - periods=None, - lags_consider=None, - train_size=None, + periods=default_periods(), + lags_consider=(lambda: list(range(1, 24)))(), + train_size=(lambda: pd.Timedelta(days=(3 * 365)))(), end_train_default='2025-12-31 00:00+00:00', - delta_val=None, + delta_val=(lambda: pd.Timedelta(hours=(24 * 7 * 10)))(), predict_size=24, cv_block_size=None, refit_size=7, @@ -59,7 +59,7 @@ configurator.config_entsoe.ConfigEntsoe( exog_max_gap_hours=0, exog_max_tail_gap_hours=0, exog_provider_window='full', - retrain_max_age=None, + retrain_max_age=(lambda: pd.Timedelta(days=7))(), target_qc_range_mw=None, target_qc_step_mw=None, target_qc_window_days=None, @@ -84,66 +84,66 @@ protocol. ## Parameters {.doc-section .doc-section-parameters} -| Name | Type | Description | Default | -|------------------------------------|------------------------------------------------------------------------------------------------------|------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------|-----------------------------| -| country_code | [str](`str`) | ISO 3166-1 alpha-2 country code (e.g. ``"DE"``). | `'DE'` | -| periods | [Optional](`typing.Optional`)\[[List](`typing.List`)\[[Period](`spotforecast2_safe.data.Period`)\]\] | Cyclical feature encodings. | `None` | -| lags_consider | [Optional](`typing.Optional`)\[[List](`typing.List`)\[[int](`int`)\]\] | Lag values for autoregressive features. | `None` | -| train_size | [Optional](`typing.Optional`)\[[pd](`pandas`).[Timedelta](`pandas.Timedelta`)\] | Training window. | `None` | -| end_train_default | [str](`str`) | Default end-of-training timestamp (ISO). | `'2025-12-31 00:00+00:00'` | -| delta_val | [Optional](`typing.Optional`)\[[pd](`pandas`).[Timedelta](`pandas.Timedelta`)\] | Validation window. | `None` | -| predict_size | [int](`int`) | Prediction horizon in hours. | `24` | -| cv_block_size | [int](`int`) \| None | Cross-validation test-block width in hours. Defaults to ``None``, meaning the CV uses ``predict_size``. Set to a fixed value (e.g. ``24``) to decouple the cross-validation horizon from a render-dependent live ``predict_size``. | `None` | -| refit_size | [int](`int`) | Refit cadence in days. | `7` | -| random_state | [int](`int`) | Random seed. | `314159` | -| n_hyperparameters_trials | [int](`int`) | Hyperparameter-tuning trial budget. | `20` | -| data_filename | [str](`str`) | Path to the merged interim CSV. | `'interim/energy_load.csv'` | -| targets | [Optional](`typing.Optional`)\[[List](`typing.List`)\[[str](`str`)\]\] | Active target column names. ``None`` until set after data loading. For ENTSO-E this is typically ``["Actual Load"]``. | `None` | -| use_outlier_detection | [bool](`bool`) | Apply IsolationForest-based outlier removal. Defaults to ``True``. | `True` | -| contamination | [float](`float`) | IsolationForest contamination fraction. | `0.01` | -| imputation_method | [str](`str`) | Gap-filling strategy. | `'weighted'` | -| window_size | [int](`int`) | Rolling window for weighted imputation. Also the LightGBM rolling-mean feature window in the ENTSO-E factories. | `72` | -| imputation_window_size | [Optional](`typing.Optional`)\[[int](`int`)\] | Width of the gap-penalty zone (in hours) around each imputed value for the ``"weighted"`` strategy. When ``None`` (default), falls back to ``window_size``, so existing behaviour is unchanged. Set this to decouple the imputation penalty zone from the rolling-feature window. | `None` | -| use_exogenous_features | [bool](`bool`) | Build weather/calendar/holiday features. | `True` | -| latitude | [float](`float`) | Location latitude. | `51.5136` | -| longitude | [float](`float`) | Location longitude. | `7.4653` | -| timezone | [str](`str`) | IANA timezone string. | `'UTC'` | -| state | [str](`str`) | Subdivision code for regional holidays. | `'NW'` | -| include_weather_windows | [bool](`bool`) | Weather-window feature toggle. | `False` | -| include_holiday_features | [bool](`bool`) | Holiday feature toggle. | `False` | -| include_holiday_adjacency_features | [bool](`bool`) | Brückentag and before/after-holiday indicator toggle. Defaults to ``False``. | `False` | -| poly_features_degree | [int](`int`) | Polynomial-interaction degree passed to the feature builder. ``1`` (default) generates no interactions; ``2`` adds pairwise bilinear terms; ``3+`` higher order. | `1` | -| max_poly_features | [int](`int`) | Cap on polynomial interaction columns. When more than this many ``poly_*`` columns are generated, only the top ``max_poly_features`` ranked by mutual information with the target are kept (``<= 0`` disables the cap). Defaults to ``10``. | `10` | -| poly_mi_n_jobs | [Optional](`typing.Optional`)\[[int](`int`)\] | Parallel jobs for the mutual-information ranking that enforces ``max_poly_features``. ``-1`` (default) uses all cores; ``None`` runs single-threaded. Parallelism does not change the selection. | `-1` | -| poly_mi_sample_size | [Optional](`typing.Optional`)\[[int](`int`)\] | Row cap for that ranking; longer series are scored on a reproducible random subsample of this size (seeded by ``random_state``), which can change which borderline columns make the top K. ``None`` scores every row (the pre-15.8 behaviour). Defaults to ``4000``. | `4000` | -| include_covid_infection_rate | [bool](`bool`) | Append the bundled German national COVID-19 7-day incidence (RKI) as an exogenous level regressor. Defaults to ``False``. | `False` | -| include_entsoe_forecast_load | [bool](`bool`) | Append the ENTSO-E day-ahead Forecasted Load as a near-oracle exogenous prior. Defaults to ``False``. | `False` | -| include_entsoe_renewable_forecast | [bool](`bool`) | Append the ENTSO-E day-ahead wind and solar generation forecast. Defaults to ``False``. | `False` | -| include_entsoe_net_load | [bool](`bool`) | Append the ENTSO-E day-ahead net load (Forecasted Load minus wind/solar forecast). Defaults to ``False``. | `False` | -| include_entsoe_day_ahead_price | [bool](`bool`) | Append the ENTSO-E day-ahead spot price (DE/LU). Defaults to ``False``. | `False` | -| index_name | [str](`str`) | Datetime column name when the DataFrame index is reset. ENTSO-E CSVs use ``"Time (UTC)"``; defaults to that. | `'Time (UTC)'` | -| bounds | [Optional](`typing.Optional`)\[[List](`typing.List`)\[[tuple](`tuple`)\]\] | Per-column outlier bounds. For single-target ENTSO-E this is typically ``None`` or a single ``[(lower, upper)]`` entry. | `None` | -| verbose | [bool](`bool`) | Verbose pipeline output. | `False` | -| cache_home | [Optional](`typing.Optional`)\[[Any](`typing.Any`)\] | Cache directory override. | `None` | -| n_trials_optuna | [int](`int`) | Optuna Bayesian-search trial budget. | `15` | -| n_trials_spotoptim | [int](`int`) | SpotOptim surrogate-search trial budget. | `10` | -| n_initial_spotoptim | [int](`int`) | SpotOptim initial random evaluations. | `5` | -| n_jobs_spotoptim | [Optional](`typing.Optional`)\[[int](`int`)\] | Worker count for SpotOptim's parallel (steady-state) evaluation. ``None`` (default) runs sequentially; ``-1`` uses all CPU cores; a positive integer pins the worker count. Parallel tuning is faster but, being steady-state, changes the search trajectory, so the tuned result is not bit-identical to a sequential run even with a fixed ``random_state``. | `None` | -| warm_start_lags | [bool](`bool`) | Seed the SpotOptim search with ``lags_consider``. | `False` | -| task | [str](`str`) | Active prediction task name. | `'lazy'` | -| agg_weights | [Optional](`typing.Optional`)\[[List](`typing.List`)\[[float](`float`)\]\] | Per-target aggregation weights. For single-target use this is typically ``[1.0]`` or ``None``. | `None` | -| forecaster_factory | [Optional](`typing.Optional`)\[[Any](`typing.Any`)\] | Callable ``factory(config, *, weight_func, target) -> forecaster`` consumed by ``BaseTask.create_forecaster``. ``None`` falls back to the default LightGBM factory. | `None` | -| data_loader | [Optional](`typing.Optional`)\[[Any](`typing.Any`)\] | Callable ``data_loader(config)`` returning a pandas DataFrame. Invoked by ``BaseTask.prepare_data`` when no DataFrame is supplied — the ENTSO-E pipeline hook for ``download_new_data`` / ``merge_build_manual``. | `None` | -| test_data_loader | [Optional](`typing.Optional`)\[[Any](`typing.Any`)\] | Callable ``test_data_loader(config)`` returning a pandas DataFrame with ground-truth values for the prediction horizon. Invoked by ``BaseTask.prepare_data`` when no test DataFrame is supplied; the returned frame populates ``test_actual`` and ``metrics_future`` in the prediction package. | `None` | -| auto_save_models | [bool](`bool`) | Whether ``BaseTask._run_strategy`` should persist fitted forecasters to ``/models/`` after every training run. Defaults to ``True``. | `True` | -| data_frame_name | [str](`str`) | Identifier for the active dataset. Used by ``BaseTask`` to name cache subdirectories, model files, and the per-dataset log file. Defaults to ``"default"``. | `'default'` | -| number_folds | [int](`int`) | Number of folds used by ``BaseTask.cv_ts`` when building the ``TimeSeriesSplit`` cross-validation splitter for tuning tasks. Defaults to ``10``. | `10` | -| on_weather_failure | [Literal](`typing.Literal`)\[\'raise\', \'skip\'\] | Policy for handling Open-Meteo fetch failures inside ``BaseTask.build_exogenous_features``. ``"raise"`` (default) aborts the pipeline with a ``WeatherFetchError`` and preserves the safety-critical fail-safe semantics. ``"skip"`` logs a warning and continues with empty weather features so the rest of the pipeline can run without the Open-Meteo dependency. | `'raise'` | -| on_exog_provider_failure | [Literal](`typing.Literal`)\[\'raise\', \'skip\'\] | Policy for an exogenous-provider failure inside ``ExogBuilder.build``. ``"raise"`` (default) propagates the ``ExogProviderError`` (fail-safe); ``"skip"`` logs a warning and omits that provider's columns. | `'raise'` | -| exog_max_gap_hours | [int](`int`) | Maximum length, in hours, of a contiguous run of missing exogenous-provider values healed before the provider is rejected. Interior gaps are time-interpolated; leading/trailing edge gaps are back-/forward-filled. ``0`` (default) keeps the strict fail-safe (any gap raises). Healed runs are logged with count and span. Only already-published day-ahead vintages are involved, so healing is leakage-clean (CR-3). | `0` | -| exog_max_tail_gap_hours | [int](`int`) | Extended healing budget, in hours, applied exclusively to the trailing-edge NaN run (the run containing the last index timestamp). The effective tail budget is ``max(exog_max_gap_hours, exog_max_tail_gap_hours)``. The canonical use case is the ENTSO-E day-ahead publication frontier: the last published vintage is zero-order-held forward to the forecast horizon without touching interior gaps (CR-3-clean). When ``exog_max_tail_gap_hours <= exog_max_gap_hours`` the parameter is inert (the interior budget already covers the tail) and a warning is logged. Defaults to ``0``. | `0` | -| exog_provider_window | [Literal](`typing.Literal`)\[\'full\', \'train\'\] | Span the exogenous providers are validated against. ``"full"`` (default) requires coverage of the entire ``data_start``→``cov_end`` request, matching prior behaviour. ``"train"`` validates only the consumed window ``[start_train_ts, cov_end]``, tolerating missing values before the training window. Honoured by the MultiTask pipeline; the forecaster-wrapper path currently always validates the full span. | `'full'` | -| retrain_max_age | [pd](`pandas`).[Timedelta](`pandas.Timedelta`) | Maximum age of a previously trained model before retraining is required. Consumed by ``spotforecast2_safe.manager.trainer.should_retrain`` to gate scheduled retraining workflows. Defaults to ``Timedelta(days=7)``. | `None` | +| Name | Type | Description | Default | +|------------------------------------|------------------------------------------------------------------------------------------------------|------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------|-------------------------------------------------| +| country_code | [str](`str`) | ISO 3166-1 alpha-2 country code (e.g. ``"DE"``). | `'DE'` | +| periods | [Optional](`typing.Optional`)\[[List](`typing.List`)\[[Period](`spotforecast2_safe.data.Period`)\]\] | Cyclical feature encodings. | `default_periods()` | +| lags_consider | [Optional](`typing.Optional`)\[[List](`typing.List`)\[[int](`int`)\]\] | Lag values for autoregressive features. | `(lambda: list(range(1, 24)))()` | +| train_size | [Optional](`typing.Optional`)\[[pd](`pandas`).[Timedelta](`pandas.Timedelta`)\] | Training window. | `(lambda: pd.Timedelta(days=(3 * 365)))()` | +| end_train_default | [str](`str`) | Default end-of-training timestamp (ISO). | `'2025-12-31 00:00+00:00'` | +| delta_val | [Optional](`typing.Optional`)\[[pd](`pandas`).[Timedelta](`pandas.Timedelta`)\] | Validation window. | `(lambda: pd.Timedelta(hours=(24 * 7 * 10)))()` | +| predict_size | [int](`int`) | Prediction horizon in hours. | `24` | +| cv_block_size | [int](`int`) \| None | Cross-validation test-block width in hours. Defaults to ``None``, meaning the CV uses ``predict_size``. Set to a fixed value (e.g. ``24``) to decouple the cross-validation horizon from a render-dependent live ``predict_size``. | `None` | +| refit_size | [int](`int`) | Refit cadence in days. | `7` | +| random_state | [int](`int`) | Random seed. | `314159` | +| n_hyperparameters_trials | [int](`int`) | Hyperparameter-tuning trial budget. | `20` | +| data_filename | [str](`str`) | Path to the merged interim CSV. | `'interim/energy_load.csv'` | +| targets | [Optional](`typing.Optional`)\[[List](`typing.List`)\[[str](`str`)\]\] | Active target column names. ``None`` until set after data loading. For ENTSO-E this is typically ``["Actual Load"]``. | `None` | +| use_outlier_detection | [bool](`bool`) | Apply IsolationForest-based outlier removal. Defaults to ``True``. | `True` | +| contamination | [float](`float`) | IsolationForest contamination fraction. | `0.01` | +| imputation_method | [str](`str`) | Gap-filling strategy. | `'weighted'` | +| window_size | [int](`int`) | Rolling window for weighted imputation. Also the LightGBM rolling-mean feature window in the ENTSO-E factories. | `72` | +| imputation_window_size | [Optional](`typing.Optional`)\[[int](`int`)\] | Width of the gap-penalty zone (in hours) around each imputed value for the ``"weighted"`` strategy. When ``None`` (default), falls back to ``window_size``, so existing behaviour is unchanged. Set this to decouple the imputation penalty zone from the rolling-feature window. | `None` | +| use_exogenous_features | [bool](`bool`) | Build weather/calendar/holiday features. | `True` | +| latitude | [float](`float`) | Location latitude. | `51.5136` | +| longitude | [float](`float`) | Location longitude. | `7.4653` | +| timezone | [str](`str`) | IANA timezone string. | `'UTC'` | +| state | [str](`str`) | Subdivision code for regional holidays. | `'NW'` | +| include_weather_windows | [bool](`bool`) | Weather-window feature toggle. | `False` | +| include_holiday_features | [bool](`bool`) | Holiday feature toggle. | `False` | +| include_holiday_adjacency_features | [bool](`bool`) | Brückentag and before/after-holiday indicator toggle. Defaults to ``False``. | `False` | +| poly_features_degree | [int](`int`) | Polynomial-interaction degree passed to the feature builder. ``1`` (default) generates no interactions; ``2`` adds pairwise bilinear terms; ``3+`` higher order. | `1` | +| max_poly_features | [int](`int`) | Cap on polynomial interaction columns. When more than this many ``poly_*`` columns are generated, only the top ``max_poly_features`` ranked by mutual information with the target are kept (``<= 0`` disables the cap). Defaults to ``10``. | `10` | +| poly_mi_n_jobs | [Optional](`typing.Optional`)\[[int](`int`)\] | Parallel jobs for the mutual-information ranking that enforces ``max_poly_features``. ``-1`` (default) uses all cores; ``None`` runs single-threaded. Parallelism does not change the selection. | `-1` | +| poly_mi_sample_size | [Optional](`typing.Optional`)\[[int](`int`)\] | Row cap for that ranking; longer series are scored on a reproducible random subsample of this size (seeded by ``random_state``), which can change which borderline columns make the top K. ``None`` scores every row (the pre-15.8 behaviour). Defaults to ``4000``. | `4000` | +| include_covid_infection_rate | [bool](`bool`) | Append the bundled German national COVID-19 7-day incidence (RKI) as an exogenous level regressor. Defaults to ``False``. | `False` | +| include_entsoe_forecast_load | [bool](`bool`) | Append the ENTSO-E day-ahead Forecasted Load as a near-oracle exogenous prior. Defaults to ``False``. | `False` | +| include_entsoe_renewable_forecast | [bool](`bool`) | Append the ENTSO-E day-ahead wind and solar generation forecast. Defaults to ``False``. | `False` | +| include_entsoe_net_load | [bool](`bool`) | Append the ENTSO-E day-ahead net load (Forecasted Load minus wind/solar forecast). Defaults to ``False``. | `False` | +| include_entsoe_day_ahead_price | [bool](`bool`) | Append the ENTSO-E day-ahead spot price (DE/LU). Defaults to ``False``. | `False` | +| index_name | [str](`str`) | Datetime column name when the DataFrame index is reset. ENTSO-E CSVs use ``"Time (UTC)"``; defaults to that. | `'Time (UTC)'` | +| bounds | [Optional](`typing.Optional`)\[[List](`typing.List`)\[[tuple](`tuple`)\]\] | Per-column outlier bounds. For single-target ENTSO-E this is typically ``None`` or a single ``[(lower, upper)]`` entry. | `None` | +| verbose | [bool](`bool`) | Verbose pipeline output. | `False` | +| cache_home | [Optional](`typing.Optional`)\[[Any](`typing.Any`)\] | Cache directory override. | `None` | +| n_trials_optuna | [int](`int`) | Optuna Bayesian-search trial budget. | `15` | +| n_trials_spotoptim | [int](`int`) | SpotOptim surrogate-search trial budget. | `10` | +| n_initial_spotoptim | [int](`int`) | SpotOptim initial random evaluations. | `5` | +| n_jobs_spotoptim | [Optional](`typing.Optional`)\[[int](`int`)\] | Worker count for SpotOptim's parallel (steady-state) evaluation. ``None`` (default) runs sequentially; ``-1`` uses all CPU cores; a positive integer pins the worker count. Parallel tuning is faster but, being steady-state, changes the search trajectory, so the tuned result is not bit-identical to a sequential run even with a fixed ``random_state``. | `None` | +| warm_start_lags | [bool](`bool`) | Seed the SpotOptim search with ``lags_consider``. | `False` | +| task | [str](`str`) | Active prediction task name. | `'lazy'` | +| agg_weights | [Optional](`typing.Optional`)\[[List](`typing.List`)\[[float](`float`)\]\] | Per-target aggregation weights. For single-target use this is typically ``[1.0]`` or ``None``. | `None` | +| forecaster_factory | [Optional](`typing.Optional`)\[[Any](`typing.Any`)\] | Callable ``factory(config, *, weight_func, target) -> forecaster`` consumed by ``BaseTask.create_forecaster``. ``None`` falls back to the default LightGBM factory. | `None` | +| data_loader | [Optional](`typing.Optional`)\[[Any](`typing.Any`)\] | Callable ``data_loader(config)`` returning a pandas DataFrame. Invoked by ``BaseTask.prepare_data`` when no DataFrame is supplied — the ENTSO-E pipeline hook for ``download_new_data`` / ``merge_build_manual``. | `None` | +| test_data_loader | [Optional](`typing.Optional`)\[[Any](`typing.Any`)\] | Callable ``test_data_loader(config)`` returning a pandas DataFrame with ground-truth values for the prediction horizon. Invoked by ``BaseTask.prepare_data`` when no test DataFrame is supplied; the returned frame populates ``test_actual`` and ``metrics_future`` in the prediction package. | `None` | +| auto_save_models | [bool](`bool`) | Whether ``BaseTask._run_strategy`` should persist fitted forecasters to ``/models/`` after every training run. Defaults to ``True``. | `True` | +| data_frame_name | [str](`str`) | Identifier for the active dataset. Used by ``BaseTask`` to name cache subdirectories, model files, and the per-dataset log file. Defaults to ``"default"``. | `'default'` | +| number_folds | [int](`int`) | Number of folds used by ``BaseTask.cv_ts`` when building the ``TimeSeriesSplit`` cross-validation splitter for tuning tasks. Defaults to ``10``. | `10` | +| on_weather_failure | [Literal](`typing.Literal`)\[\'raise\', \'skip\'\] | Policy for handling Open-Meteo fetch failures inside ``BaseTask.build_exogenous_features``. ``"raise"`` (default) aborts the pipeline with a ``WeatherFetchError`` and preserves the safety-critical fail-safe semantics. ``"skip"`` logs a warning and continues with empty weather features so the rest of the pipeline can run without the Open-Meteo dependency. | `'raise'` | +| on_exog_provider_failure | [Literal](`typing.Literal`)\[\'raise\', \'skip\'\] | Policy for an exogenous-provider failure inside ``ExogBuilder.build``. ``"raise"`` (default) propagates the ``ExogProviderError`` (fail-safe); ``"skip"`` logs a warning and omits that provider's columns. | `'raise'` | +| exog_max_gap_hours | [int](`int`) | Maximum length, in hours, of a contiguous run of missing exogenous-provider values healed before the provider is rejected. Interior gaps are time-interpolated; leading/trailing edge gaps are back-/forward-filled. ``0`` (default) keeps the strict fail-safe (any gap raises). Healed runs are logged with count and span. Only already-published day-ahead vintages are involved, so healing is leakage-clean (CR-3). | `0` | +| exog_max_tail_gap_hours | [int](`int`) | Extended healing budget, in hours, applied exclusively to the trailing-edge NaN run (the run containing the last index timestamp). The effective tail budget is ``max(exog_max_gap_hours, exog_max_tail_gap_hours)``. The canonical use case is the ENTSO-E day-ahead publication frontier: the last published vintage is zero-order-held forward to the forecast horizon without touching interior gaps (CR-3-clean). When ``exog_max_tail_gap_hours <= exog_max_gap_hours`` the parameter is inert (the interior budget already covers the tail) and a warning is logged. Defaults to ``0``. | `0` | +| exog_provider_window | [Literal](`typing.Literal`)\[\'full\', \'train\'\] | Span the exogenous providers are validated against. ``"full"`` (default) requires coverage of the entire ``data_start``→``cov_end`` request, matching prior behaviour. ``"train"`` validates only the consumed window ``[start_train_ts, cov_end]``, tolerating missing values before the training window. Honoured by the MultiTask pipeline; the forecaster-wrapper path currently always validates the full span. | `'full'` | +| retrain_max_age | [pd](`pandas`).[Timedelta](`pandas.Timedelta`) | Maximum age of a previously trained model before retraining is required. Consumed by ``spotforecast2_safe.manager.trainer.should_retrain`` to gate scheduled retraining workflows. Defaults to ``Timedelta(days=7)``. | `(lambda: pd.Timedelta(days=7))()` | ## Attributes {.doc-section .doc-section-attributes} diff --git a/docs/reference/configurator.config_multi.ConfigMulti.qmd b/docs/reference/configurator.config_multi.ConfigMulti.qmd index 6c657ad1..59b90762 100644 --- a/docs/reference/configurator.config_multi.ConfigMulti.qmd +++ b/docs/reference/configurator.config_multi.ConfigMulti.qmd @@ -3,11 +3,11 @@ ```python configurator.config_multi.ConfigMulti( country_code='DE', - periods=None, - lags_consider=None, - train_size=None, + periods=default_periods(), + lags_consider=(lambda: list(range(1, 24)))(), + train_size=(lambda: pd.Timedelta(days=(3 * 365)))(), end_train_default='2025-12-31 00:00+00:00', - delta_val=None, + delta_val=(lambda: pd.Timedelta(hours=(24 * 7 * 10)))(), predict_size=24, cv_block_size=None, refit_size=7, @@ -83,54 +83,54 @@ API queries and holiday feature generation. ## Parameters {.doc-section .doc-section-parameters} -| Name | Type | Description | Default | -|------------------------------------|------------------------------------------------------------------------------------------------------|------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------|-----------------------------| -| country_code | [str](`str`) | ISO 3166-1 alpha-2 country code (e.g. ``"DE"``). Used for both API queries and holiday feature generation. | `'DE'` | -| periods | [Optional](`typing.Optional`)\[[List](`typing.List`)\[[Period](`spotforecast2_safe.data.Period`)\]\] | List of Period objects defining cyclical feature encodings. | `None` | -| lags_consider | [Optional](`typing.Optional`)\[[List](`typing.List`)\[[int](`int`)\]\] | List of lag values to consider for feature selection. | `None` | -| train_size | [Optional](`typing.Optional`)\[[pd](`pandas`).[Timedelta](`pandas.Timedelta`)\] | Time window for training data. | `None` | -| end_train_default | [str](`str`) | Default end date for training period (ISO format with timezone). | `'2025-12-31 00:00+00:00'` | -| delta_val | [Optional](`typing.Optional`)\[[pd](`pandas`).[Timedelta](`pandas.Timedelta`)\] | Validation window size. | `None` | -| predict_size | [int](`int`) | Number of hours to predict ahead. | `24` | -| cv_block_size | [int](`int`) \| None | Cross-validation test-block width in hours. Defaults to ``None``, meaning the CV uses ``predict_size``. Set to a fixed value (e.g. ``24``) to decouple the cross-validation horizon from a render-dependent live ``predict_size``. | `None` | -| refit_size | [int](`int`) | Number of days between model refits. | `7` | -| random_state | [int](`int`) | Random seed for reproducibility. | `314159` | -| n_hyperparameters_trials | [int](`int`) | Number of trials for hyperparameter optimization. | `20` | -| data_filename | [str](`str`) | Path to the interim merged data file. | `'interim/energy_load.csv'` | -| targets | [Optional](`typing.Optional`)\[[List](`typing.List`)\[[str](`str`)\]\] | List of target column names to train models for. When ``None`` (default), no targets are pre-selected; set this attribute after loading the dataset (e.g. ``config.targets = df.columns.tolist()``). Replaces standalone ``TARGETS`` and ``target_columns`` variables in pipeline scripts, providing a single source of truth for the active target set. | `None` | -| use_outlier_detection | [bool](`bool`) | If True, apply IsolationForest-based outlier removal. | `True` | -| contamination | [float](`float`) | Proportion of outliers for IsolationForest (0 < contamination < 0.5). | `0.01` | -| imputation_method | [str](`str`) | Gap-filling strategy — ``"weighted"`` (n2n-style rolling weights) or ``"linear"`` (linear interpolation). | `'weighted'` | -| window_size | [int](`int`) | Rolling window size in hours for gap detection (weighted imputation). | `72` | -| use_exogenous_features | [bool](`bool`) | If True, build weather/calendar/day-night/holiday features. | `True` | -| latitude | [float](`float`) | Latitude of the target location in decimal degrees. | `51.5136` | -| longitude | [float](`float`) | Longitude of the target location in decimal degrees. | `7.4653` | -| timezone | [str](`str`) | IANA timezone string for the target location (e.g. ``"Europe/Berlin"``). | `'UTC'` | -| state | [str](`str`) | ISO 3166-2 subdivision code for regional holidays (e.g. ``"NW"``). | `'NW'` | -| include_weather_windows | [bool](`bool`) | If True, include rolling weather-window features. | `False` | -| include_holiday_features | [bool](`bool`) | If True, include public-holiday indicator features. | `False` | -| include_holiday_adjacency_features | [bool](`bool`) | If True, include Brückentag and before/after-holiday indicators (``is_brueckentag``, ``is_before_holiday``, ``is_after_holiday``). Defaults to ``False``. | `False` | -| poly_features_degree | [int](`int`) | Polynomial-interaction degree. ``1`` (default) generates no interactions; ``2`` adds pairwise bilinear terms; ``3+`` higher order. | `1` | -| max_poly_features | [int](`int`) | Cap on polynomial interaction columns; only the top ``max_poly_features`` ranked by mutual information with the target are kept (``<= 0`` disables). Defaults to ``10``. | `10` | -| poly_mi_n_jobs | [Optional](`typing.Optional`)\[[int](`int`)\] | Parallel jobs for the mutual-information ranking that enforces ``max_poly_features``. ``-1`` (default) uses all cores; ``None`` runs single-threaded. Parallelism does not change the selection. | `-1` | -| poly_mi_sample_size | [Optional](`typing.Optional`)\[[int](`int`)\] | Row cap for that ranking; longer series are scored on a reproducible random subsample of this size (seeded by ``random_state``), which can change which borderline columns make the top K. ``None`` scores every row (the pre-15.8 behaviour). Defaults to ``4000``. | `4000` | -| index_name | [str](`str`) | Name assigned to the datetime column when the index is reset. Defaults to ``"DateTime"``. | `'DateTime'` | -| bounds | [Optional](`typing.Optional`)\[[List](`typing.List`)\[[tuple](`tuple`)\]\] | Per-column outlier bounds as a list of ``(lower, upper)`` tuples, one entry per target column. ``None`` until set. | `None` | -| verbose | [bool](`bool`) | If ``True``, enable verbose output for pipeline steps. Defaults to ``False``. | `False` | -| cache_home | [Optional](`typing.Optional`)\[[Any](`typing.Any`)\] | Path to the cache directory. ``None`` means the library default (``~/spotforecast2_cache/``) is used. | `None` | -| n_trials_optuna | [int](`int`) | Number of Optuna Bayesian-search trials for hyperparameter optimization (task 3). Defaults to ``15``. | `15` | -| n_trials_spotoptim | [int](`int`) | Number of SpotOptim surrogate-search trials (task 4). Defaults to ``10``. | `10` | -| n_initial_spotoptim | [int](`int`) | Number of initial random evaluations for SpotOptim (task 4). Defaults to ``5``. | `5` | -| n_jobs_spotoptim | [Optional](`typing.Optional`)\[[int](`int`)\] | Worker count for SpotOptim's parallel (steady-state) evaluation. ``None`` (default) runs sequentially; ``-1`` uses all CPU cores; a positive integer pins the worker count. Parallel tuning is faster but, being steady-state, changes the search trajectory, so the tuned result is not bit-identical to a sequential run even with a fixed ``random_state``. | `None` | -| warm_start_lags | [bool](`bool`) | When True, the SpotOptim task injects ``lags_consider`` as a candidate lag set and seeds the optimizer's first evaluation with it. Defaults to ``False``. | `False` | -| task | [str](`str`) | Active prediction task — one of ``"lazy"``, ``"training"``, ``"optuna"``, or ``"spotoptim"``. Defaults to ``"lazy"``. | `'lazy'` | -| agg_weights | [Optional](`typing.Optional`)\[[List](`typing.List`)\[[float](`float`)\]\] | Per-target aggregation weights used when combining individual target forecasts into a single weighted sum. The list must contain one weight per entry in ``targets`` (in the same order). Positive values add the target's contribution; negative values invert it. Slice the list to ``agg_weights[:len(targets)]`` when only a subset of targets is active. Defaults to ``None`` (no weights pre-defined; set after loading the dataset). | `None` | -| auto_save_models | [bool](`bool`) | Whether ``BaseTask._run_strategy`` should persist fitted forecasters to ``/models/`` after every training run. Defaults to ``True`` so that saved models are immediately available for ``PredictTask`` without an explicit ``save_models()`` call. | `True` | -| data_frame_name | [str](`str`) | Identifier for the active dataset. Used by ``BaseTask`` to name cache subdirectories, model files, and the per-dataset log file. Defaults to ``"default"``. | `'default'` | -| on_weather_failure | [Literal](`typing.Literal`)\[\'raise\', \'skip\'\] | Policy for handling Open-Meteo fetch failures inside ``BaseTask.build_exogenous_features``. ``"raise"`` (default) aborts the pipeline with a ``WeatherFetchError`` and preserves the safety-critical fail-safe semantics. ``"skip"`` logs a warning and continues with empty weather features so the rest of the pipeline can run without the Open-Meteo dependency. | `'raise'` | -| exog_max_gap_hours | [int](`int`) | Maximum length, in hours, of a contiguous run of missing exogenous-provider values healed before the provider is rejected. Interior gaps are time-interpolated; leading/trailing edge gaps are back-/forward-filled. ``0`` (default) keeps the strict fail-safe (any gap raises). Healed runs are logged with count and span. Only already-published day-ahead vintages are involved, so healing is leakage-clean (CR-3). | `0` | -| exog_max_tail_gap_hours | [int](`int`) | Extended healing budget, in hours, applied exclusively to the trailing-edge NaN run (the run containing the last index timestamp). The effective tail budget is ``max(exog_max_gap_hours, exog_max_tail_gap_hours)``. The canonical use case is the ENTSO-E day-ahead publication frontier: the last published vintage is zero-order-held forward to the forecast horizon without touching interior gaps (CR-3-clean). When ``exog_max_tail_gap_hours <= exog_max_gap_hours`` the parameter is inert (the interior budget already covers the tail) and a warning is logged. Defaults to ``0``. | `0` | -| exog_provider_window | [Literal](`typing.Literal`)\[\'full\', \'train\'\] | Span the exogenous providers are validated against. ``"full"`` (default) requires coverage of the entire ``data_start``→``cov_end`` request, matching prior behaviour. ``"train"`` validates only the consumed window ``[start_train_ts, cov_end]``, tolerating missing values before the training window. Honoured by the MultiTask pipeline; the forecaster-wrapper path currently always validates the full span. | `'full'` | +| Name | Type | Description | Default | +|------------------------------------|------------------------------------------------------------------------------------------------------|------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------|-------------------------------------------------| +| country_code | [str](`str`) | ISO 3166-1 alpha-2 country code (e.g. ``"DE"``). Used for both API queries and holiday feature generation. | `'DE'` | +| periods | [Optional](`typing.Optional`)\[[List](`typing.List`)\[[Period](`spotforecast2_safe.data.Period`)\]\] | List of Period objects defining cyclical feature encodings. | `default_periods()` | +| lags_consider | [Optional](`typing.Optional`)\[[List](`typing.List`)\[[int](`int`)\]\] | List of lag values to consider for feature selection. | `(lambda: list(range(1, 24)))()` | +| train_size | [Optional](`typing.Optional`)\[[pd](`pandas`).[Timedelta](`pandas.Timedelta`)\] | Time window for training data. | `(lambda: pd.Timedelta(days=(3 * 365)))()` | +| end_train_default | [str](`str`) | Default end date for training period (ISO format with timezone). | `'2025-12-31 00:00+00:00'` | +| delta_val | [Optional](`typing.Optional`)\[[pd](`pandas`).[Timedelta](`pandas.Timedelta`)\] | Validation window size. | `(lambda: pd.Timedelta(hours=(24 * 7 * 10)))()` | +| predict_size | [int](`int`) | Number of hours to predict ahead. | `24` | +| cv_block_size | [int](`int`) \| None | Cross-validation test-block width in hours. Defaults to ``None``, meaning the CV uses ``predict_size``. Set to a fixed value (e.g. ``24``) to decouple the cross-validation horizon from a render-dependent live ``predict_size``. | `None` | +| refit_size | [int](`int`) | Number of days between model refits. | `7` | +| random_state | [int](`int`) | Random seed for reproducibility. | `314159` | +| n_hyperparameters_trials | [int](`int`) | Number of trials for hyperparameter optimization. | `20` | +| data_filename | [str](`str`) | Path to the interim merged data file. | `'interim/energy_load.csv'` | +| targets | [Optional](`typing.Optional`)\[[List](`typing.List`)\[[str](`str`)\]\] | List of target column names to train models for. When ``None`` (default), no targets are pre-selected; set this attribute after loading the dataset (e.g. ``config.targets = df.columns.tolist()``). Replaces standalone ``TARGETS`` and ``target_columns`` variables in pipeline scripts, providing a single source of truth for the active target set. | `None` | +| use_outlier_detection | [bool](`bool`) | If True, apply IsolationForest-based outlier removal. | `True` | +| contamination | [float](`float`) | Proportion of outliers for IsolationForest (0 < contamination < 0.5). | `0.01` | +| imputation_method | [str](`str`) | Gap-filling strategy — ``"weighted"`` (n2n-style rolling weights) or ``"linear"`` (linear interpolation). | `'weighted'` | +| window_size | [int](`int`) | Rolling window size in hours for gap detection (weighted imputation). | `72` | +| use_exogenous_features | [bool](`bool`) | If True, build weather/calendar/day-night/holiday features. | `True` | +| latitude | [float](`float`) | Latitude of the target location in decimal degrees. | `51.5136` | +| longitude | [float](`float`) | Longitude of the target location in decimal degrees. | `7.4653` | +| timezone | [str](`str`) | IANA timezone string for the target location (e.g. ``"Europe/Berlin"``). | `'UTC'` | +| state | [str](`str`) | ISO 3166-2 subdivision code for regional holidays (e.g. ``"NW"``). | `'NW'` | +| include_weather_windows | [bool](`bool`) | If True, include rolling weather-window features. | `False` | +| include_holiday_features | [bool](`bool`) | If True, include public-holiday indicator features. | `False` | +| include_holiday_adjacency_features | [bool](`bool`) | If True, include Brückentag and before/after-holiday indicators (``is_brueckentag``, ``is_before_holiday``, ``is_after_holiday``). Defaults to ``False``. | `False` | +| poly_features_degree | [int](`int`) | Polynomial-interaction degree. ``1`` (default) generates no interactions; ``2`` adds pairwise bilinear terms; ``3+`` higher order. | `1` | +| max_poly_features | [int](`int`) | Cap on polynomial interaction columns; only the top ``max_poly_features`` ranked by mutual information with the target are kept (``<= 0`` disables). Defaults to ``10``. | `10` | +| poly_mi_n_jobs | [Optional](`typing.Optional`)\[[int](`int`)\] | Parallel jobs for the mutual-information ranking that enforces ``max_poly_features``. ``-1`` (default) uses all cores; ``None`` runs single-threaded. Parallelism does not change the selection. | `-1` | +| poly_mi_sample_size | [Optional](`typing.Optional`)\[[int](`int`)\] | Row cap for that ranking; longer series are scored on a reproducible random subsample of this size (seeded by ``random_state``), which can change which borderline columns make the top K. ``None`` scores every row (the pre-15.8 behaviour). Defaults to ``4000``. | `4000` | +| index_name | [str](`str`) | Name assigned to the datetime column when the index is reset. Defaults to ``"DateTime"``. | `'DateTime'` | +| bounds | [Optional](`typing.Optional`)\[[List](`typing.List`)\[[tuple](`tuple`)\]\] | Per-column outlier bounds as a list of ``(lower, upper)`` tuples, one entry per target column. ``None`` until set. | `None` | +| verbose | [bool](`bool`) | If ``True``, enable verbose output for pipeline steps. Defaults to ``False``. | `False` | +| cache_home | [Optional](`typing.Optional`)\[[Any](`typing.Any`)\] | Path to the cache directory. ``None`` means the library default (``~/spotforecast2_cache/``) is used. | `None` | +| n_trials_optuna | [int](`int`) | Number of Optuna Bayesian-search trials for hyperparameter optimization (task 3). Defaults to ``15``. | `15` | +| n_trials_spotoptim | [int](`int`) | Number of SpotOptim surrogate-search trials (task 4). Defaults to ``10``. | `10` | +| n_initial_spotoptim | [int](`int`) | Number of initial random evaluations for SpotOptim (task 4). Defaults to ``5``. | `5` | +| n_jobs_spotoptim | [Optional](`typing.Optional`)\[[int](`int`)\] | Worker count for SpotOptim's parallel (steady-state) evaluation. ``None`` (default) runs sequentially; ``-1`` uses all CPU cores; a positive integer pins the worker count. Parallel tuning is faster but, being steady-state, changes the search trajectory, so the tuned result is not bit-identical to a sequential run even with a fixed ``random_state``. | `None` | +| warm_start_lags | [bool](`bool`) | When True, the SpotOptim task injects ``lags_consider`` as a candidate lag set and seeds the optimizer's first evaluation with it. Defaults to ``False``. | `False` | +| task | [str](`str`) | Active prediction task — one of ``"lazy"``, ``"training"``, ``"optuna"``, or ``"spotoptim"``. Defaults to ``"lazy"``. | `'lazy'` | +| agg_weights | [Optional](`typing.Optional`)\[[List](`typing.List`)\[[float](`float`)\]\] | Per-target aggregation weights used when combining individual target forecasts into a single weighted sum. The list must contain one weight per entry in ``targets`` (in the same order). Positive values add the target's contribution; negative values invert it. Slice the list to ``agg_weights[:len(targets)]`` when only a subset of targets is active. Defaults to ``None`` (no weights pre-defined; set after loading the dataset). | `None` | +| auto_save_models | [bool](`bool`) | Whether ``BaseTask._run_strategy`` should persist fitted forecasters to ``/models/`` after every training run. Defaults to ``True`` so that saved models are immediately available for ``PredictTask`` without an explicit ``save_models()`` call. | `True` | +| data_frame_name | [str](`str`) | Identifier for the active dataset. Used by ``BaseTask`` to name cache subdirectories, model files, and the per-dataset log file. Defaults to ``"default"``. | `'default'` | +| on_weather_failure | [Literal](`typing.Literal`)\[\'raise\', \'skip\'\] | Policy for handling Open-Meteo fetch failures inside ``BaseTask.build_exogenous_features``. ``"raise"`` (default) aborts the pipeline with a ``WeatherFetchError`` and preserves the safety-critical fail-safe semantics. ``"skip"`` logs a warning and continues with empty weather features so the rest of the pipeline can run without the Open-Meteo dependency. | `'raise'` | +| exog_max_gap_hours | [int](`int`) | Maximum length, in hours, of a contiguous run of missing exogenous-provider values healed before the provider is rejected. Interior gaps are time-interpolated; leading/trailing edge gaps are back-/forward-filled. ``0`` (default) keeps the strict fail-safe (any gap raises). Healed runs are logged with count and span. Only already-published day-ahead vintages are involved, so healing is leakage-clean (CR-3). | `0` | +| exog_max_tail_gap_hours | [int](`int`) | Extended healing budget, in hours, applied exclusively to the trailing-edge NaN run (the run containing the last index timestamp). The effective tail budget is ``max(exog_max_gap_hours, exog_max_tail_gap_hours)``. The canonical use case is the ENTSO-E day-ahead publication frontier: the last published vintage is zero-order-held forward to the forecast horizon without touching interior gaps (CR-3-clean). When ``exog_max_tail_gap_hours <= exog_max_gap_hours`` the parameter is inert (the interior budget already covers the tail) and a warning is logged. Defaults to ``0``. | `0` | +| exog_provider_window | [Literal](`typing.Literal`)\[\'full\', \'train\'\] | Span the exogenous providers are validated against. ``"full"`` (default) requires coverage of the entire ``data_start``→``cov_end`` request, matching prior behaviour. ``"train"`` validates only the consumed window ``[start_train_ts, cov_end]``, tolerating missing values before the training window. Honoured by the MultiTask pipeline; the forecaster-wrapper path currently always validates the full span. | `'full'` | ## Attributes {.doc-section .doc-section-attributes} diff --git a/src/spotforecast2_safe/configurator/_base_config.py b/src/spotforecast2_safe/configurator/_base_config.py index b2a17dd9..e45f652c 100644 --- a/src/spotforecast2_safe/configurator/_base_config.py +++ b/src/spotforecast2_safe/configurator/_base_config.py @@ -7,7 +7,8 @@ defaults and the same ``get_params`` / ``set_params`` semantics. Those were previously hand-duplicated in both classes, which let fields drift between the two copies. These helpers centralise the logic so each config only declares its -ordered parameter names (``_PARAM_NAMES``) and its ``__init__``; the +``@dataclass`` fields; ``_PARAM_NAMES`` is derived from +``dataclasses.fields()`` (so it can never drift from the fields), and the ``get_params`` / ``set_params`` behaviour and the period defaults live here. """ @@ -270,8 +271,7 @@ def validate_config(config: object) -> None: on_weather_failure = getattr(config, "on_weather_failure", None) if on_weather_failure is not None and on_weather_failure not in ("raise", "skip"): raise ValueError( - "on_weather_failure must be 'raise' or 'skip'; got " - f"{on_weather_failure!r}." + f"on_weather_failure must be 'raise' or 'skip'; got {on_weather_failure!r}." ) on_exog_provider_failure = getattr(config, "on_exog_provider_failure", None) diff --git a/src/spotforecast2_safe/configurator/config_entsoe.py b/src/spotforecast2_safe/configurator/config_entsoe.py index b5dcb483..bf86ab96 100644 --- a/src/spotforecast2_safe/configurator/config_entsoe.py +++ b/src/spotforecast2_safe/configurator/config_entsoe.py @@ -3,6 +3,7 @@ """Configuration for ENTSO-E task pipeline.""" +from dataclasses import dataclass, field, fields from typing import Any, Dict, List, Literal, Optional import pandas as pd @@ -16,6 +17,7 @@ from spotforecast2_safe.data import Period +@dataclass class ConfigEntsoe: """Configuration for the ENTSO-E forecasting pipeline. @@ -220,310 +222,126 @@ class ConfigEntsoe: ``` """ - _PARAM_NAMES = ( - "country_code", - "periods", - "lags_consider", - "train_size", - "end_train_default", - "delta_val", - "predict_size", - "cv_block_size", - "refit_size", - "random_state", - "n_hyperparameters_trials", - "data_filename", - "targets", - "use_outlier_detection", - "contamination", - "imputation_method", - "window_size", - "imputation_window_size", - "use_exogenous_features", - "latitude", - "longitude", - "timezone", - "state", - "include_weather_windows", - "include_holiday_features", - "include_holiday_adjacency_features", - "poly_features_degree", - "max_poly_features", - "poly_mi_n_jobs", - "poly_mi_sample_size", - "include_covid_infection_rate", - "include_entsoe_forecast_load", - "include_entsoe_renewable_forecast", - "include_entsoe_net_load", - "include_entsoe_day_ahead_price", - "index_name", - "bounds", - "verbose", - "cache_home", - "n_trials_optuna", - "n_trials_spotoptim", - "n_initial_spotoptim", - "n_jobs_spotoptim", - "warm_start_lags", - "task", - "agg_weights", - "forecaster_factory", - "data_loader", - "test_data_loader", - "auto_save_models", - "data_frame_name", - "number_folds", - "on_weather_failure", - "on_exog_provider_failure", - "exog_max_gap_hours", - "exog_max_tail_gap_hours", - "exog_provider_window", - "retrain_max_age", - "target_qc_range_mw", - "target_qc_step_mw", - "target_qc_window_days", - "target_corruption_policy", - "target_max_heal_hours", - "target_anchor_zone_hours", - "target_qc_deviation_mw", - "target_qc_deviation_ref", - "target_qc_deviation_slots", + country_code: str = "DE" + periods: List[Period] = field(default_factory=default_periods) + lags_consider: List[int] = field(default_factory=lambda: list(range(1, 24))) + train_size: pd.Timedelta = field(default_factory=lambda: pd.Timedelta(days=3 * 365)) + end_train_default: str = "2025-12-31 00:00+00:00" + delta_val: pd.Timedelta = field( + default_factory=lambda: pd.Timedelta(hours=24 * 7 * 10) ) + predict_size: int = 24 + # Cross-validation test-block width (hours). ``None`` defers to + # ``predict_size``; the actual CV-split logic lives in the sibling + # ``spotforecast2`` package (``BaseTask.cv_ts``). + cv_block_size: Optional[int] = None + refit_size: int = 7 + random_state: int = 314159 + n_hyperparameters_trials: int = 20 + data_filename: str = "interim/energy_load.csv" + targets: Optional[List[str]] = None + # Outlier detection + use_outlier_detection: bool = True + contamination: float = 0.01 + # Imputation + imputation_method: str = "weighted" + window_size: int = 72 + imputation_window_size: Optional[int] = None + # Exogenous features + use_exogenous_features: bool = True + latitude: float = 51.5136 + longitude: float = 7.4653 + timezone: str = "UTC" + state: str = "NW" + # Feature selection toggles + include_weather_windows: bool = False + include_holiday_features: bool = False + include_holiday_adjacency_features: bool = False + poly_features_degree: int = 1 + max_poly_features: int = 10 + poly_mi_n_jobs: Optional[int] = -1 + poly_mi_sample_size: Optional[int] = 4000 + # Provider-based exogenous toggles, each gated by a registry flag in + # ``spotforecast2_safe.preprocessing.exog_providers``. + include_covid_infection_rate: bool = False + include_entsoe_forecast_load: bool = False + include_entsoe_renewable_forecast: bool = False + include_entsoe_net_load: bool = False + include_entsoe_day_ahead_price: bool = False + # Data source and index (ENTSO-E CSVs use "Time (UTC)") + index_name: str = "Time (UTC)" + # Per-column outlier bounds + bounds: Optional[List[tuple]] = None + # Verbosity and caching + verbose: bool = False + cache_home: Optional[Any] = None + # Hyperparameter tuning trial budgets + n_trials_optuna: int = 15 + n_trials_spotoptim: int = 10 + n_initial_spotoptim: int = 5 + # SpotOptim parallel-evaluation worker count (None=serial, -1=all cores); + # consumed by spotforecast2.multitask.strategies.SpotOptimStrategy + n_jobs_spotoptim: Optional[int] = None + # Seed the SpotOptim search with ``lags_consider`` (consumed by + # spotforecast2.multitask.strategies.SpotOptimStrategy) + warm_start_lags: bool = False + # Active task + task: str = "lazy" + # Aggregation weights (single-target uses [1.0] or None) + agg_weights: Optional[List[float]] = None + # Forecaster factory hook (consumed by spotforecast2.multitask.base): + # ``factory(config, *, weight_func, target) -> forecaster``. + forecaster_factory: Optional[Any] = None + # Data-loader hook (consumed by ``BaseTask.prepare_data``): + # ``data_loader(config) -> pd.DataFrame``, invoked when no DataFrame is + # supplied. + data_loader: Optional[Any] = None + # Test-data-loader hook (consumed by ``BaseTask.prepare_data``): mirrors + # ``data_loader`` for the test/ground-truth slice. + test_data_loader: Optional[Any] = None + # Persistence policy and active-dataset name (consumed by + # spotforecast2.multitask.base). + auto_save_models: bool = True + data_frame_name: str = "default" + # Cross-validation fold count (consumed by spotforecast2.multitask.base.cv_ts) + number_folds: int = 10 + # Weather-fetch failure policy (consumed by + # spotforecast2.multitask.base.build_exogenous_features) + on_weather_failure: Literal["raise", "skip"] = "raise" + # Exog-provider failure policy (consumed by + # preprocessing.exog_builder.ExogBuilder) + on_exog_provider_failure: Literal["raise", "skip"] = "raise" + # Gap-healing budget for exog providers (0 = strict fail-safe) + exog_max_gap_hours: int = 0 + # Extended trailing-edge healing budget (0 = same as exog_max_gap_hours) + exog_max_tail_gap_hours: int = 0 + # Validation window for exog providers ("full" or "train") + exog_provider_window: Literal["full", "train"] = "full" + # Maximum age of a previously trained model before retraining is required + # (consumed by spotforecast2_safe.manager.trainer.should_retrain) + retrain_max_age: pd.Timedelta = field(default_factory=lambda: pd.Timedelta(days=7)) + # Target-side corruption detector knobs. Detector active only when + # target_qc_window_days AND at least one of target_qc_range_mw / + # target_qc_step_mw / target_qc_deviation_mw are set. Defaults are all + # None / off, so the pipeline is byte-identical to the pre-feature baseline. + # Recommended episode policy: "truncate" (auto-extends predict_size). + # "heal" under the default anchor_zone_hours=168 with a <=7-day QC window + # never engages (refusal by design). The deviation rule (dropout-only, vs a + # published reference column such as "Forecasted Load") catches corruption + # that stays below the dynamics thresholds; when enabling it, scope + # ``targets`` to the actuals so heal/truncate leave the reference intact. + target_qc_range_mw: Optional[float] = None + target_qc_step_mw: Optional[float] = None + target_qc_window_days: Optional[int] = None + target_corruption_policy: str = "abort" + target_max_heal_hours: int = 0 + target_anchor_zone_hours: int = 168 + target_qc_deviation_mw: Optional[float] = None + target_qc_deviation_ref: Optional[str] = None + target_qc_deviation_slots: int = 2 - def __init__( - self, - country_code: str = "DE", - periods: Optional[List[Period]] = None, - lags_consider: Optional[List[int]] = None, - train_size: Optional[pd.Timedelta] = None, - end_train_default: str = "2025-12-31 00:00+00:00", - delta_val: Optional[pd.Timedelta] = None, - predict_size: int = 24, - cv_block_size: Optional[int] = None, - refit_size: int = 7, - random_state: int = 314159, - n_hyperparameters_trials: int = 20, - data_filename: str = "interim/energy_load.csv", - targets: Optional[List[str]] = None, - # Outlier detection - use_outlier_detection: bool = True, - contamination: float = 0.01, - # Imputation - imputation_method: str = "weighted", - window_size: int = 72, - imputation_window_size: Optional[int] = None, - # Exogenous features - use_exogenous_features: bool = True, - latitude: float = 51.5136, - longitude: float = 7.4653, - timezone: str = "UTC", - state: str = "NW", - # Feature selection toggles - include_weather_windows: bool = False, - include_holiday_features: bool = False, - include_holiday_adjacency_features: bool = False, - poly_features_degree: int = 1, - max_poly_features: int = 10, - poly_mi_n_jobs: Optional[int] = -1, - poly_mi_sample_size: Optional[int] = 4000, - # Provider-based exogenous toggles (preprocessing.exog_providers) - include_covid_infection_rate: bool = False, - include_entsoe_forecast_load: bool = False, - include_entsoe_renewable_forecast: bool = False, - include_entsoe_net_load: bool = False, - include_entsoe_day_ahead_price: bool = False, - # Data source and index - index_name: str = "Time (UTC)", - # Per-column outlier bounds - bounds: Optional[List[tuple]] = None, - # Verbosity and caching - verbose: bool = False, - cache_home: Optional[Any] = None, - # Hyperparameter tuning trial budgets - n_trials_optuna: int = 15, - n_trials_spotoptim: int = 10, - n_initial_spotoptim: int = 5, - # SpotOptim parallel-evaluation worker count (None=serial, -1=all cores); - # consumed by spotforecast2.multitask.strategies.SpotOptimStrategy - n_jobs_spotoptim: Optional[int] = None, - # Seed the SpotOptim search with ``lags_consider`` (consumed by - # spotforecast2.multitask.strategies.SpotOptimStrategy) - warm_start_lags: bool = False, - # Active task - task: str = "lazy", - # Aggregation weights (single-target uses [1.0] or None) - agg_weights: Optional[List[float]] = None, - # Forecaster factory hook (consumed by spotforecast2.multitask.base) - forecaster_factory: Optional[Any] = None, - # Data-loader hook (consumed by spotforecast2.multitask.base.prepare_data) - data_loader: Optional[Any] = None, - # Test-data-loader hook (consumed by spotforecast2.multitask.base.prepare_data) - test_data_loader: Optional[Any] = None, - # Persistence policy and active-dataset name (consumed by spotforecast2.multitask.base) - auto_save_models: bool = True, - data_frame_name: str = "default", - # Cross-validation fold count (consumed by spotforecast2.multitask.base.cv_ts) - number_folds: int = 10, - # Weather-fetch failure policy (consumed by spotforecast2.multitask.base.build_exogenous_features) - on_weather_failure: Literal["raise", "skip"] = "raise", - # Exog-provider failure policy (consumed by preprocessing.exog_builder.ExogBuilder) - on_exog_provider_failure: Literal["raise", "skip"] = "raise", - # Gap-healing budget for exog providers (0 = strict fail-safe) - exog_max_gap_hours: int = 0, - # Extended trailing-edge healing budget (0 = same as exog_max_gap_hours) - exog_max_tail_gap_hours: int = 0, - # Validation window for exog providers ("full" or "train") - exog_provider_window: Literal["full", "train"] = "full", - # Retraining cadence (consumed by spotforecast2_safe.manager.trainer.should_retrain) - retrain_max_age: Optional[pd.Timedelta] = None, - # Target-side corruption detector knobs. - # Detector active only when target_qc_window_days AND at least one of - # target_qc_range_mw / target_qc_step_mw / target_qc_deviation_mw are - # set. Defaults are all None / off, so the pipeline is byte-identical - # to the pre-feature baseline. - # Recommended episode policy: "truncate" (auto-extends predict_size). - # "heal" under the default anchor_zone_hours=168 with a <=7-day QC window - # never engages (refusal by design — lowering the zone is a deliberate - # operator decision). - # The deviation rule (dropout-only, vs a published reference column - # such as "Forecasted Load") catches corruption that stays below the - # dynamics thresholds; when enabling it, scope `targets` to the - # actuals so heal/truncate leave the reference column intact. - target_qc_range_mw: Optional[float] = None, - target_qc_step_mw: Optional[float] = None, - target_qc_window_days: Optional[int] = None, - target_corruption_policy: str = "abort", - target_max_heal_hours: int = 0, - target_anchor_zone_hours: int = 168, - target_qc_deviation_mw: Optional[float] = None, - target_qc_deviation_ref: Optional[str] = None, - target_qc_deviation_slots: int = 2, - ): - """Initialize ConfigEntsoe with specified or default parameters.""" - self.country_code = country_code - - self.periods = periods if periods is not None else default_periods() - self.lags_consider = ( - lags_consider if lags_consider is not None else list(range(1, 24)) - ) - self.train_size = ( - train_size if train_size is not None else pd.Timedelta(days=3 * 365) - ) - self.end_train_default = end_train_default - self.delta_val = ( - delta_val if delta_val is not None else pd.Timedelta(hours=24 * 7 * 10) - ) - self.predict_size = predict_size - # Cross-validation test-block width (hours). ``None`` defers to - # ``predict_size``; the actual CV-split logic lives in the sibling - # ``spotforecast2`` package (``BaseTask.cv_ts``). - self.cv_block_size = cv_block_size - self.refit_size = refit_size - self.random_state = random_state - self.n_hyperparameters_trials = n_hyperparameters_trials - self.data_filename = data_filename - self.targets = targets - # Outlier detection - self.use_outlier_detection = use_outlier_detection - self.contamination = contamination - # Imputation - self.imputation_method = imputation_method - self.window_size = window_size - self.imputation_window_size = imputation_window_size - # Exogenous features - self.use_exogenous_features = use_exogenous_features - self.latitude = latitude - self.longitude = longitude - self.timezone = timezone - self.state = state - # Feature selection toggles - self.include_weather_windows = include_weather_windows - self.include_holiday_features = include_holiday_features - self.include_holiday_adjacency_features = include_holiday_adjacency_features - self.poly_features_degree = poly_features_degree - self.max_poly_features = max_poly_features - self.poly_mi_n_jobs = poly_mi_n_jobs - self.poly_mi_sample_size = poly_mi_sample_size - # Provider-based exogenous toggles, each gated by a registry flag in - # ``spotforecast2_safe.preprocessing.exog_providers``. - self.include_covid_infection_rate = include_covid_infection_rate - self.include_entsoe_forecast_load = include_entsoe_forecast_load - self.include_entsoe_renewable_forecast = include_entsoe_renewable_forecast - self.include_entsoe_net_load = include_entsoe_net_load - self.include_entsoe_day_ahead_price = include_entsoe_day_ahead_price - # Data source and index - self.index_name = index_name - # Per-column outlier bounds - self.bounds = bounds - # Verbosity and caching - self.verbose = verbose - self.cache_home = cache_home - # Hyperparameter tuning trial budgets - self.n_trials_optuna = n_trials_optuna - self.n_trials_spotoptim = n_trials_spotoptim - self.n_initial_spotoptim = n_initial_spotoptim - self.n_jobs_spotoptim = n_jobs_spotoptim - # When True, ``SpotOptimStrategy`` injects ``lags_consider`` as a - # candidate lag set and seeds the optimizer's first evaluation with - # it (via SpotOptim's ``x0``). Pure data here; the behaviour lives - # in the sibling ``spotforecast2`` package. - self.warm_start_lags = warm_start_lags - # Active task - self.task = task - # Aggregation weights - self.agg_weights = agg_weights - # Optional callable ``factory(config, *, weight_func, target) -> forecaster``. - self.forecaster_factory = forecaster_factory - # Optional callable ``data_loader(config) -> pd.DataFrame`` invoked - # by ``BaseTask.prepare_data`` when no DataFrame is supplied. - self.data_loader = data_loader - # Optional callable ``test_data_loader(config) -> pd.DataFrame`` invoked - # by ``BaseTask.prepare_data`` when no test DataFrame is supplied. - # Returned frame populates ``test_actual`` and ``metrics_future`` in - # the prediction package; mirrors ``data_loader`` for the test slice. - self.test_data_loader = test_data_loader - # Whether ``BaseTask._run_strategy`` should persist fitted models to - # the cache directory after every training run. Defaults to ``True`` - # so that saved models are immediately available for ``PredictTask``. - self.auto_save_models = auto_save_models - # Identifier for the active dataset, used by ``BaseTask`` for - # cache-subdirectory naming, model-file naming, and per-dataset - # log-file routing. - self.data_frame_name = data_frame_name - # Number of TimeSeriesSplit folds used by ``BaseTask.cv_ts`` when - # building cross-validation splitters for tuning tasks. - self.number_folds = number_folds - # Policy for Open-Meteo fetch failures consumed by - # ``BaseTask.build_exogenous_features``: ``"raise"`` aborts the - # pipeline (default, preserves fail-safe semantics); ``"skip"`` - # logs a warning and continues without weather features. - self.on_weather_failure = on_weather_failure - # Policy for exog-provider failures consumed by - # ``ExogBuilder``: ``"raise"`` aborts (default, fail-safe); ``"skip"`` - # logs a warning and omits the failing provider's columns. - self.on_exog_provider_failure = on_exog_provider_failure - # Maximum contiguous gap in hours that providers will heal (0 = strict). - self.exog_max_gap_hours = exog_max_gap_hours - # Extended trailing-edge healing budget (0 = same as exog_max_gap_hours). - self.exog_max_tail_gap_hours = exog_max_tail_gap_hours - # Validation window for providers: "full" (default) or "train". - self.exog_provider_window = exog_provider_window - # Maximum age of a previously trained model before retraining is - # required. Consumed by - # ``spotforecast2_safe.manager.trainer.should_retrain``. - self.retrain_max_age = ( - retrain_max_age if retrain_max_age is not None else pd.Timedelta(days=7) - ) - # Target-side corruption detector and policy knobs. - self.target_qc_range_mw = target_qc_range_mw - self.target_qc_step_mw = target_qc_step_mw - self.target_qc_window_days = target_qc_window_days - self.target_corruption_policy = target_corruption_policy - self.target_max_heal_hours = target_max_heal_hours - self.target_anchor_zone_hours = target_anchor_zone_hours - self.target_qc_deviation_mw = target_qc_deviation_mw - self.target_qc_deviation_ref = target_qc_deviation_ref - self.target_qc_deviation_slots = target_qc_deviation_slots + def __post_init__(self) -> None: + """Reject clearly-invalid hyperparameter values (fail-safe).""" validate_config(self) def get_params(self, deep: bool = True) -> Dict[str, object]: @@ -550,7 +368,7 @@ def get_params(self, deep: bool = True) -> Dict[str, object]: assert p["cv_block_size"] is None ``` """ - return build_get_params(self, self._PARAM_NAMES, deep) + return build_get_params(self, [f.name for f in fields(self)], deep) def set_params( self, params: Dict[str, object] = None, **kwargs: object @@ -590,3 +408,10 @@ def set_params( ``` """ return apply_set_params(self, params, **kwargs) + + +# ``_PARAM_NAMES`` is derived from the dataclass fields (declaration order) so it +# can never drift from the actual fields; consumers and tests still read it as a +# class attribute. Set after the class body because ``fields()`` needs the +# finished dataclass. +ConfigEntsoe._PARAM_NAMES = tuple(f.name for f in fields(ConfigEntsoe)) diff --git a/src/spotforecast2_safe/configurator/config_multi.py b/src/spotforecast2_safe/configurator/config_multi.py index 9657e354..e360e2c0 100644 --- a/src/spotforecast2_safe/configurator/config_multi.py +++ b/src/spotforecast2_safe/configurator/config_multi.py @@ -3,6 +3,7 @@ """Configuration for multi-input task pipeline.""" +from dataclasses import dataclass, field, fields from typing import Any, Dict, List, Literal, Optional import pandas as pd @@ -16,6 +17,7 @@ from spotforecast2_safe.data import Period +@dataclass class ConfigMulti: """Configuration for the multi-input forecasting pipeline. @@ -274,309 +276,125 @@ class ConfigMulti: ``` """ - _PARAM_NAMES = ( - "country_code", - "periods", - "lags_consider", - "train_size", - "end_train_default", - "delta_val", - "predict_size", - "cv_block_size", - "refit_size", - "random_state", - "n_hyperparameters_trials", - "data_filename", - "targets", - "use_outlier_detection", - "contamination", - "imputation_method", - "window_size", - "imputation_window_size", - "use_exogenous_features", - "latitude", - "longitude", - "timezone", - "state", - "include_weather_windows", - "include_holiday_features", - "include_holiday_adjacency_features", - "poly_features_degree", - "max_poly_features", - "poly_mi_n_jobs", - "poly_mi_sample_size", - "include_covid_infection_rate", - "include_entsoe_forecast_load", - "include_entsoe_renewable_forecast", - "include_entsoe_net_load", - "include_entsoe_day_ahead_price", - "index_name", - "bounds", - "verbose", - "cache_home", - "n_trials_optuna", - "n_trials_spotoptim", - "n_initial_spotoptim", - "n_jobs_spotoptim", - "warm_start_lags", - "task", - "agg_weights", - "forecaster_factory", - "data_loader", - "test_data_loader", - "auto_save_models", - "data_frame_name", - "number_folds", - "on_weather_failure", - "on_exog_provider_failure", - "exog_max_gap_hours", - "exog_max_tail_gap_hours", - "exog_provider_window", - "target_qc_range_mw", - "target_qc_step_mw", - "target_qc_window_days", - "target_corruption_policy", - "target_max_heal_hours", - "target_anchor_zone_hours", - "target_qc_deviation_mw", - "target_qc_deviation_ref", - "target_qc_deviation_slots", + country_code: str = "DE" + periods: List[Period] = field(default_factory=default_periods) + lags_consider: List[int] = field(default_factory=lambda: list(range(1, 24))) + train_size: pd.Timedelta = field(default_factory=lambda: pd.Timedelta(days=3 * 365)) + end_train_default: str = "2025-12-31 00:00+00:00" + delta_val: pd.Timedelta = field( + default_factory=lambda: pd.Timedelta(hours=24 * 7 * 10) ) + predict_size: int = 24 + # Cross-validation test-block width (hours). ``None`` defers to + # ``predict_size``; the actual CV-split logic lives in the sibling + # ``spotforecast2`` package (``BaseTask.cv_ts``). + cv_block_size: Optional[int] = None + refit_size: int = 7 + random_state: int = 314159 + n_hyperparameters_trials: int = 20 + data_filename: str = "interim/energy_load.csv" + targets: Optional[List[str]] = None + # Outlier detection + use_outlier_detection: bool = True + contamination: float = 0.01 + # Imputation + imputation_method: str = "weighted" + window_size: int = 72 + imputation_window_size: Optional[int] = None + # Exogenous features + use_exogenous_features: bool = True + latitude: float = 51.5136 + longitude: float = 7.4653 + timezone: str = "UTC" + state: str = "NW" + # Feature selection toggles + include_weather_windows: bool = False + include_holiday_features: bool = False + include_holiday_adjacency_features: bool = False + poly_features_degree: int = 1 + max_poly_features: int = 10 + poly_mi_n_jobs: Optional[int] = -1 + poly_mi_sample_size: Optional[int] = 4000 + # Provider-based exogenous toggles, each gated by a registry flag in + # ``spotforecast2_safe.preprocessing.exog_providers``. + include_covid_infection_rate: bool = False + include_entsoe_forecast_load: bool = False + include_entsoe_renewable_forecast: bool = False + include_entsoe_net_load: bool = False + include_entsoe_day_ahead_price: bool = False + # Data source and index + index_name: str = "DateTime" + # Per-column outlier bounds [(lower, upper), ...] + bounds: Optional[List[tuple]] = None + # Verbosity and caching + verbose: bool = False + cache_home: Optional[Any] = None + # Hyperparameter tuning trial budgets + n_trials_optuna: int = 15 + n_trials_spotoptim: int = 10 + n_initial_spotoptim: int = 5 + # SpotOptim parallel-evaluation worker count (None=serial, -1=all cores); + # consumed by spotforecast2.multitask.strategies.SpotOptimStrategy + n_jobs_spotoptim: Optional[int] = None + # Seed the SpotOptim search with ``lags_consider`` (consumed by + # spotforecast2.multitask.strategies.SpotOptimStrategy) + warm_start_lags: bool = False + # Active task + task: str = "lazy" + # Aggregation weights (one per target, in target order) + agg_weights: Optional[List[float]] = None + # Forecaster factory hook (consumed by spotforecast2.multitask.base): + # ``factory(config, *, weight_func, target) -> forecaster``. When ``None``, + # ``BaseTask.create_forecaster`` falls back to + # ``default_lgbm_forecaster_factory``. + forecaster_factory: Optional[Any] = None + # Data-loader hook (consumed by ``BaseTask.prepare_data``): + # ``data_loader(config) -> pd.DataFrame``. Invoked iff no DataFrame is + # supplied via the constructor or ``prepare_data``. + data_loader: Optional[Any] = None + # Test-data-loader hook (consumed by ``BaseTask.prepare_data``): mirrors + # ``data_loader`` for the test/ground-truth slice. + test_data_loader: Optional[Any] = None + # Persistence policy and active-dataset name (consumed by + # spotforecast2.multitask.base). + auto_save_models: bool = True + data_frame_name: str = "default" + # Cross-validation fold count (consumed by spotforecast2.multitask.base.cv_ts) + number_folds: int = 10 + # Weather-fetch failure policy (consumed by + # spotforecast2.multitask.base.build_exogenous_features) + on_weather_failure: Literal["raise", "skip"] = "raise" + # Exog-provider failure policy (consumed by + # preprocessing.exog_builder.ExogBuilder) + on_exog_provider_failure: Literal["raise", "skip"] = "raise" + # Gap-healing budget for exog providers (0 = strict fail-safe) + exog_max_gap_hours: int = 0 + # Extended trailing-edge healing budget (0 = same as exog_max_gap_hours) + exog_max_tail_gap_hours: int = 0 + # Validation window for exog providers ("full" or "train") + exog_provider_window: Literal["full", "train"] = "full" + # Target-side corruption detector knobs. Detector active only when + # target_qc_window_days AND at least one of target_qc_range_mw / + # target_qc_step_mw / target_qc_deviation_mw are set. Defaults are all + # None / off, so the pipeline is byte-identical to the pre-feature baseline. + # Recommended episode policy: "truncate" (auto-extends predict_size). + # "heal" under the default anchor_zone_hours=168 with a <=7-day QC window + # never engages (refusal by design). The deviation rule (dropout-only, vs a + # published reference column such as "Forecasted Load") catches corruption + # that stays below the dynamics thresholds; when enabling it, scope + # ``targets`` to the actuals so heal/truncate leave the reference intact. + target_qc_range_mw: Optional[float] = None + target_qc_step_mw: Optional[float] = None + target_qc_window_days: Optional[int] = None + target_corruption_policy: str = "abort" + target_max_heal_hours: int = 0 + target_anchor_zone_hours: int = 168 + target_qc_deviation_mw: Optional[float] = None + target_qc_deviation_ref: Optional[str] = None + target_qc_deviation_slots: int = 2 - def __init__( - self, - country_code: str = "DE", - periods: Optional[List[Period]] = None, - lags_consider: Optional[List[int]] = None, - train_size: Optional[pd.Timedelta] = None, - end_train_default: str = "2025-12-31 00:00+00:00", - delta_val: Optional[pd.Timedelta] = None, - predict_size: int = 24, - cv_block_size: Optional[int] = None, - refit_size: int = 7, - random_state: int = 314159, - n_hyperparameters_trials: int = 20, - data_filename: str = "interim/energy_load.csv", - targets: Optional[List[str]] = None, - # Outlier detection - use_outlier_detection: bool = True, - contamination: float = 0.01, - # Imputation - imputation_method: str = "weighted", - window_size: int = 72, - imputation_window_size: Optional[int] = None, - # Exogenous features - use_exogenous_features: bool = True, - latitude: float = 51.5136, - longitude: float = 7.4653, - timezone: str = "UTC", - state: str = "NW", - # Feature selection toggles - include_weather_windows: bool = False, - include_holiday_features: bool = False, - include_holiday_adjacency_features: bool = False, - poly_features_degree: int = 1, - max_poly_features: int = 10, - poly_mi_n_jobs: Optional[int] = -1, - poly_mi_sample_size: Optional[int] = 4000, - # Provider-based exogenous toggles (preprocessing.exog_providers) - include_covid_infection_rate: bool = False, - include_entsoe_forecast_load: bool = False, - include_entsoe_renewable_forecast: bool = False, - include_entsoe_net_load: bool = False, - include_entsoe_day_ahead_price: bool = False, - # Data source and index - index_name: str = "DateTime", - # Per-column outlier bounds [(lower, upper), ...] - bounds: Optional[List[tuple]] = None, - # Verbosity and caching - verbose: bool = False, - cache_home: Optional[Any] = None, - # Hyperparameter tuning trial budgets - n_trials_optuna: int = 15, - n_trials_spotoptim: int = 10, - n_initial_spotoptim: int = 5, - # SpotOptim parallel-evaluation worker count (None=serial, -1=all cores); - # consumed by spotforecast2.multitask.strategies.SpotOptimStrategy - n_jobs_spotoptim: Optional[int] = None, - # Seed the SpotOptim search with ``lags_consider`` (consumed by - # spotforecast2.multitask.strategies.SpotOptimStrategy) - warm_start_lags: bool = False, - # Active task - task: str = "lazy", - # Aggregation weights (one per target, in target order) - agg_weights: Optional[List[float]] = None, - # Forecaster factory hook (consumed by spotforecast2.multitask.base) - forecaster_factory: Optional[Any] = None, - # Data-loader hook (consumed by spotforecast2.multitask.base.prepare_data) - data_loader: Optional[Any] = None, - # Test-data-loader hook (consumed by spotforecast2.multitask.base.prepare_data) - test_data_loader: Optional[Any] = None, - # Persistence policy and active-dataset name (consumed by spotforecast2.multitask.base) - auto_save_models: bool = True, - data_frame_name: str = "default", - # Cross-validation fold count (consumed by spotforecast2.multitask.base.cv_ts) - number_folds: int = 10, - # Weather-fetch failure policy (consumed by spotforecast2.multitask.base.build_exogenous_features) - on_weather_failure: Literal["raise", "skip"] = "raise", - # Exog-provider failure policy (consumed by preprocessing.exog_builder.ExogBuilder) - on_exog_provider_failure: Literal["raise", "skip"] = "raise", - # Gap-healing budget for exog providers (0 = strict fail-safe) - exog_max_gap_hours: int = 0, - # Extended trailing-edge healing budget (0 = same as exog_max_gap_hours) - exog_max_tail_gap_hours: int = 0, - # Validation window for exog providers ("full" or "train") - exog_provider_window: Literal["full", "train"] = "full", - # Target-side corruption detector knobs. - # Detector active only when target_qc_window_days AND at least one of - # target_qc_range_mw / target_qc_step_mw / target_qc_deviation_mw are - # set. Defaults are all None / off, so the pipeline is byte-identical - # to the pre-feature baseline. - # Recommended episode policy: "truncate" (auto-extends predict_size). - # "heal" under the default anchor_zone_hours=168 with a <=7-day QC window - # never engages (refusal by design — lowering the zone is a deliberate - # operator decision). - # The deviation rule (dropout-only, vs a published reference column - # such as "Forecasted Load") catches corruption that stays below the - # dynamics thresholds; when enabling it, scope `targets` to the - # actuals so heal/truncate leave the reference column intact. - target_qc_range_mw: Optional[float] = None, - target_qc_step_mw: Optional[float] = None, - target_qc_window_days: Optional[int] = None, - target_corruption_policy: str = "abort", - target_max_heal_hours: int = 0, - target_anchor_zone_hours: int = 168, - target_qc_deviation_mw: Optional[float] = None, - target_qc_deviation_ref: Optional[str] = None, - target_qc_deviation_slots: int = 2, - ): - """Initialize ConfigMulti with specified or default parameters.""" - self.country_code = country_code - - self.periods = periods if periods is not None else default_periods() - self.lags_consider = ( - lags_consider if lags_consider is not None else list(range(1, 24)) - ) - self.train_size = ( - train_size if train_size is not None else pd.Timedelta(days=3 * 365) - ) - self.end_train_default = end_train_default - self.delta_val = ( - delta_val if delta_val is not None else pd.Timedelta(hours=24 * 7 * 10) - ) - self.predict_size = predict_size - # Cross-validation test-block width (hours). ``None`` defers to - # ``predict_size``; the actual CV-split logic lives in the sibling - # ``spotforecast2`` package (``BaseTask.cv_ts``). - self.cv_block_size = cv_block_size - self.refit_size = refit_size - self.random_state = random_state - self.n_hyperparameters_trials = n_hyperparameters_trials - self.data_filename = data_filename - self.targets = targets - # Outlier detection - self.use_outlier_detection = use_outlier_detection - self.contamination = contamination - # Imputation - self.imputation_method = imputation_method - self.window_size = window_size - self.imputation_window_size = imputation_window_size - # Exogenous features - self.use_exogenous_features = use_exogenous_features - self.latitude = latitude - self.longitude = longitude - self.timezone = timezone - self.state = state - # Feature selection toggles - self.include_weather_windows = include_weather_windows - self.include_holiday_features = include_holiday_features - self.include_holiday_adjacency_features = include_holiday_adjacency_features - self.poly_features_degree = poly_features_degree - self.max_poly_features = max_poly_features - self.poly_mi_n_jobs = poly_mi_n_jobs - self.poly_mi_sample_size = poly_mi_sample_size - # Provider-based exogenous toggles, each gated by a registry flag in - # ``spotforecast2_safe.preprocessing.exog_providers``. - self.include_covid_infection_rate = include_covid_infection_rate - self.include_entsoe_forecast_load = include_entsoe_forecast_load - self.include_entsoe_renewable_forecast = include_entsoe_renewable_forecast - self.include_entsoe_net_load = include_entsoe_net_load - self.include_entsoe_day_ahead_price = include_entsoe_day_ahead_price - # Data source and index - self.index_name = index_name - # Per-column outlier bounds [(lower, upper), ...] - self.bounds = bounds - # Verbosity and caching - self.verbose = verbose - self.cache_home = cache_home - # Hyperparameter tuning trial budgets - self.n_trials_optuna = n_trials_optuna - self.n_trials_spotoptim = n_trials_spotoptim - self.n_initial_spotoptim = n_initial_spotoptim - self.n_jobs_spotoptim = n_jobs_spotoptim - # When True, ``SpotOptimStrategy`` injects ``lags_consider`` as a - # candidate lag set and seeds the optimizer's first evaluation with - # it (via SpotOptim's ``x0``). Pure data here; the behaviour lives - # in the sibling ``spotforecast2`` package. - self.warm_start_lags = warm_start_lags - # Active task - self.task = task - # Aggregation weights (one per target, in target order) - self.agg_weights = agg_weights - # Optional callable consumed by ``spotforecast2.multitask.base``: - # ``factory(config, *, weight_func, target) -> forecaster``. When - # ``None``, ``BaseTask.create_forecaster`` falls back to - # ``default_lgbm_forecaster_factory``. - self.forecaster_factory = forecaster_factory - # Optional callable consumed by ``BaseTask.prepare_data``: - # ``data_loader(config) -> pd.DataFrame``. When set, it is invoked iff - # no DataFrame is supplied via the constructor or ``prepare_data``. - # Enables data-acquisition extensions (e.g. the ENTSO-E API download) - # without subclassing ``BaseTask``. - self.data_loader = data_loader - # Optional callable consumed by ``BaseTask.prepare_data``: - # ``test_data_loader(config) -> pd.DataFrame``. Mirrors ``data_loader`` - # for the test/ground-truth slice — invoked iff no test DataFrame is - # supplied via the constructor or ``prepare_data``. When set, the - # returned frame is used to populate ``test_actual`` and - # ``metrics_future`` in the prediction package. - self.test_data_loader = test_data_loader - # Whether ``BaseTask._run_strategy`` should persist fitted models to - # the cache directory after every training run. Defaults to ``True`` - # so that saved models are immediately available for ``PredictTask``. - self.auto_save_models = auto_save_models - # Identifier for the active dataset, used by ``BaseTask`` for - # cache-subdirectory naming, model-file naming, and per-dataset - # log-file routing. - self.data_frame_name = data_frame_name - # Number of TimeSeriesSplit folds used by ``BaseTask.cv_ts`` when - # building cross-validation splitters for tuning tasks. - self.number_folds = number_folds - # Policy for Open-Meteo fetch failures consumed by - # ``BaseTask.build_exogenous_features``: ``"raise"`` aborts the - # pipeline (default, preserves fail-safe semantics); ``"skip"`` - # logs a warning and continues without weather features. - self.on_weather_failure = on_weather_failure - # Policy for exog-provider failures consumed by ``ExogBuilder``: - # ``"raise"`` aborts (default, fail-safe); ``"skip"`` logs a warning - # and omits the failing provider's columns. - self.on_exog_provider_failure = on_exog_provider_failure - # Maximum contiguous gap in hours that providers will heal (0 = strict). - self.exog_max_gap_hours = exog_max_gap_hours - # Extended trailing-edge healing budget (0 = same as exog_max_gap_hours). - self.exog_max_tail_gap_hours = exog_max_tail_gap_hours - # Validation window for providers: "full" (default) or "train". - self.exog_provider_window = exog_provider_window - # Target-side corruption detector and policy knobs. - self.target_qc_range_mw = target_qc_range_mw - self.target_qc_step_mw = target_qc_step_mw - self.target_qc_window_days = target_qc_window_days - self.target_corruption_policy = target_corruption_policy - self.target_max_heal_hours = target_max_heal_hours - self.target_anchor_zone_hours = target_anchor_zone_hours - self.target_qc_deviation_mw = target_qc_deviation_mw - self.target_qc_deviation_ref = target_qc_deviation_ref - self.target_qc_deviation_slots = target_qc_deviation_slots + def __post_init__(self) -> None: + """Reject clearly-invalid hyperparameter values (fail-safe).""" validate_config(self) def get_params(self, deep: bool = True) -> Dict[str, object]: @@ -603,7 +421,7 @@ def get_params(self, deep: bool = True) -> Dict[str, object]: print(f"agg_weights: {p['agg_weights']}") ``` """ - return build_get_params(self, self._PARAM_NAMES, deep) + return build_get_params(self, [f.name for f in fields(self)], deep) def set_params( self, params: Dict[str, object] = None, **kwargs: object @@ -637,3 +455,10 @@ def set_params( ``` """ return apply_set_params(self, params, **kwargs) + + +# ``_PARAM_NAMES`` is derived from the dataclass fields (declaration order) so it +# can never drift from the actual fields; consumers and tests still read it as a +# class attribute. Set after the class body because ``fields()`` needs the +# finished dataclass. +ConfigMulti._PARAM_NAMES = tuple(f.name for f in fields(ConfigMulti))